In the function , add the cuda error bypass.

This commit is contained in:
liang 2023-05-25 23:03:14 +08:00
parent 7e4ea5beff
commit 0fc6170173

View file

@ -197,6 +197,7 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
// dequantize // dequantize
float & v0 = y[iybs + iqs + 0]; float & v0 = y[iybs + iqs + 0];
float & v1 = y[iybs + iqs + y_offset]; float & v1 = y[iybs + iqs + y_offset];
dequantize_kernel(vx, ib, iqs, v0, v1); dequantize_kernel(vx, ib, iqs, v0, v1);
} }
@ -414,6 +415,42 @@ static cudaStream_t g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr };
static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr }; static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr };
void ggml_init_cublas() { void ggml_init_cublas() {
int device_id = 0;
cudaSetDevice(device_id);
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, device_id);
fprintf(stderr, "Device id: %d\n",
device_id);
fprintf(stderr, "Device name: %s\n",
prop.name);
fprintf(stderr, "Compute capability: %d.%d\n",
prop.major, prop.minor);
fprintf(stderr, "Amount of global memory: %g GB\n",
prop.totalGlobalMem / (1024.0 * 1024 * 1024));
fprintf(stderr, "Amount of constant memory: %g KB\n",
prop.totalConstMem / 1024.0);
fprintf(stderr, "Maximum grid size: %d %d %d\n",
prop.maxGridSize[0],
prop.maxGridSize[1], prop.maxGridSize[2]);
fprintf(stderr, "Maximum block size: %d %d %d\n",
prop.maxThreadsDim[0], prop.maxThreadsDim[1],
prop.maxThreadsDim[2]);
fprintf(stderr, "Number of SMs: %d\n",
prop.multiProcessorCount);
fprintf(stderr, "Maximum amount of shared memory per block: %g KB\n",
prop.sharedMemPerBlock / 1024.0);
fprintf(stderr, "Maximum amount of shared memory per SM: %g KB\n",
prop.sharedMemPerMultiprocessor / 1024.0);
fprintf(stderr, "Maximum number of registers per block: %d K\n",
prop.regsPerBlock / 1024);
fprintf(stderr, "Maximum number of registers per SM: %d K\n",
prop.regsPerMultiprocessor / 1024);
fprintf(stderr, "Maximum number of threads per block: %d\n",
prop.maxThreadsPerBlock);
fprintf(stderr, "Maximum number of threads per SM: %d\n",
prop.maxThreadsPerMultiProcessor);
if (g_cublasH == nullptr) { if (g_cublasH == nullptr) {
// create streams // create streams
for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) { for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
@ -442,10 +479,17 @@ void * ggml_cuda_host_malloc(size_t size) {
void * ptr = nullptr; void * ptr = nullptr;
cudaError_t err = cudaMallocHost((void **) &ptr, size); cudaError_t err = cudaMallocHost((void **) &ptr, size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
// The allocation error can be bypassed. A null ptr will assigned out of this function.
// This can fixed the OOM error in WSL.
cudaGetLastError();
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
size/1024.0/1024.0, cudaGetErrorString(err)); size/1024.0/1024.0, cudaGetErrorString(err));
return nullptr; return nullptr;
} }
else{
fprintf(stderr, "INFO: succeed to allocate %.2f MB of pinned memory\n",
size/1024.0/1024.0);
}
return ptr; return ptr;
} }