From 0fc6170173b0df3500c2f104a3a3797cc14f7147 Mon Sep 17 00:00:00 2001 From: liang Date: Thu, 25 May 2023 23:03:14 +0800 Subject: [PATCH] In the function , add the cuda error bypass. --- ggml-cuda.cu | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 35d2e457c..979a46b63 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -197,6 +197,7 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k) // dequantize float & v0 = y[iybs + iqs + 0]; float & v1 = y[iybs + iqs + y_offset]; + dequantize_kernel(vx, ib, iqs, v0, v1); } @@ -414,6 +415,42 @@ static cudaStream_t g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr }; static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr }; void ggml_init_cublas() { + int device_id = 0; + cudaSetDevice(device_id); + + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device_id); + + fprintf(stderr, "Device id: %d\n", + device_id); + fprintf(stderr, "Device name: %s\n", + prop.name); + fprintf(stderr, "Compute capability: %d.%d\n", + prop.major, prop.minor); + fprintf(stderr, "Amount of global memory: %g GB\n", + prop.totalGlobalMem / (1024.0 * 1024 * 1024)); + fprintf(stderr, "Amount of constant memory: %g KB\n", + prop.totalConstMem / 1024.0); + fprintf(stderr, "Maximum grid size: %d %d %d\n", + prop.maxGridSize[0], + prop.maxGridSize[1], prop.maxGridSize[2]); + fprintf(stderr, "Maximum block size: %d %d %d\n", + prop.maxThreadsDim[0], prop.maxThreadsDim[1], + prop.maxThreadsDim[2]); + fprintf(stderr, "Number of SMs: %d\n", + prop.multiProcessorCount); + fprintf(stderr, "Maximum amount of shared memory per block: %g KB\n", + prop.sharedMemPerBlock / 1024.0); + fprintf(stderr, "Maximum amount of shared memory per SM: %g KB\n", + prop.sharedMemPerMultiprocessor / 1024.0); + fprintf(stderr, "Maximum number of registers per block: %d K\n", + prop.regsPerBlock / 1024); + fprintf(stderr, "Maximum number of registers per SM: %d K\n", + prop.regsPerMultiprocessor / 1024); + fprintf(stderr, "Maximum number of threads per block: %d\n", + prop.maxThreadsPerBlock); + fprintf(stderr, "Maximum number of threads per SM: %d\n", + prop.maxThreadsPerMultiProcessor); if (g_cublasH == nullptr) { // create streams for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) { @@ -442,10 +479,17 @@ void * ggml_cuda_host_malloc(size_t size) { void * ptr = nullptr; cudaError_t err = cudaMallocHost((void **) &ptr, size); if (err != cudaSuccess) { + // The allocation error can be bypassed. A null ptr will assigned out of this function. + // This can fixed the OOM error in WSL. + cudaGetLastError(); fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", size/1024.0/1024.0, cudaGetErrorString(err)); return nullptr; } + else{ + fprintf(stderr, "INFO: succeed to allocate %.2f MB of pinned memory\n", + size/1024.0/1024.0); + } return ptr; }