From 95cf9597aad1a899295578f2e0d706c21fcac473 Mon Sep 17 00:00:00 2001
From: Slaren <2141330+slaren@users.noreply.github.com>
Date: Wed, 19 Apr 2023 23:01:53 +0200
Subject: [PATCH] Fix possible synchronization issue

---
 ggml-cuda.cu | 20 ++++++++++----------
 ggml-cuda.h  |  6 +++---
 ggml.c       | 11 ++++++-----
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c02831d23..7cd116602 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1,6 +1,6 @@
 #include <stdint.h>
-#include "ggml-cuda.h"
 #include <cuda_fp16.h>
+#include "ggml-cuda.h"
 
 typedef uint16_t ggml_fp16_t;
 static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
@@ -31,7 +31,7 @@ static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2
 static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
     const block_q4_0 * x = (const block_q4_0 *) vx;
 
-    int i = blockIdx.x;
+    const int i = blockIdx.x;
 
     const float d = x[i].d;
 
@@ -54,7 +54,7 @@ static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
 static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    int i = blockIdx.x;
+    const int i = blockIdx.x;
 
     const float d = x[i].d;
     const float m = x[i].m;
@@ -78,7 +78,7 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
 static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
     const block_q4_2 * x = (const block_q4_2 *) vx;
 
-    int i = blockIdx.x;
+    const int i = blockIdx.x;
 
     const float d = x[i].d;
 
@@ -99,18 +99,18 @@
 }
 
 extern "C" {
-    __host__ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k) {
+    __host__ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
         const int nb = k / QK4_0;
-        dequantize_block_q4_0<<<nb, 1>>>(vx, y);
+        dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
     }
 
-    __host__ void dequantize_row_q4_1_cuda(const void * vx, float * y, int k) {
+    __host__ void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
         const int nb = k / QK4_1;
-        dequantize_block_q4_1<<<nb, 1>>>(vx, y);
+        dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
     }
 
-    __host__ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k) {
+    __host__ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
         const int nb = k / QK4_2;
-        dequantize_block_q4_2<<<nb, 1>>>(vx, y);
+        dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
     }
 }
diff --git a/ggml-cuda.h b/ggml-cuda.h
index ae37ee9a1..646caafc6 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -2,9 +2,9 @@
 extern "C" {
 #endif
 
-void dequantize_row_q4_0_cuda(const void * vx, float * y, int k);
-void dequantize_row_q4_1_cuda(const void * vx, float * y, int k);
-void dequantize_row_q4_2_cuda(const void * vx, float * y, int k);
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 
 #ifdef __cplusplus
 }
diff --git a/ggml.c b/ggml.c
index 53a58f3c2..ece605e70 100644
--- a/ggml.c
+++ b/ggml.c
@@ -178,6 +178,7 @@ static void init_cublas(void) {
         CUBLAS_CHECK(cublasCreate(&cublasH));
 
         CUDA_CHECK(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking));
+        CUBLAS_CHECK(cublasSetStream(cublasH, cudaStream));
 
         // configure logging to stdout
 
@@ -7758,7 +7759,6 @@ static void ggml_compute_forward_mul_mat_f32(
 
                 // copy data to host
                 CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
-                CUDA_CHECK(cudaStreamSynchronize(cudaStream));
 #else
                 // zT = y * xT
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -7770,6 +7770,7 @@ static void ggml_compute_forward_mul_mat_f32(
         }
     }
 #if defined(GGML_USE_CUBLAS)
+    CUDA_CHECK(cudaStreamSynchronize(cudaStream));
     CUDA_CHECK(cudaFree(d_X));
     CUDA_CHECK(cudaFree(d_Y));
     CUDA_CHECK(cudaFree(d_D));
@@ -7982,7 +7983,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
                 // copy data to host
                 CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
-                CUDA_CHECK(cudaStreamSynchronize(cudaStream));
 #else
                 const float * x = wdata;
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
@@ -8000,6 +8000,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     }
 
 #if defined(GGML_USE_CUBLAS)
+    CUDA_CHECK(cudaStreamSynchronize(cudaStream));
     CUDA_CHECK(cudaFree(d_X));
     CUDA_CHECK(cudaFree(d_Y));
     CUDA_CHECK(cudaFree(d_D));
@@ -8185,7 +8186,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     CUDA_CHECK(cudaMalloc((void **)(&d_D), sizeof(float) * d_ne));
     CUDA_CHECK(cudaMalloc((void **)(&d_Q), GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type]));
 
-    dequantize_row_q_t dequantize_row_q_cuda = NULL;
+    void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL;
     if (type == GGML_TYPE_Q4_0) {
         dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
     }
@@ -8215,7 +8216,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
                 CUDA_CHECK(
                     cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02,
                                     GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, cudaStream));
-                dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00);
+                dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, cudaStream);
                 CUDA_CHECK(cudaGetLastError());
 #else
                 {
@@ -8243,7 +8244,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
                 // copy data to host
                 CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
-                CUDA_CHECK(cudaStreamSynchronize(cudaStream));
 #else
                 // zT = y * xT
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -8256,6 +8256,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     }
 
 #if defined(GGML_USE_CUBLAS)
+    CUDA_CHECK(cudaStreamSynchronize(cudaStream));
    CUDA_CHECK(cudaFree(d_X));
    CUDA_CHECK(cudaFree(d_Y));
    CUDA_CHECK(cudaFree(d_D));