cuBLAS: dequantize simultaneously while copying memory

2023-04-27 20:16:32 +02:00 · 2023-04-27 20:16:32 +02:00 · d3fd04e92e
commit d3fd04e92e
parent b1ee8f59b4
3 changed files with 49 additions and 38 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -227,6 +227,25 @@ void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t st
    dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
 }

+// Look up the CUDA row-dequantization function for a quantized tensor type.
+// Yields nullptr for any type without a case below, so callers must check
+// the result before invoking it.
+dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
+    dequantize_row_q_cuda_t fn = nullptr;
+
+    switch (type) {
+        case GGML_TYPE_Q4_0: fn = dequantize_row_q4_0_cuda; break;
+        case GGML_TYPE_Q4_1: fn = dequantize_row_q4_1_cuda; break;
+        case GGML_TYPE_Q4_2: fn = dequantize_row_q4_2_cuda; break;
+        case GGML_TYPE_Q5_0: fn = dequantize_row_q5_0_cuda; break;
+        case GGML_TYPE_Q5_1: fn = dequantize_row_q5_1_cuda; break;
+        case GGML_TYPE_Q8_0: fn = dequantize_row_q8_0_cuda; break;
+        default:             break; // unsupported type: fn stays nullptr
+    }
+
+    return fn;
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 16

@ -286,18 +305,22 @@ void ggml_cuda_pool_free(void * ptr, size_t size) {
    CUDA_CHECK(cudaFree(ptr));
 }

-cublasHandle_t g_cublasH = NULL;
-cudaStream_t g_cudaStream = NULL;
+cublasHandle_t g_cublasH = nullptr;
+cudaStream_t g_cudaStream = nullptr;
+cudaStream_t g_cudaStream2 = nullptr;
+cudaEvent_t g_cudaEvent = nullptr;

-void ggml_init_cublas(void) {
-    if (g_cublasH == NULL) {
+void ggml_init_cublas() {
+    if (g_cublasH == nullptr) {
        // create cublas handle, bind a stream
        CUBLAS_CHECK(cublasCreate(&g_cublasH));
-
        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
-
        CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));

+        // create additional stream and event for synchronization
+        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking));
+        CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming));
+
        // configure logging to stdout
        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
    }
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -26,7 +26,9 @@ extern "C" {
    } while (0)

 extern cublasHandle_t g_cublasH;
-extern cudaStream_t   g_cudaStream;
+extern cudaStream_t g_cudaStream;
+extern cudaStream_t g_cudaStream2;
+extern cudaEvent_t g_cudaEvent;

 void   ggml_init_cublas(void);
 void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
@ -41,6 +43,9 @@ void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t st

 cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream);

+typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); // pointer type for the dequantize_row_*_cuda functions
+dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type); // returns a null pointer when `type` has no CUDA dequantize function
+
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml.c
+++ b/ggml.c
@ -8033,7 +8033,7 @@ static void ggml_compute_forward_mul_mat_f32(
 #if defined(GGML_USE_CUBLAS)
        const float alpha = 1.0f;
        const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
        const int y_ne = ne11 * ne10;
        const int d_ne = ne11 * ne01;

@ -8239,7 +8239,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(

        const float alpha = 1.0f;
        const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
        const int y_ne = ne11 * ne10;
        const int d_ne = ne11 * ne01;

@ -8498,39 +8498,19 @@ static void ggml_compute_forward_mul_mat_q_f32(
 #if defined(GGML_USE_CUBLAS)
        const float alpha = 1.0f;
        const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
        const int y_ne = ne11 * ne10;
        const int d_ne = ne11 * ne01;

        size_t x_size, y_size, d_size, q_size;
-        float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-        float *d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
+        float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
+        float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
+        float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
+        void  * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);

-        void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream)  = NULL;
-        if (type == GGML_TYPE_Q4_0) {
-            dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_1) {
-            dequantize_row_q_cuda = dequantize_row_q4_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_2) {
-            dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_0) {
-            dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_1) {
-            dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q8_0) {
-            dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
-        }
-        else {
-            GGML_ASSERT(false);
-        }
-#elif !defined(GGML_USE_CLBLAST)
+        const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type);
+        GGML_ASSERT(dequantize_row_q_cuda != NULL);
+#else
        float * const wdata = params->wdata;
        dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
 #endif
@ -8545,7 +8525,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
                // copy and dequantize on device
                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream));

-                dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
+                dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2);
                CUDA_CHECK(cudaGetLastError());
 #elif defined(GGML_USE_CLBLAST)
                const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
@ -8565,6 +8545,9 @@ static void ggml_compute_forward_mul_mat_q_f32(
                // copy data to device
                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));

+                // wait for dequantization
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0));
+
                // compute
                CUBLAS_CHECK(
                    cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,