cuBLAS: dequantize simultaneously while copying memory
This commit is contained in:
parent
b1ee8f59b4
commit
d3fd04e92e
3 changed files with 49 additions and 38 deletions
35
ggml-cuda.cu
35
ggml-cuda.cu
|
@ -227,6 +227,25 @@ void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t st
|
|||
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_row_q4_0_cuda;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_row_q4_1_cuda;
|
||||
case GGML_TYPE_Q4_2:
|
||||
return dequantize_row_q4_2_cuda;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_row_q5_0_cuda;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_row_q5_1_cuda;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_row_q8_0_cuda;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// buffer pool for cuda
|
||||
#define MAX_CUDA_BUFFERS 16
|
||||
|
||||
|
@ -286,18 +305,22 @@ void ggml_cuda_pool_free(void * ptr, size_t size) {
|
|||
CUDA_CHECK(cudaFree(ptr));
|
||||
}
|
||||
|
||||
cublasHandle_t g_cublasH = NULL;
|
||||
cudaStream_t g_cudaStream = NULL;
|
||||
cublasHandle_t g_cublasH = nullptr;
|
||||
cudaStream_t g_cudaStream = nullptr;
|
||||
cudaStream_t g_cudaStream2 = nullptr;
|
||||
cudaEvent_t g_cudaEvent = nullptr;
|
||||
|
||||
void ggml_init_cublas(void) {
|
||||
if (g_cublasH == NULL) {
|
||||
void ggml_init_cublas() {
|
||||
if (g_cublasH == nullptr) {
|
||||
// create cublas handle, bind a stream
|
||||
CUBLAS_CHECK(cublasCreate(&g_cublasH));
|
||||
|
||||
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
|
||||
|
||||
CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
|
||||
|
||||
// create additional stream and event for synchronization
|
||||
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking));
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming));
|
||||
|
||||
// configure logging to stdout
|
||||
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
|
||||
}
|
||||
|
|
|
@ -26,7 +26,9 @@ extern "C" {
|
|||
} while (0)
|
||||
|
||||
extern cublasHandle_t g_cublasH;
|
||||
extern cudaStream_t g_cudaStream;
|
||||
extern cudaStream_t g_cudaStream;
|
||||
extern cudaStream_t g_cudaStream2;
|
||||
extern cudaEvent_t g_cudaEvent;
|
||||
|
||||
void ggml_init_cublas(void);
|
||||
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
|
||||
|
@ -41,6 +43,9 @@ void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t st
|
|||
|
||||
cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream);
|
||||
|
||||
typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
|
||||
dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
45
ggml.c
45
ggml.c
|
@ -8033,7 +8033,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
#if defined(GGML_USE_CUBLAS)
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
const int x_ne = ne01 * ne10;
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
|
||||
|
@ -8239,7 +8239,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
const int x_ne = ne01 * ne10;
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
|
||||
|
@ -8498,39 +8498,19 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
#if defined(GGML_USE_CUBLAS)
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
const int x_ne = ne01 * ne10;
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
|
||||
size_t x_size, y_size, d_size, q_size;
|
||||
float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
|
||||
float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
float *d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
|
||||
float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
|
||||
float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
void * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
|
||||
|
||||
void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL;
|
||||
if (type == GGML_TYPE_Q4_0) {
|
||||
dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
|
||||
}
|
||||
else if (type == GGML_TYPE_Q4_1) {
|
||||
dequantize_row_q_cuda = dequantize_row_q4_1_cuda;
|
||||
}
|
||||
else if (type == GGML_TYPE_Q4_2) {
|
||||
dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
|
||||
}
|
||||
else if (type == GGML_TYPE_Q5_0) {
|
||||
dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
|
||||
}
|
||||
else if (type == GGML_TYPE_Q5_1) {
|
||||
dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
|
||||
}
|
||||
else if (type == GGML_TYPE_Q8_0) {
|
||||
dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
|
||||
}
|
||||
else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
#elif !defined(GGML_USE_CLBLAST)
|
||||
const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type);
|
||||
GGML_ASSERT(dequantize_row_q_cuda != NULL);
|
||||
#else
|
||||
float * const wdata = params->wdata;
|
||||
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
|
||||
#endif
|
||||
|
@ -8545,7 +8525,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
// copy and dequantize on device
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream));
|
||||
|
||||
dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
|
||||
dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
|
||||
|
@ -8565,6 +8545,9 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
// copy data to device
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
|
||||
|
||||
// wait for dequantization
|
||||
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0));
|
||||
|
||||
// compute
|
||||
CUBLAS_CHECK(
|
||||
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue