From a3505fac648cf57a282652acf26db1d77db9e19e Mon Sep 17 00:00:00 2001 From: JohannesGaessler Date: Fri, 28 Jul 2023 14:21:25 +0200 Subject: [PATCH] faster q8_1 loading --- ggml-cuda.cu | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0730cf530..7ac0c28cd 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2462,8 +2462,10 @@ static __global__ void mul_mat_q( allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1; + __shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)]; - __shared__ half2 tile_y_ds[(WARP_SIZE) * (qr*WARP_SIZE/QI8_1)]; + __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col]; float sum[2][4] = {0.0f}; @@ -2474,8 +2476,6 @@ static __global__ void mul_mat_q( i + tid_y, tid_x, blocks_per_row_x); } - const int iqsy = sizeof(int) * (tid_x % QI8_1); - for (int ir = 0; ir < qr; ++ir) { const int kqs = ir*WARP_SIZE + tid_x; const int kby = kqs / QI8_1; @@ -2485,11 +2485,17 @@ static __global__ void mul_mat_q( const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby]; - tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = *((int *) &by0->qs[iqsy]); - tile_y_ds[(tid_y + i) * (qr*WARP_SIZE/QI8_1) + kby] = by0->ds; + tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1); } } + for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) { + const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE; + const int kby = tid_x % blocks_per_tile_y_col; + const int col_y_eff = min(col_y_0 + ids, ncols_y-1); + tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds; + } + __syncthreads(); for (int k = 0; k < WARP_SIZE/vdr; ++k) {