k_quants: faster Q2_K on older GPUs
It looks like I didn't need to change anything compared to what we already had, so this is just adding clarifying comments. But I now measure 36.3 ms/tok on the GTX-1660, instead of the 47.2 ms/tok that I have written in the faster k-quants PR.
This commit is contained in:
parent
be6f8b9ee7
commit
d6daebcb0c
1 changed file with 6 additions and 6 deletions
12
ggml-cuda.cu
12
ggml-cuda.cu
|
@ -483,15 +483,15 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
||||||
|
|
||||||
const block_q2_K * x = (const block_q2_K *)vx + ib0;
|
const block_q2_K * x = (const block_q2_K *)vx + ib0;
|
||||||
|
|
||||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31
|
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
||||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0
|
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||||
|
|
||||||
const int step = 16/K_QUANTS_PER_ITERATION;
|
const int step = 16/K_QUANTS_PER_ITERATION;
|
||||||
|
|
||||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||||
const int in = tid - step*im; // 0...7
|
const int in = tid - step*im; // 0...15 or 0...7
|
||||||
|
|
||||||
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...14 in steps of 4
|
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
|
||||||
const int q_offset = 32*im + l0;
|
const int q_offset = 32*im + l0;
|
||||||
const int s_offset = 8*im;
|
const int s_offset = 8*im;
|
||||||
const int y_offset = 128*im + l0;
|
const int y_offset = 128*im + l0;
|
||||||
|
@ -1266,7 +1266,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const int ny = 2;
|
const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
|
||||||
const int block_num_y = (nrows + ny - 1) / ny;
|
const int block_num_y = (nrows + ny - 1) / ny;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(1, block_num_y, 1);
|
||||||
const dim3 block_dims(32, ny, 1);
|
const dim3 block_dims(32, ny, 1);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue