k_quants: hopefully much faster Q4_K on older GPUs
On the GTX-1660 that I have available to represent "old GPUs", token prediction drops from 65.5 ms/tok to 41.5 ms/tok!
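In a nutshell: the old kernel used one 32-thread block per matrix row, with each thread handling n = 4 quant values per super-block it visited, striding over super-blocks by a fixed 2. The new version parameterizes the per-thread work via the compile-time constant K_QUANTS_PER_ITERATION (1 or 2) and packs ny = 2/K_QUANTS_PER_ITERATION rows into each block along y. A minimal host-side sketch of the resulting launch geometry, using the same arithmetic as the wrapper in the diff below (the helper itself is illustrative, not part of this commit):

    #include <cstdio>

    // Illustrative sketch (not part of the commit): compute the grid/block
    // dimensions the new dequantize_mul_mat_vec_q4_K_cuda wrapper would use.
    static void print_geometry(int k_quants_per_iteration, int nrows) {
        const int ny          = 2 / k_quants_per_iteration;   // rows per block
        const int block_num_y = (nrows + ny - 1) / ny;        // ceil(nrows / ny)
        std::printf("K_QUANTS_PER_ITERATION=%d: grid=(1,%d,1), block=(32,%d,1)\n",
                    k_quants_per_iteration, block_num_y, ny);
    }

    int main() {
        print_geometry(1, 4096);   // grid=(1,2048,1), block=(32,2,1)
        print_geometry(2, 4096);   // grid=(1,4096,1), block=(32,1,1)
        return 0;
    }

With K_QUANTS_PER_ITERATION = 2 the geometry is equivalent to the old one-block-per-row launch; with 1, two rows share a block and each thread dequantizes fewer values per super-block.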
parent 8596af4277
commit 1677059ba1

1 changed file with 16 additions and 10 deletions:

 ggml-cuda.cu | 26 ++++++++++++++++----------
@@ -618,22 +618,25 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols) {
+static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
 
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
 
-    const int row = blockIdx.x;
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    if (row > nrows) return;
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;
 
-    const int tid = threadIdx.x/2;  // 0...15
-    const int ix  = threadIdx.x%2;
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
 
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 4;
+    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
+
+    const int il  = tid/step;      // 0...3
+    const int ir  = tid - step*il; // 0...7 or 0...3
+    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
 
     const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
     const int in = il%2;
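To sanity-check the new indexing, the kernel's per-thread arithmetic can be replayed on the host for all 32 x-threads of a block. This is an illustrative sketch assuming K_QUANTS_PER_ITERATION is 1 or 2, as the comments above imply; it is not code from the commit:

    #include <cstdio>

    // Replays the per-thread index math from dequantize_mul_mat_vec_q4_k
    // for a given K_QUANTS_PER_ITERATION (kqpi); for verification only.
    static void dump_indices(int kqpi) {
        const int step = 8/kqpi;            // 8 or 4
        std::printf("K_QUANTS_PER_ITERATION = %d\n", kqpi);
        for (int x = 0; x < 32; ++x) {      // x plays the role of threadIdx.x
            const int tid = x/kqpi;         // 0...31 or 0...15
            const int ix  = x%kqpi;         // 0      or 0,1
            const int il  = tid/step;       // 0...3
            const int ir  = tid - step*il;  // 0...7 or 0...3
            const int n   = 2*kqpi;         // 2 or 4 quant values per thread
            std::printf("  x=%2d tid=%2d ix=%d il=%d ir=%d n=%d\n",
                        x, tid, ix, il, ir, n);
        }
    }

    int main() {
        dump_indices(1);
        dump_indices(2);
        return 0;
    }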
@@ -649,7 +652,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
 
     float tmp = 0; // partial sum for thread in warp
 
-    for (int i = ix; i < num_blocks_per_row; i += 2) {
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 
         const uint8_t * q1 = x[i].qs + q_offset;
         const uint8_t * q2 = q1 + 64;
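Note that the loop stride changes in lockstep with ix: with K_QUANTS_PER_ITERATION = 2 the two ix lanes (0 and 1) interleave over the row's super-blocks exactly as the old hard-coded stride-2 loop did, while with K_QUANTS_PER_ITERATION = 1 a single lane visits every super-block and each thread dequantizes only n = 2 values per block instead of 4.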
@@ -1275,8 +1278,11 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
 
 static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const dim3 block_dims(32, 1, 1);
-    dequantize_mul_mat_vec_q4_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
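Because block_num_y rounds the row count up, the last block along y can contain threads whose computed row lies past the end of the matrix; the kernel therefore gains a bounds check on row before doing any work. Nothing changes for callers: the wrapper keeps the same signature, and the choice of K_QUANTS_PER_ITERATION is resolved at compile time.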