Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
This commit is contained in:
parent c93cce3a45
commit a3c0673089
1 changed file with 9 additions and 13 deletions
ggml-cuda.cu
@@ -255,8 +255,8 @@ static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs
     uint8_t m = 1 << (4*n + j);

     const float d = x[ib].d;
-    const uint8_t * q = x[ib].qs + 32*n;
-    const uint8_t * hm = x[ib].hmask;
+    const uint8_t * q = x[ib].qs + 32*n + l;
+    const uint8_t * hm = x[ib].hmask + l;

     int8_t us = is < 4 ? (x[ib].scales[is-0] & 0xF) | (((x[ib].scales[is+8] >> 0) & 3) << 4) :
                 is < 8 ? (x[ib].scales[is-0] & 0xF) | (((x[ib].scales[is+4] >> 2) & 3) << 4) :
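This hunk folds the loop-invariant offset `l` into the base pointers for `qs` and `hmask`, so the hot loop in the next hunk can index `q[k]` and `hm[k]` directly instead of recomputing `l + k` on every load. A minimal standalone sketch of the equivalence (the values of `n` and `l` are made up for illustration):

#include <cstdint>
#include <cstdio>

int main() {
    uint8_t qs[64];
    for (int i = 0; i < 64; ++i) qs[i] = (uint8_t) i;

    const int n = 1, l = 5;                // hypothetical per-thread values
    const uint8_t * q_old = qs + 32*n;     // before: loads use q_old[l + k]
    const uint8_t * q_new = qs + 32*n + l; // after:  loads use q_new[k]

    for (int k = 0; k < 8; ++k) {
        // same bytes either way; the add of l happens once, not per iteration
        if (q_old[l + k] != q_new[k]) { printf("mismatch at k=%d\n", k); return 1; }
    }
    printf("identical loads\n");
    return 0;
}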
@@ -266,8 +266,8 @@ static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs

     float sum = 0;
     for (int k = 0; k < 8; ++k) {
-        int8_t ql = (q[l + k] >> shift) & 3;
-        sum += y[iqs + k] * (ql - ((hm[l + k] & m) ? 0 : 4));
+        int8_t ql = (q[k] >> shift) & 3;
+        sum += y[iqs + k] * (ql - ((hm[k] & m) ? 0 : 4));
     }
     result = sum * scale;
 }
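For context on what this loop computes: each Q3_K element is rebuilt from two low bits in `qs` and one high bit in `hmask`; a clear high bit shifts the value down by 4, giving a signed range of -4..3 before scaling. A sketch of that decode for a single element (the byte values below are made up):

#include <cstdint>
#include <cstdio>

int main() {
    const uint8_t q_byte  = 0xB6; // hypothetical packed low-bit byte from qs
    const uint8_t hm_byte = 0x02; // hypothetical high-bit byte from hmask
    const int     shift   = 2;    // selects one 2-bit field within q_byte
    const uint8_t m       = 0x02; // selects this element's high bit

    const int8_t ql = (q_byte >> shift) & 3;        // low two bits: 0..3
    const int8_t vq = ql - ((hm_byte & m) ? 0 : 4); // apply high bit: -4..3
    printf("quant = %d\n", vq);                     // here: 1 (high bit set)
    return 0;
}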
@@ -362,14 +362,10 @@ static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y
         const int iqs = col%QK_K; // x quant index
         const int iybs = col - col%QK_K; // y block start index

-        // processing >2 values per i iter is faster for fast GPUs
-#pragma unroll
-        for (int j = 0; j < vals_per_iter; j += 8) {
-            float v;
-            dot_kernel(vx, ib, iqs + j, y + iybs, v);
-            tmp += v;
-        }
+        float v;
+        dot_kernel(vx, ib, iqs, y + iybs, v);
+        tmp += v;
     }

     // sum up partial sums and write back result
     __syncthreads();
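If `vals_per_iter` is 8 here (assuming QK_K = 256 and WARP_SIZE = 32, so QK_K/WARP_SIZE = 8), the removed `j` loop only ever executed once, and each lane now makes a single `dot_kernel` call on its 8-value chunk per outer iteration. A host-side sketch of the per-lane column mapping, reusing the index arithmetic from the context lines above (the constants and `iter_stride` definition are assumptions):

#include <cstdio>

int main() {
    const int QK_K = 256, WARP_SIZE = 32;     // assumed k-quant constants
    const int iter_stride   = QK_K;           // assumed outer-loop stride
    const int vals_per_iter = QK_K/WARP_SIZE; // 8 values per lane per iteration

    const int ncols = 512;                    // hypothetical row width
    for (int tid = 0; tid < 3; ++tid) {       // first few lanes of the warp
        for (int i = 0; i < ncols; i += iter_stride) {
            const int col  = i + vals_per_iter*tid;
            const int iqs  = col%QK_K;        // x quant index
            const int iybs = col - col%QK_K;  // y block start index
            printf("tid=%d i=%3d -> cols [%3d..%3d)  iqs=%3d iybs=%3d\n",
                   tid, i, col, col + vals_per_iter, iqs, iybs);
        }
    }
    return 0;
}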
@@ -460,8 +456,8 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f

 static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    dequantize_mul_mat_vec_k<vec_dot_q3_K><<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
+    const dim3 block_dims(WARP_SIZE, 2, 1);
+    dequantize_mul_mat_vec_k<vec_dot_q3_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
 }

 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
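The launcher now packs two rows per thread block: `block_dims` gains a second warp along `y` and the grid shrinks to `nrows/2`. Assuming the kernel derives its row as `row = blockIdx.x*blockDim.y + threadIdx.y` (the usual pattern in these dmmv kernels; not shown in this diff), the mapping works out as:

#include <cstdio>

int main() {
    const int nrows   = 8;       // hypothetical; must be even for nrows/2
    const int block_y = 2;       // dim3 block_dims(WARP_SIZE, 2, 1)
    const int grid_x  = nrows/2; // one block now covers two rows

    for (int bx = 0; bx < grid_x; ++bx) {
        for (int ty = 0; ty < block_y; ++ty) {
            // assumed kernel-side mapping: row = blockIdx.x*blockDim.y + threadIdx.y
            printf("block %d, threadIdx.y=%d -> row %d\n", bx, ty, bx*block_y + ty);
        }
    }
    return 0;
}

Every row still gets a full 32-lane warp; the second warp in the block simply handles the neighboring row, halving the number of blocks the scheduler has to launch.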