From 9830871d0f5c13516c5d7bcfb6ea9ec1a34652c7 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Tue, 13 Jun 2023 16:15:13 +0800
Subject: [PATCH] pulled all Occam's fixes and the kquants are all working now

---
 ggml-opencl.cpp | 44 +++++++++++++++++++-------------------------
 llama.cpp       | 12 ------------
 2 files changed, 19 insertions(+), 37 deletions(-)

diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index 82156e7f4..3ab5b63b4 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -16,7 +16,7 @@
 
 #include "ggml.h"
 
-#define CL_DMMV_BLOCK_SIZE 32;
+#define CL_DMMV_BLOCK_SIZE 32
 
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
 static std::string program_source = MULTILINE_QUOTE(
@@ -362,18 +362,9 @@ void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int i
     __global const uint8_t * hm = x[ib].hmask + l;
     const int8_t * s = (const int8_t *)utmp + 8*n;
 
-    aux[0] |= x[ib].scales[0];
-    aux[0] |= x[ib].scales[1] << 8;
-    aux[0] |= x[ib].scales[2] << 16;
-    aux[0] |= x[ib].scales[3] << 24;
-    aux[1] |= x[ib].scales[4];
-    aux[1] |= x[ib].scales[5] << 8;
-    aux[1] |= x[ib].scales[6] << 16;
-    aux[1] |= x[ib].scales[7] << 24;
-    aux[2] |= x[ib].scales[8];
-    aux[2] |= x[ib].scales[9] << 8;
-    aux[2] |= x[ib].scales[10] << 16;
-    aux[2] |= x[ib].scales[11] << 24;
+    aux[0] = x[ib].scales[0] | x[ib].scales[1] << 8 | x[ib].scales[2] << 16 | x[ib].scales[3] << 24;
+    aux[1] = x[ib].scales[4] | x[ib].scales[5] << 8 | x[ib].scales[6] << 16 | x[ib].scales[7] << 24;
+    aux[2] = x[ib].scales[8] | x[ib].scales[9] << 8 | x[ib].scales[10] << 16 | x[ib].scales[11] << 24;
 
     utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
     utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
@@ -471,7 +462,7 @@ void vec_dot_q6_K(__global const struct block_q6_K* x, const int ib, const int i
     const float d = vload_half(0, &x[ib].d);
 
     __global const uint8_t * ql = x[ib].ql + 64*ip + il;
-    const uint8_t * qh = x[ib].qh + 32*ip + il;
+    __global const uint8_t * qh = x[ib].qh + 32*ip + il;
     __global const int8_t * sc = x[ib].scales + is;
 
     *result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
@@ -515,7 +506,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
     const int block_size = get_local_size(0);
-    const int row = get_global_id(0) / block_size;
+    const int row = get_group_id(0);
     const int tid = get_local_id(0);
 
     const uint qk = QUANT_K;
@@ -557,11 +548,11 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 
 std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
     const int block_size = get_local_size(0);
-    const int row = get_global_id(0) / block_size;
+    const int row = get_group_id(0);
     const int tid = get_local_id(0);
 
     const int iter_stride = 256;
-    const int vals_per_iter = iter_stride;
+    const int vals_per_iter = iter_stride / block_size;
     const int num_blocks_per_row = ncols / 256;
     const int ib0 = row*num_blocks_per_row;
@@ -575,7 +566,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 
         // dequantize
         float v;
-        dot_kernel(x, ib, iqs, y + iybs, &v);
+        DOT_KERNEL(x, ib, iqs, y + iybs, &v);
         tmp[tid] += v;
     }
 
@@ -656,6 +647,10 @@ std::array<std::string, 2> mul_str_values = {
     "mul_f32", "float"
 };
 
+std::array<std::string, 3> dmmv_k_str_keys = {
+    "KERNEL_NAME", "X_TYPE", "DOT_KERNEL"
+};
+
 std::array<std::string, 15> dmmv_k_str_values = {
     "dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K",
     "dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K",
@@ -693,13 +688,12 @@ std::string generate_kernels() {
         }
         src << mul_kernel << '\n';
     }
-    for (size_t i = 0; i < dmmv_k_str_values.size(); i += 3) {
-        std::string dmmv_kernel = dequant_mul_mat_vec_k_template;
-        //just apply quick template fn name replacement for the K quant DMMVs since sizes are known
-        replace(dmmv_kernel, "KERNEL_NAME", dmmv_k_str_values[i]);
-        replace(dmmv_kernel, "X_TYPE", dmmv_k_str_values[i + 1]);
-        replace(dmmv_kernel, "dot_kernel", dmmv_k_str_values[i + 2]);
-        src << dmmv_kernel << '\n';
+    for (size_t i = 0; i < dmmv_k_str_values.size(); i += dmmv_k_str_keys.size()) {
+        std::string dmmv_k_kernel = dequant_mul_mat_vec_k_template;
+        for (size_t j = 0; j < dmmv_k_str_keys.size(); j++) {
+            replace(dmmv_k_kernel, dmmv_k_str_keys[j], dmmv_k_str_values[i + j]);
+        }
+        src << dmmv_k_kernel << '\n';
     }
 
     return src.str();
diff --git a/llama.cpp b/llama.cpp
index eff3a6bb5..cbdd9ccfb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1074,18 +1074,6 @@ static void llama_model_load_internal(
         }
     }
 
-    #if defined(GGML_USE_CLBLAST)
-    if (file_version == LLAMA_FILE_VERSION_GGJT_V3) {
-        if (hparams.ftype >= LLAMA_FTYPE_MOSTLY_Q2_K && hparams.ftype <= LLAMA_FTYPE_MOSTLY_Q6_K) {
-            if(n_gpu_layers>0)
-            {
-                n_gpu_layers = 0;
-                printf("\n===\nCLBlast cannot offload layers for K-Quants!\nPlease select a q4_0, q4_0, q5_0 or q5_1 format instead!\n=====\n");
-            }
-        }
-    }
-    #endif
-
     if (vocab_only) {
         return;
     }
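
Note on the generate_kernels() change: the rewritten loop instantiates one OpenCL kernel source string per group of dmmv_k_str_keys.size() values, so the stride stays correct if a placeholder key is ever added. The standalone C++ sketch below (not part of the patch; the template string and the trimmed two-entry value list are hypothetical stand-ins, and replace() only mirrors the contract of the helper already present in ggml-opencl.cpp) shows that key/value substitution in isolation:

    // Build sketch: g++ -std=c++11 kernel_template_demo.cpp
    #include <array>
    #include <iostream>
    #include <sstream>
    #include <string>

    // Mirrors the contract of the replace() helper in ggml-opencl.cpp:
    // substitute every occurrence of `from` in `s` with `to`, in place.
    static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
        size_t pos = 0;
        while ((pos = s.find(from, pos)) != std::string::npos) {
            s.replace(pos, from.length(), to);
            pos += to.length();
        }
        return s;
    }

    int main() {
        // Hypothetical, trimmed stand-in for dequant_mul_mat_vec_k_template.
        std::string tmpl =
            "__kernel void KERNEL_NAME(__global X_TYPE* x) { DOT_KERNEL(x); }";

        // Same keys as dmmv_k_str_keys; values trimmed to two of the K-quants.
        std::array<std::string, 3> keys = { "KERNEL_NAME", "X_TYPE", "DOT_KERNEL" };
        std::array<std::string, 6> values = {
            "dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K",
            "dequantize_mul_mat_vec_q6_K", "struct block_q6_K", "vec_dot_q6_K",
        };

        std::ostringstream src;
        // One kernel per group of keys.size() values, as in generate_kernels().
        for (size_t i = 0; i < values.size(); i += keys.size()) {
            std::string kernel = tmpl;
            for (size_t j = 0; j < keys.size(); j++) {
                replace(kernel, keys[j], values[i + j]);
            }
            src << kernel << '\n';
        }
        std::cout << src.str();
        return 0;
    }

Running the sketch prints one concrete kernel per quant type, e.g. "__kernel void dequantize_mul_mat_vec_q2_K(__global struct block_q2_K* x) { vec_dot_q2_K(x); }", which is the same expansion the patched loop feeds into the OpenCL program source.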