From fddbfe839aa80f52a09ff2675790a59f966b7df8 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Mon, 26 Feb 2024 14:31:18 +0200 Subject: [PATCH] iq4_xs: CUDA works - 133.2 t/s --- examples/quantize/quantize.cpp | 4 +- ggml-cuda.cu | 82 ++++++++++++++++++++++++---------- llama.cpp | 7 ++- 3 files changed, 63 insertions(+), 30 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 95f10b96a..7662ec80c 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -36,8 +36,8 @@ static const std::vector QUANT_OPTIONS = { { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", }, - { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.5 bpw non-linear quantization", }, - { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.125 bpw non-linear quantization", }, + { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, + { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", }, { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", }, diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d42ac9181..1afdbe574 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -571,7 +571,7 @@ typedef struct { } block_iq4_nl; static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); -#define QR4_XS 2 +#define QR4_XS 4 #define QI4_XS (QK_K / (4*QR4_XS)) typedef struct { half d; @@ -5332,41 +5332,75 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( return d * (sumi1 + sumi2); } -// TODO static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - const block_iq4_xs * bq = (const block_iq4_xs *) vbq; - +#if QK_K == 256 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; - const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; + const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + // iqs is 0...15 + const int ib32 = iqs/2; + const int il = iqs%2; + const int32_t * q8 = (const int *)bq8_1[ib32].qs + 2*il; + const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32 + 2*il; + const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4); + const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds); int v1, v2; int sumi1 = 0, sumi2 = 0; - for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { - const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); - get_int_from_table_16(aux, values, v1, v2); - sumi1 = __dp4a(v1, q8[l+0], sumi1); - sumi2 = __dp4a(v2, q8[l+4], sumi2); + for (int j = 0; j < 2; ++j) { + get_int_from_table_16(q4[j], values, v1, v2); + sumi1 = __dp4a(v1, q8[j+0], sumi1); + sumi2 = __dp4a(v2, q8[j+4], sumi2); } - -#else - const uint8_t * q4 = bq->qs + 4*iqs; - const int8_t * q8 = bq8_1->qs + 4*iqs; - - int sumi1 = 0, sumi2 = 0; - for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) { - sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf]; - sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4]; - } -#endif - const float d = (float)bq->d * __low2float(bq8_1->ds); return d * (sumi1 + sumi2); +#else + assert(false); + return 0.f; +#endif +#else + assert(false); + return 0.f; +#endif } +//// TODO +//static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( +// const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +// +// const block_iq4_xs * bq = (const block_iq4_xs *) vbq; +// +//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +// const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; +// const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; +// +// const uint8_t * values = (const uint8_t *)kvalues_iq4nl; +// +// int v1, v2; +// int sumi1 = 0, sumi2 = 0; +// for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { +// const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); +// get_int_from_table_16(aux, values, v1, v2); +// sumi1 = __dp4a(v1, q8[l+0], sumi1); +// sumi2 = __dp4a(v2, q8[l+4], sumi2); +// } +// +//#else +// const uint8_t * q4 = bq->qs + 4*iqs; +// const int8_t * q8 = bq8_1->qs + 4*iqs; +// +// int sumi1 = 0, sumi2 = 0; +// for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) { +// sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf]; +// sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4]; +// } +//#endif +// const float d = (float)bq->d * __low2float(bq8_1->ds); +// return d * (sumi1 + sumi2); +//} + template static __device__ __forceinline__ void mul_mat_q( @@ -9416,7 +9450,7 @@ static void ggml_cuda_op_mul_mat_vec_q( (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_IQ4_XS: - mul_mat_vec_q_cuda + mul_mat_vec_q_cuda (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_IQ3_S: diff --git a/llama.cpp b/llama.cpp index 00fd3ffb4..1d460f370 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2941,7 +2941,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.125 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; @@ -10903,9 +10903,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; } } - else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS)) { - if (!qs.has_imatrix) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K; + else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) { + new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {