From 12d43443b2a328e2a867c6bda2a19edb46082af6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 5 Jun 2023 22:53:07 +0300 Subject: [PATCH] ggml : rename k_quants -> ggml-quants-k, use lowercase in code --- CMakeLists.txt | 4 +- Makefile | 18 ++--- ggml-cuda.cu | 120 ++++++++++++++++---------------- k_quants.c => ggml-quants-k.c | 126 +++++++++++++++++----------------- k_quants.h => ggml-quants-k.h | 80 ++++++++++----------- ggml.c | 96 +++++++++++++------------- 6 files changed, 222 insertions(+), 222 deletions(-) rename k_quants.c => ggml-quants-k.c (95%) rename k_quants.h => ggml-quants-k.h (52%) diff --git a/CMakeLists.txt b/CMakeLists.txt index de51c667c..1d375e226 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -370,8 +370,8 @@ endif() add_library(ggml OBJECT ggml.c ggml.h - k_quants.h - k_quants.c + ggml-quants-k.h + ggml-quants-k.c ${GGML_CUDA_SOURCES} ${GGML_OPENCL_SOURCES}) diff --git a/Makefile b/Makefile index 28548f7e9..98d1bdbf7 100644 --- a/Makefile +++ b/Makefile @@ -210,10 +210,10 @@ $(info ) # Build library # -ggml.o: ggml.c ggml.h ggml-cuda.h k_quants.h +ggml.o: ggml.c ggml.h ggml-cuda.h ggml-quants-k.h $(CC) $(CFLAGS) -c $< -o $@ -k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h +ggml-quants-k.o: ggml-quants-k.c ggml-quants-k.h ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h @@ -232,25 +232,25 @@ clean: # Examples # -main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o $(OBJS) +main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o k_quants.o $(OBJS) +quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o ggml-quants-k.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o k_quants.o $(OBJS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o ggml-quants-k.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o k_quants.o $(OBJS) +perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o k_quants.o $(OBJS) +embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o k_quants.o $(OBJS) +save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) @@ -272,7 +272,7 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ./$@ -vdot: pocs/vdot/vdot.cpp ggml.o k_quants.o $(OBJS) +vdot: pocs/vdot/vdot.cpp ggml.o ggml-quants-k.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) .PHONY: tests clean diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b1b1e6512..5385e0120 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -94,24 +94,24 @@ typedef struct { uint8_t qs[QK_K/4]; // quants half d; // super-block scale for quantized scales half dmin; // super-block scale for quantized mins -} block_q2_K; -static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); +} block_q2_k; +static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding"); typedef struct { uint8_t hmask[QK_K/8]; uint8_t qs[QK_K/4]; // nibbles / quants uint8_t scales[3*QK_K/64]; half d; -} block_q3_K; -static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding"); +} block_q3_k; +static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding"); typedef struct { half d; // super-block scale for quantized scales half dmin; // super-block scale for quantized mins uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_K; -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); +} block_q4_k; +static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding"); typedef struct { half d; // super-block scale for quantized scales @@ -119,16 +119,16 @@ typedef struct { uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_K; -static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +} block_q5_k; +static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding"); typedef struct { uint8_t ql[QK_K/2]; // quants, lower 4 bits uint8_t qh[QK_K/4]; // quants, upper 2 bits int8_t scales[QK_K/16]; // scales half d; // delta -} block_q6_K; -static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); +} block_q6_k; +static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding"); #define WARP_SIZE 32 @@ -233,7 +233,7 @@ static __device__ void dequantize_q8_0(const void * vx, const int ib, const int //================================== k-quants -static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q2_k(const void * vx, float * yy) { const int i = blockIdx.x; const int tid = threadIdx.x; @@ -241,7 +241,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { const int l = tid - 32*n; const int is = 8*n + l/16; - const block_q2_K * x = (const block_q2_K *) vx; + const block_q2_k * x = (const block_q2_k *) vx; const uint8_t q = x[i].qs[32*n + l]; float * y = yy + i*QK_K + 128*n; @@ -255,9 +255,9 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { } -static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q2_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q2_K * x = (const block_q2_K *) vx; + const block_q2_k * x = (const block_q2_k *) vx; // if n is 0, we want to do the lower 128, else the upper 128, // covering y[l+0], y[l+32], y[l+64], y[l+96] and @@ -286,7 +286,7 @@ static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs } -static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q3_k(const void * vx, float * yy) { int r = threadIdx.x/4; int i = blockIdx.x; @@ -296,7 +296,7 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { int n = tid / 4; int j = tid - 4*n; - const block_q3_K * x = (const block_q3_K *) vx; + const block_q3_k * x = (const block_q3_k *) vx; uint8_t m = 1 << (4*n + j); int is = 8*n + 2*j + is0; @@ -317,9 +317,9 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { } -static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q3_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q3_K * x = (const block_q3_K *) vx; + const block_q3_k * x = (const block_q3_k *) vx; const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; @@ -371,8 +371,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t } } -static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { - const block_q4_K * x = (const block_q4_K *) vx; +static __global__ void dequantize_block_q4_k(const void * vx, float * yy) { + const block_q4_k * x = (const block_q4_k *) vx; const int i = blockIdx.x; @@ -408,9 +408,9 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { } } -static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q4_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q4_K * x = (const block_q4_K *) vx; + const block_q4_k * x = (const block_q4_k *) vx; // iqs is in 0...248 in steps of 8 => const int j = iqs / 64; // j is in 0...3 @@ -440,8 +440,8 @@ static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs } -static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { - const block_q5_K * x = (const block_q5_K *) vx; +static __global__ void dequantize_block_q5_k(const void * vx, float * yy) { + const block_q5_k * x = (const block_q5_k *) vx; const int i = blockIdx.x; @@ -473,9 +473,9 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; } -static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q5_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q5_K * x = (const block_q5_K *) vx; + const block_q5_k * x = (const block_q5_k *) vx; // iqs is in 0...248 in steps of 8 => const int j = iqs / 64; // j is in 0...3 @@ -510,8 +510,8 @@ static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs } -static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { - const block_q6_K * x = (const block_q6_K *) vx; +static __global__ void dequantize_block_q6_k(const void * vx, float * yy) { + const block_q6_k * x = (const block_q6_k *) vx; const int i = blockIdx.x; @@ -535,9 +535,9 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); } -static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q6_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q6_K * x = (const block_q6_K *) vx; + const block_q6_k * x = (const block_q6_k *) vx; const int ip = iqs / 128; // 0 or 1 const int il = (iqs - 128*ip)/8; // 0...15 @@ -701,29 +701,29 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q2_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q2_K<<>>(vx, y); + dequantize_block_q2_k<<>>(vx, y); } -static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q3_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q3_K<<>>(vx, y); + dequantize_block_q3_k<<>>(vx, y); } -static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q4_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q4_K<<>>(vx, y); + dequantize_block_q4_k<<>>(vx, y); } -static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q5_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q5_K<<>>(vx, y); + dequantize_block_q5_k<<>>(vx, y); } -static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q6_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q6_K<<>>(vx, y); + dequantize_block_q6_k<<>>(vx, y); } static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { @@ -766,35 +766,35 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f <<>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q2_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const int ny = 2; const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q2_k><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q3_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q3_k><<>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q4_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q4_k><<>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q5_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q5_k><<>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q6_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q6_k><<>>(vx, y, dst, ncols); } static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -823,15 +823,15 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { case GGML_TYPE_Q8_0: return dequantize_row_q8_0_cuda; case GGML_TYPE_Q2_K: - return dequantize_row_q2_K_cuda; + return dequantize_row_q2_k_cuda; case GGML_TYPE_Q3_K: - return dequantize_row_q3_K_cuda; + return dequantize_row_q3_k_cuda; case GGML_TYPE_Q4_K: - return dequantize_row_q4_K_cuda; + return dequantize_row_q4_k_cuda; case GGML_TYPE_Q5_K: - return dequantize_row_q5_K_cuda; + return dequantize_row_q5_k_cuda; case GGML_TYPE_Q6_K: - return dequantize_row_q6_K_cuda; + return dequantize_row_q6_k_cuda; case GGML_TYPE_F16: return convert_fp16_to_fp32_cuda; default: @@ -852,15 +852,15 @@ static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_t case GGML_TYPE_Q8_0: return dequantize_mul_mat_vec_q8_0_cuda; case GGML_TYPE_Q2_K: - return dequantize_mul_mat_vec_q2_K_cuda; + return dequantize_mul_mat_vec_q2_k_cuda; case GGML_TYPE_Q3_K: - return dequantize_mul_mat_vec_q3_K_cuda; + return dequantize_mul_mat_vec_q3_k_cuda; case GGML_TYPE_Q4_K: - return dequantize_mul_mat_vec_q4_K_cuda; + return dequantize_mul_mat_vec_q4_k_cuda; case GGML_TYPE_Q5_K: - return dequantize_mul_mat_vec_q5_K_cuda; + return dequantize_mul_mat_vec_q5_k_cuda; case GGML_TYPE_Q6_K: - return dequantize_mul_mat_vec_q6_K_cuda; + return dequantize_mul_mat_vec_q6_k_cuda; case GGML_TYPE_F16: return convert_mul_mat_vec_f16_cuda; default: diff --git a/k_quants.c b/ggml-quants-k.c similarity index 95% rename from k_quants.c rename to ggml-quants-k.c index 4d524494d..dec00d371 100644 --- a/k_quants.c +++ b/ggml-quants-k.c @@ -1,4 +1,4 @@ -#include "k_quants.h" +#include "ggml-quants-k.h" #include "ggml.h" #include @@ -272,7 +272,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { +void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -341,7 +341,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict } } -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { +void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -374,26 +374,26 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int } } -void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { - quantize_row_q2_K_reference(x, vy, k); +void quantize_row_q2_k(const float * restrict x, void * restrict vy, int k) { + quantize_row_q2_k_reference(x, vy, k); } -size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q2_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { const int nb = k / QK_K; // TODO - collect histograms - although, at a second thought, I don't really care about them (void)hist; for (int j = 0; j < nb; j += k) { - block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; - quantize_row_q2_K_reference(src + j, y, k); + block_q2_k * restrict y = (block_q2_k *)dst + j/QK_K; + quantize_row_q2_k_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q2_K)); + return (n/QK_K*sizeof(block_q2_k)); } //========================= 3-bit (de)-quantization -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { +void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -469,7 +469,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } } -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); assert(QK_K == 256); const int nb = k / QK_K; @@ -520,26 +520,26 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } } -void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { - quantize_row_q3_K_reference(x, vy, k); +void quantize_row_q3_k(const float * restrict x, void * restrict vy, int k) { + quantize_row_q3_k_reference(x, vy, k); } -size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q3_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { const int nb = k / QK_K; // TODO - collect histograms - although, at a second thought, I don't really care about them (void)hist; for (int j = 0; j < nb; j += k) { - block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; - quantize_row_q3_K_reference(src + j, y, k); + block_q3_k * restrict y = (block_q3_k *)dst + j/QK_K; + quantize_row_q3_k_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q3_K)); + return (n/QK_K*sizeof(block_q3_k)); } // ====================== 4-bit (de)-quantization -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { +void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -604,7 +604,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { +void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -630,26 +630,26 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int } } -void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q4_k(const float * restrict x, void * restrict vy, int k) { assert(k % QK_K == 0); - block_q4_K * restrict y = vy; - quantize_row_q4_K_reference(x, y, k); + block_q4_k * restrict y = vy; + quantize_row_q4_k_reference(x, y, k); } -size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q4_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { assert(k % QK_K == 0); const int nb = k / QK_K; (void)hist; // TODO: collect histograms for (int j = 0; j < nb; j += k) { - block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; - quantize_row_q4_K_reference(src + j, y, k); + block_q4_k * restrict y = (block_q4_k *)dst + j/QK_K; + quantize_row_q4_k_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q4_K)); + return (n/QK_K*sizeof(block_q4_k)); } // ====================== 5-bit (de)-quantization -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { +void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -731,7 +731,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { +void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -759,26 +759,26 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } } -void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q5_k(const float * restrict x, void * restrict vy, int k) { assert(k % QK_K == 0); - block_q5_K * restrict y = vy; - quantize_row_q5_K_reference(x, y, k); + block_q5_k * restrict y = vy; + quantize_row_q5_k_reference(x, y, k); } -size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q5_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { assert(k % QK_K == 0); const int nb = k / QK_K; (void)hist; for (int j = 0; j < nb; j += k) { - block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; - quantize_row_q5_K_reference(src + j, y, k); + block_q5_k * restrict y = (block_q5_k *)dst + j/QK_K; + quantize_row_q5_k_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q5_K)); + return (n/QK_K*sizeof(block_q5_k)); } // ====================== 6-bit (de)-quantization -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { +void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -842,7 +842,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { +void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -875,28 +875,28 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int } } -void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q6_k(const float * restrict x, void * restrict vy, int k) { assert(k % QK_K == 0); - block_q6_K * restrict y = vy; - quantize_row_q6_K_reference(x, y, k); + block_q6_k * restrict y = vy; + quantize_row_q6_k_reference(x, y, k); } -size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK_K == 0); const int nb = k / QK_K; (void)hist; // TODO for (int j = 0; j < nb; j += k) { - block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; - quantize_row_q6_K_reference(src + j, y, k); + block_q6_k * restrict y = (block_q6_k *)dst + j/QK_K; + quantize_row_q6_k_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q6_K)); + return (n/QK_K*sizeof(block_q6_k)); } //===================================== Q8_K ============================================== -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { +void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -933,7 +933,7 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict } } -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { +void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -944,8 +944,8 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int } } -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { - quantize_row_q8_K_reference(x, y, k); +void quantize_row_q8_k(const float * restrict x, void * restrict y, int k) { + quantize_row_q8_k_reference(x, y, k); } //===================================== Dot ptoducts ================================= @@ -1002,10 +1002,10 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_k * restrict x = vx; + const block_q8_k * restrict y = vy; const int nb = n / QK_K; @@ -1201,14 +1201,14 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_k * restrict x = vx; + const block_q8_k * restrict y = vy; const int nb = n / QK_K; @@ -1501,11 +1501,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri } -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_k * restrict x = vx; + const block_q8_k * restrict y = vy; const int nb = n / QK_K; @@ -1727,11 +1727,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q5_k * restrict x = vx; + const block_q8_k * restrict y = vy; const int nb = n / QK_K; @@ -1974,11 +1974,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_k * restrict x = vx; + const block_q8_k * restrict y = vy; const int nb = n / QK_K; diff --git a/k_quants.h b/ggml-quants-k.h similarity index 52% rename from k_quants.h rename to ggml-quants-k.h index 10a0baac7..d6f06013b 100644 --- a/k_quants.h +++ b/ggml-quants-k.h @@ -22,8 +22,8 @@ typedef struct { uint8_t qs[QK_K/4]; // quants ggml_fp16_t d; // super-block scale for quantized scales ggml_fp16_t dmin; // super-block scale for quantized mins -} block_q2_K; -static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); +} block_q2_k; +static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding"); // 3-bit quantization // weight is represented as x = a * q @@ -34,8 +34,8 @@ typedef struct { uint8_t qs[QK_K/4]; // quants - low 2 bits uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits ggml_fp16_t d; // super-block scale -} block_q3_K; -static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding"); +} block_q3_k; +static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding"); // 4-bit quantization // 16 blocks of 32 elements each @@ -46,8 +46,8 @@ typedef struct { ggml_fp16_t dmin; // super-block scale for quantized mins uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_K; -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); +} block_q4_k; +static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding"); // 5-bit quantization // 16 blocks of 32 elements each @@ -59,8 +59,8 @@ typedef struct { uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_K; -static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +} block_q5_k; +static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding"); // 6-bit quantization // weight is represented as x = a * q @@ -71,52 +71,52 @@ typedef struct { uint8_t qh[QK_K/4]; // quants, upper 2 bits int8_t scales[QK_K/16]; // scales, quantized with 8 bits ggml_fp16_t d; // super-block scale -} block_q6_K; -static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding"); +} block_q6_k; +static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_k block size/padding"); // This is only used for intermediate quantization and dot products typedef struct { float d; // delta int8_t qs[QK_K]; // quants int16_t bsums[QK_K/16]; // sum of quants in groups of 16 -} block_q8_K; -static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding"); +} block_q8_k; +static_assert(sizeof(block_q8_k) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_k block size/padding"); // Quantization -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); +void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k); +void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k); +void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k); +void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k); +void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k); +void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k); -void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q2_k(const float * restrict x, void * restrict y, int k); +void quantize_row_q3_k(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_k(const float * restrict x, void * restrict y, int k); +void quantize_row_q5_k(const float * restrict x, void * restrict y, int k); +void quantize_row_q6_k(const float * restrict x, void * restrict y, int k); +void quantize_row_q8_k(const float * restrict x, void * restrict y, int k); // Dequantization -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k); -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); +void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k); +void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k); +void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k); +void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k); +void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k); +void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k); // Dot product -void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q2_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q3_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q4_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q5_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q6_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); // Quantization with histogram collection -size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q2_k(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q3_k(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q4_k(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q5_k(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist); diff --git a/ggml.c b/ggml.c index 8de84b0e5..2bb45b09c 100644 --- a/ggml.c +++ b/ggml.c @@ -2,7 +2,7 @@ #define _GNU_SOURCE #include "ggml.h" -#include "k_quants.h" +#include "ggml-quants-k.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -1567,43 +1567,43 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q2_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K, - .quantize_row_q = quantize_row_q2_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q2_K_q8_K, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_k, + .quantize_row_q = quantize_row_q2_k, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_k_reference, + .quantize_row_q_dot = quantize_row_q8_k, + .vec_dot_q = ggml_vec_dot_q2_k_q8_k, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q3_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K, - .quantize_row_q = quantize_row_q3_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q3_K_q8_K, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_k, + .quantize_row_q = quantize_row_q3_k, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_k_reference, + .quantize_row_q_dot = quantize_row_q8_k, + .vec_dot_q = ggml_vec_dot_q3_k_q8_k, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q4_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K, - .quantize_row_q = quantize_row_q4_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q4_K_q8_K, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_k, + .quantize_row_q = quantize_row_q4_k, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_k_reference, + .quantize_row_q_dot = quantize_row_q8_k, + .vec_dot_q = ggml_vec_dot_q4_k_q8_k, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q5_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K, - .quantize_row_q = quantize_row_q5_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q5_K_q8_K, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_k, + .quantize_row_q = quantize_row_q5_k, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_k_reference, + .quantize_row_q_dot = quantize_row_q8_k, + .vec_dot_q = ggml_vec_dot_q5_k_q8_k, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q6_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K, - .quantize_row_q = quantize_row_q6_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q6_K_q8_K, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_k, + .quantize_row_q = quantize_row_q6_k, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_k_reference, + .quantize_row_q_dot = quantize_row_q8_k, + .vec_dot_q = ggml_vec_dot_q6_k_q8_k, .vec_dot_type = GGML_TYPE_Q8_K, }, }; @@ -3506,12 +3506,12 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = sizeof(block_q5_1), [GGML_TYPE_Q8_0] = sizeof(block_q8_0), [GGML_TYPE_Q8_1] = sizeof(block_q8_1), - [GGML_TYPE_Q2_K] = sizeof(block_q2_K), - [GGML_TYPE_Q3_K] = sizeof(block_q3_K), - [GGML_TYPE_Q4_K] = sizeof(block_q4_K), - [GGML_TYPE_Q5_K] = sizeof(block_q5_K), - [GGML_TYPE_Q6_K] = sizeof(block_q6_K), - [GGML_TYPE_Q8_K] = sizeof(block_q8_K), + [GGML_TYPE_Q2_K] = sizeof(block_q2_k), + [GGML_TYPE_Q3_K] = sizeof(block_q3_k), + [GGML_TYPE_Q4_K] = sizeof(block_q4_k), + [GGML_TYPE_Q5_K] = sizeof(block_q5_k), + [GGML_TYPE_Q6_K] = sizeof(block_q6_k), + [GGML_TYPE_Q8_K] = sizeof(block_q8_k), [GGML_TYPE_I8] = sizeof(int8_t), [GGML_TYPE_I16] = sizeof(int16_t), [GGML_TYPE_I32] = sizeof(int32_t), @@ -3528,12 +3528,12 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = "q5_1", [GGML_TYPE_Q8_0] = "q8_0", [GGML_TYPE_Q8_1] = "q8_1", - [GGML_TYPE_Q2_K] = "q2_K", - [GGML_TYPE_Q3_K] = "q3_K", - [GGML_TYPE_Q4_K] = "q4_K", - [GGML_TYPE_Q5_K] = "q5_K", - [GGML_TYPE_Q6_K] = "q6_K", - [GGML_TYPE_Q8_K] = "q8_K", + [GGML_TYPE_Q2_K] = "q2_k", + [GGML_TYPE_Q3_K] = "q3_k", + [GGML_TYPE_Q4_K] = "q4_k", + [GGML_TYPE_Q5_K] = "q5_k", + [GGML_TYPE_Q6_K] = "q6_k", + [GGML_TYPE_Q8_K] = "q8_k", [GGML_TYPE_I8] = "i8", [GGML_TYPE_I16] = "i16", [GGML_TYPE_I32] = "i32", @@ -16185,32 +16185,32 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i case GGML_TYPE_Q2_K: { GGML_ASSERT(start % QK_K == 0); - block_q2_K * block = (block_q2_K*)dst + start / QK_K; - result = ggml_quantize_q2_K(src + start, block, n, n, hist); + block_q2_k * block = (block_q2_k*)dst + start / QK_K; + result = ggml_quantize_q2_k(src + start, block, n, n, hist); } break; case GGML_TYPE_Q3_K: { GGML_ASSERT(start % QK_K == 0); - block_q3_K * block = (block_q3_K*)dst + start / QK_K; - result = ggml_quantize_q3_K(src + start, block, n, n, hist); + block_q3_k * block = (block_q3_k*)dst + start / QK_K; + result = ggml_quantize_q3_k(src + start, block, n, n, hist); } break; case GGML_TYPE_Q4_K: { GGML_ASSERT(start % QK_K == 0); - block_q4_K * block = (block_q4_K*)dst + start / QK_K; - result = ggml_quantize_q4_K(src + start, block, n, n, hist); + block_q4_k * block = (block_q4_k*)dst + start / QK_K; + result = ggml_quantize_q4_k(src + start, block, n, n, hist); } break; case GGML_TYPE_Q5_K: { GGML_ASSERT(start % QK_K == 0); - block_q5_K * block = (block_q5_K*)dst + start / QK_K; - result = ggml_quantize_q5_K(src + start, block, n, n, hist); + block_q5_k * block = (block_q5_k*)dst + start / QK_K; + result = ggml_quantize_q5_k(src + start, block, n, n, hist); } break; case GGML_TYPE_Q6_K: { GGML_ASSERT(start % QK_K == 0); - block_q6_K * block = (block_q6_K*)dst + start / QK_K; - result = ggml_quantize_q6_K(src + start, block, n, n, hist); + block_q6_k * block = (block_q6_k*)dst + start / QK_K; + result = ggml_quantize_q6_k(src + start, block, n, n, hist); } break; default: assert(false);