ggml : rename k_quants -> ggml-quants-k, use lowercase in code

2023-06-05 22:53:07 +03:00 · 2023-06-05 22:53:07 +03:00 · 12d43443b2
commit 12d43443b2
parent 32a5f3a601
6 changed files with 222 additions and 222 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -370,8 +370,8 @@ endif()
 add_library(ggml OBJECT
            ggml.c
            ggml.h
-            k_quants.h
-            k_quants.c
+            ggml-quants-k.h
+            ggml-quants-k.c
            ${GGML_CUDA_SOURCES}
            ${GGML_OPENCL_SOURCES})

--- a/18
+++ b/18
@ -210,10 +210,10 @@ $(info )
 # Build library
 #

-ggml.o: ggml.c ggml.h ggml-cuda.h k_quants.h
+ggml.o: ggml.c ggml.h ggml-cuda.h ggml-quants-k.h
 	$(CC)  $(CFLAGS)   -c $< -o $@

-k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
+ggml-quants-k.o: ggml-quants-k.c ggml-quants-k.h ggml.h ggml-cuda.h
 	$(CC)  $(CFLAGS)   -c $< -o $@

 llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
@ -232,25 +232,25 @@ clean:
 # Examples
 #

-main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo

-quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o k_quants.o $(OBJS)
+quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o ggml-quants-k.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o k_quants.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o ggml-quants-k.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o k_quants.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o k_quants.o $(OBJS)
+embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o k_quants.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
@ -272,7 +272,7 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@

-vdot: pocs/vdot/vdot.cpp ggml.o k_quants.o $(OBJS)
+vdot: pocs/vdot/vdot.cpp ggml.o ggml-quants-k.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

 .PHONY: tests clean
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -94,24 +94,24 @@ typedef struct {
    uint8_t qs[QK_K/4];      // quants
    half d;                  // super-block scale for quantized scales
    half dmin;               // super-block scale for quantized mins
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+} block_q2_k;
+static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding");

 typedef struct {
    uint8_t hmask[QK_K/8];
    uint8_t qs[QK_K/4]; // nibbles / quants
    uint8_t scales[3*QK_K/64];
    half d;
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
+} block_q3_k;
+static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding");

 typedef struct {
    half d;                    // super-block scale for quantized scales
    half dmin;                 // super-block scale for quantized mins
    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+} block_q4_k;
+static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding");

 typedef struct {
    half    d;                   // super-block scale for quantized scales
@ -119,16 +119,16 @@ typedef struct {
    uint8_t scales[3*QK_K/64];   // scales, quantized with 6 bits
    uint8_t qh[QK_K/8];          // quants, high bit
    uint8_t qs[QK_K/2];          // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+} block_q5_k;
+static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding");

 typedef struct {
    uint8_t ql[QK_K/2];   // quants, lower 4 bits
    uint8_t qh[QK_K/4];   // quants, upper 2 bits
    int8_t  scales[QK_K/16]; // scales
    half    d;         // delta
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
+} block_q6_k;
+static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding");

 #define WARP_SIZE 32

@ -233,7 +233,7 @@ static __device__ void dequantize_q8_0(const void * vx, const int ib, const int

 //================================== k-quants

-static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q2_k(const void * vx, float * yy) {

    const int i   = blockIdx.x;
    const int tid = threadIdx.x;
@ -241,7 +241,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
    const int l   = tid - 32*n;
    const int is  = 8*n + l/16;

-    const block_q2_K * x = (const block_q2_K *) vx;
+    const block_q2_k * x = (const block_q2_k *) vx;

    const uint8_t q = x[i].qs[32*n + l];
    float * y = yy + i*QK_K + 128*n;
@ -255,9 +255,9 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {

 }

-static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
+static __device__ void vec_dot_q2_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {

-    const block_q2_K * x = (const block_q2_K *) vx;
+    const block_q2_k * x = (const block_q2_k *) vx;

    // if n is 0, we want to do the lower 128, else the upper 128,
    // covering y[l+0],  y[l+32], y[l+64], y[l+96] and
@ -286,7 +286,7 @@ static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs

 }

-static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q3_k(const void * vx, float * yy) {

    int r = threadIdx.x/4;
    int i = blockIdx.x;
@ -296,7 +296,7 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
    int n = tid / 4;
    int j = tid - 4*n;

-    const block_q3_K * x = (const block_q3_K *) vx;
+    const block_q3_k * x = (const block_q3_k *) vx;

    uint8_t m = 1 << (4*n + j);
    int is = 8*n + 2*j + is0;
@ -317,9 +317,9 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {

 }

-static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
+static __device__ void vec_dot_q3_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {

-    const block_q3_K * x = (const block_q3_K *) vx;
+    const block_q3_k * x = (const block_q3_k *) vx;

    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;
@ -371,8 +371,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
    }
 }

-static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
-    const block_q4_K * x = (const block_q4_K *) vx;
+static __global__ void dequantize_block_q4_k(const void * vx, float * yy) {
+    const block_q4_k * x = (const block_q4_k *) vx;

    const int i = blockIdx.x;

@ -408,9 +408,9 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
    }
 }

-static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
+static __device__ void vec_dot_q4_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {

-    const block_q4_K * x = (const block_q4_K *) vx;
+    const block_q4_k * x = (const block_q4_k *) vx;

                                    // iqs is in 0...248 in steps of 8 =>
    const int j  = iqs / 64;        // j  is in 0...3
@ -440,8 +440,8 @@ static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs

 }

-static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
-    const block_q5_K * x = (const block_q5_K *) vx;
+static __global__ void dequantize_block_q5_k(const void * vx, float * yy) {
+    const block_q5_k * x = (const block_q5_k *) vx;

    const int i = blockIdx.x;

@ -473,9 +473,9 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
 }

-static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
+static __device__ void vec_dot_q5_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {

-    const block_q5_K * x = (const block_q5_K *) vx;
+    const block_q5_k * x = (const block_q5_k *) vx;

                                    // iqs is in 0...248 in steps of 8 =>
    const int j  = iqs / 64;        // j  is in 0...3
@ -510,8 +510,8 @@ static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs

 }

-static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
-    const block_q6_K * x = (const block_q6_K *) vx;
+static __global__ void dequantize_block_q6_k(const void * vx, float * yy) {
+    const block_q6_k * x = (const block_q6_k *) vx;

    const int i = blockIdx.x;

@ -535,9 +535,9 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
 }

-static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
+static __device__ void vec_dot_q6_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {

-    const block_q6_K * x = (const block_q6_K *) vx;
+    const block_q6_k * x = (const block_q6_k *) vx;

    const int ip = iqs / 128;        // 0 or 1
    const int il = (iqs - 128*ip)/8; // 0...15
@ -701,29 +701,29 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu
    dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q2_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
-    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
+    dequantize_block_q2_k<<<nb, 64, 0, stream>>>(vx, y);
 }

-static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q3_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
-    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
+    dequantize_block_q3_k<<<nb, 64, 0, stream>>>(vx, y);
 }

-static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
-    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
+    dequantize_block_q4_k<<<nb, 32, 0, stream>>>(vx, y);
 }

-static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q5_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
-    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
+    dequantize_block_q5_k<<<nb, 64, 0, stream>>>(vx, y);
 }

-static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q6_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
-    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
+    dequantize_block_q6_k<<<nb, 64, 0, stream>>>(vx, y);
 }

 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@ -766,35 +766,35 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
        <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
 }

-static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q2_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2;
    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols);
+    dequantize_mul_mat_vec_k<32, vec_dot_q2_k><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols);
 }

-static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q3_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const dim3 block_dims(32, 2, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
+    dequantize_mul_mat_vec_k<32, vec_dot_q3_k><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
 }

-static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q4_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const dim3 block_dims(32, 2, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
+    dequantize_mul_mat_vec_k<32, vec_dot_q4_k><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
 }

-static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q5_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const dim3 block_dims(32, 2, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
+    dequantize_mul_mat_vec_k<32, vec_dot_q5_k><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
 }

-static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q6_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const dim3 block_dims(32, 2, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
+    dequantize_mul_mat_vec_k<32, vec_dot_q6_k><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
 }

 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@ -823,15 +823,15 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
        case GGML_TYPE_Q8_0:
            return dequantize_row_q8_0_cuda;
        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
+            return dequantize_row_q2_k_cuda;
        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
+            return dequantize_row_q3_k_cuda;
        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
+            return dequantize_row_q4_k_cuda;
        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
+            return dequantize_row_q5_k_cuda;
        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
+            return dequantize_row_q6_k_cuda;
        case GGML_TYPE_F16:
            return convert_fp16_to_fp32_cuda;
        default:
@ -852,15 +852,15 @@ static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_t
        case GGML_TYPE_Q8_0:
            return dequantize_mul_mat_vec_q8_0_cuda;
        case GGML_TYPE_Q2_K:
-            return dequantize_mul_mat_vec_q2_K_cuda;
+            return dequantize_mul_mat_vec_q2_k_cuda;
        case GGML_TYPE_Q3_K:
-            return dequantize_mul_mat_vec_q3_K_cuda;
+            return dequantize_mul_mat_vec_q3_k_cuda;
        case GGML_TYPE_Q4_K:
-            return dequantize_mul_mat_vec_q4_K_cuda;
+            return dequantize_mul_mat_vec_q4_k_cuda;
        case GGML_TYPE_Q5_K:
-            return dequantize_mul_mat_vec_q5_K_cuda;
+            return dequantize_mul_mat_vec_q5_k_cuda;
        case GGML_TYPE_Q6_K:
-            return dequantize_mul_mat_vec_q6_K_cuda;
+            return dequantize_mul_mat_vec_q6_k_cuda;
        case GGML_TYPE_F16:
            return convert_mul_mat_vec_f16_cuda;
        default:
--- a/ggml-quants-k.c
+++ b/ggml-quants-k.c
@ -1,4 +1,4 @@
-#include "k_quants.h"
+#include "ggml-quants-k.h"
 #include "ggml.h"

 #include <math.h>
@ -272,7 +272,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *

 //========================- 2-bit (de)-quantization

-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
+void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@ -341,7 +341,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
    }
 }

-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@ -374,26 +374,26 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int
    }
 }

-void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
-    quantize_row_q2_K_reference(x, vy, k);
+void quantize_row_q2_k(const float * restrict x, void * restrict vy, int k) {
+    quantize_row_q2_k_reference(x, vy, k);
 }

-size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
+size_t ggml_quantize_q2_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
    const int nb = k / QK_K;

    // TODO - collect histograms - although, at a second thought, I don't really care about them
    (void)hist;

    for (int j = 0; j < nb; j += k) {
-        block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
-        quantize_row_q2_K_reference(src + j, y, k);
+        block_q2_k * restrict y = (block_q2_k *)dst + j/QK_K;
+        quantize_row_q2_k_reference(src + j, y, k);
    }
-    return (n/QK_K*sizeof(block_q2_K));
+    return (n/QK_K*sizeof(block_q2_k));
 }

 //========================= 3-bit (de)-quantization

-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
+void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@ -469,7 +469,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
    }
 }

-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k) {
    assert(k % QK_K == 0);
    assert(QK_K == 256);
    const int nb = k / QK_K;
@ -520,26 +520,26 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
    }
 }

-void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
-    quantize_row_q3_K_reference(x, vy, k);
+void quantize_row_q3_k(const float * restrict x, void * restrict vy, int k) {
+    quantize_row_q3_k_reference(x, vy, k);
 }

-size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
+size_t ggml_quantize_q3_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
    const int nb = k / QK_K;

    // TODO - collect histograms - although, at a second thought, I don't really care about them
    (void)hist;

    for (int j = 0; j < nb; j += k) {
-        block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
-        quantize_row_q3_K_reference(src + j, y, k);
+        block_q3_k * restrict y = (block_q3_k *)dst + j/QK_K;
+        quantize_row_q3_k_reference(src + j, y, k);
    }
-    return (n/QK_K*sizeof(block_q3_K));
+    return (n/QK_K*sizeof(block_q3_k));
 }

 // ====================== 4-bit (de)-quantization

-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
+void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@ -604,7 +604,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
    }
 }

-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@ -630,26 +630,26 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
    }
 }

-void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_q4_k(const float * restrict x, void * restrict vy, int k) {
    assert(k % QK_K == 0);
-    block_q4_K * restrict y = vy;
-    quantize_row_q4_K_reference(x, y, k);
+    block_q4_k * restrict y = vy;
+    quantize_row_q4_k_reference(x, y, k);
 }

-size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
+size_t ggml_quantize_q4_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;
    (void)hist; // TODO: collect histograms
    for (int j = 0; j < nb; j += k) {
-        block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
-        quantize_row_q4_K_reference(src + j, y, k);
+        block_q4_k * restrict y = (block_q4_k *)dst + j/QK_K;
+        quantize_row_q4_k_reference(src + j, y, k);
    }
-    return (n/QK_K*sizeof(block_q4_K));
+    return (n/QK_K*sizeof(block_q4_k));
 }

 // ====================== 5-bit (de)-quantization

-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
+void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@ -731,7 +731,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
    }
 }

-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@ -759,26 +759,26 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
    }
 }

-void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_q5_k(const float * restrict x, void * restrict vy, int k) {
    assert(k % QK_K == 0);
-    block_q5_K * restrict y = vy;
-    quantize_row_q5_K_reference(x, y, k);
+    block_q5_k * restrict y = vy;
+    quantize_row_q5_k_reference(x, y, k);
 }

-size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
+size_t ggml_quantize_q5_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;
    (void)hist;
    for (int j = 0; j < nb; j += k) {
-        block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
-        quantize_row_q5_K_reference(src + j, y, k);
+        block_q5_k * restrict y = (block_q5_k *)dst + j/QK_K;
+        quantize_row_q5_k_reference(src + j, y, k);
    }
-    return (n/QK_K*sizeof(block_q5_K));
+    return (n/QK_K*sizeof(block_q5_k));
 }

 // ====================== 6-bit (de)-quantization

-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
+void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@ -842,7 +842,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
    }
 }

-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@ -875,28 +875,28 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
    }
 }

-void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_q6_k(const float * restrict x, void * restrict vy, int k) {
    assert(k % QK_K == 0);
-    block_q6_K * restrict y = vy;
-    quantize_row_q6_K_reference(x, y, k);
+    block_q6_k * restrict y = vy;
+    quantize_row_q6_k_reference(x, y, k);
 }

-size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
+size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

    (void)hist; // TODO

    for (int j = 0; j < nb; j += k) {
-        block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
-        quantize_row_q6_K_reference(src + j, y, k);
+        block_q6_k * restrict y = (block_q6_k *)dst + j/QK_K;
+        quantize_row_q6_k_reference(src + j, y, k);
    }
-    return (n/QK_K*sizeof(block_q6_K));
+    return (n/QK_K*sizeof(block_q6_k));
 }

 //===================================== Q8_K ==============================================

-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
+void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@ -933,7 +933,7 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict
    }
 }

-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@ -944,8 +944,8 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int
    }
 }

-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
-    quantize_row_q8_K_reference(x, y, k);
+void quantize_row_q8_k(const float * restrict x, void * restrict y, int k) {
+    quantize_row_q8_k_reference(x, y, k);
 }

 //===================================== Dot ptoducts =================================
@ -1002,10 +1002,10 @@ static inline __m128i get_scale_shuffle(int i) {
 }
 #endif

-void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q2_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {

-    const block_q2_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q2_k * restrict x = vx;
+    const block_q8_k * restrict y = vy;

    const int nb = n / QK_K;

@ -1201,14 +1201,14 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 }

-void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q3_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    assert(n % QK_K == 0);

    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;

-    const block_q3_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q3_k * restrict x = vx;
+    const block_q8_k * restrict y = vy;

    const int nb = n / QK_K;

@ -1501,11 +1501,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

 }

-void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    assert(n % QK_K == 0);

-    const block_q4_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q4_k * restrict x = vx;
+    const block_q8_k * restrict y = vy;

    const int nb = n / QK_K;

@ -1727,11 +1727,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 }

-void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    assert(n % QK_K == 0);

-    const block_q5_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q5_k * restrict x = vx;
+    const block_q8_k * restrict y = vy;

    const int nb = n / QK_K;

@ -1974,11 +1974,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri



-void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q6_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    assert(n % QK_K == 0);

-    const block_q6_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q6_k * restrict x = vx;
+    const block_q8_k * restrict y = vy;

    const int nb = n / QK_K;

--- a/ggml-quants-k.h
+++ b/ggml-quants-k.h
@ -22,8 +22,8 @@ typedef struct {
    uint8_t qs[QK_K/4];      // quants
    ggml_fp16_t d;           // super-block scale for quantized scales
    ggml_fp16_t dmin;        // super-block scale for quantized mins
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+} block_q2_k;
+static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding");

 // 3-bit quantization
 // weight is represented as x = a * q
@ -34,8 +34,8 @@ typedef struct {
    uint8_t qs[QK_K/4];        // quants - low 2 bits
    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
    ggml_fp16_t d;             // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
+} block_q3_k;
+static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding");

 // 4-bit quantization
 // 16 blocks of 32 elements each
@ -46,8 +46,8 @@ typedef struct {
    ggml_fp16_t dmin;          // super-block scale for quantized mins
    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+} block_q4_k;
+static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding");

 // 5-bit quantization
 // 16 blocks of 32 elements each
@ -59,8 +59,8 @@ typedef struct {
    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
    uint8_t qh[QK_K/8];          // quants, high bit
    uint8_t qs[QK_K/2];          // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+} block_q5_k;
+static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding");

 // 6-bit quantization
 // weight is represented as x = a * q
@ -71,52 +71,52 @@ typedef struct {
    uint8_t qh[QK_K/4];      // quants, upper 2 bits
    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
    ggml_fp16_t d;           // super-block scale
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
+} block_q6_k;
+static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_k block size/padding");

 // This is only used for intermediate quantization and dot products
 typedef struct {
    float   d;              // delta
    int8_t  qs[QK_K];       // quants
    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
-} block_q8_K;
-static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
+} block_q8_k;
+static_assert(sizeof(block_q8_k) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_k block size/padding");


 // Quantization
-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k);
+void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k);
+void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k);
+void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k);
+void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k);
+void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k);

-void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q2_k(const float * restrict x, void * restrict y, int k);
+void quantize_row_q3_k(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_k(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_k(const float * restrict x, void * restrict y, int k);
+void quantize_row_q6_k(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_k(const float * restrict x, void * restrict y, int k);

 // Dequantization
-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k);
+void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k);
+void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k);
+void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k);
+void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k);
+void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k);

 // Dot product
-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q2_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q3_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q6_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);

 // Quantization with histogram collection
-size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q2_k(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q3_k(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_k(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q5_k(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist);

--- a/ggml.c
+++ b/ggml.c
@ -2,7 +2,7 @@
 #define _GNU_SOURCE

 #include "ggml.h"
-#include "k_quants.h"
+#include "ggml-quants-k.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@ -1567,43 +1567,43 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_1,
    },
    [GGML_TYPE_Q2_K] = {
-        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q2_K,
-        .quantize_row_q           = quantize_row_q2_K,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
-        .quantize_row_q_dot       = quantize_row_q8_K,
-        .vec_dot_q                = ggml_vec_dot_q2_K_q8_K,
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q2_k,
+        .quantize_row_q           = quantize_row_q2_k,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_k_reference,
+        .quantize_row_q_dot       = quantize_row_q8_k,
+        .vec_dot_q                = ggml_vec_dot_q2_k_q8_k,
        .vec_dot_type             = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q3_K] = {
-        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q3_K,
-        .quantize_row_q           = quantize_row_q3_K,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
-        .quantize_row_q_dot       = quantize_row_q8_K,
-        .vec_dot_q                = ggml_vec_dot_q3_K_q8_K,
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q3_k,
+        .quantize_row_q           = quantize_row_q3_k,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_k_reference,
+        .quantize_row_q_dot       = quantize_row_q8_k,
+        .vec_dot_q                = ggml_vec_dot_q3_k_q8_k,
        .vec_dot_type             = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q4_K] = {
-        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q4_K,
-        .quantize_row_q           = quantize_row_q4_K,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
-        .quantize_row_q_dot       = quantize_row_q8_K,
-        .vec_dot_q                = ggml_vec_dot_q4_K_q8_K,
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q4_k,
+        .quantize_row_q           = quantize_row_q4_k,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_k_reference,
+        .quantize_row_q_dot       = quantize_row_q8_k,
+        .vec_dot_q                = ggml_vec_dot_q4_k_q8_k,
        .vec_dot_type             = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q5_K] = {
-        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q5_K,
-        .quantize_row_q           = quantize_row_q5_K,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
-        .quantize_row_q_dot       = quantize_row_q8_K,
-        .vec_dot_q                = ggml_vec_dot_q5_K_q8_K,
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q5_k,
+        .quantize_row_q           = quantize_row_q5_k,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_k_reference,
+        .quantize_row_q_dot       = quantize_row_q8_k,
+        .vec_dot_q                = ggml_vec_dot_q5_k_q8_k,
        .vec_dot_type             = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q6_K] = {
-        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q6_K,
-        .quantize_row_q           = quantize_row_q6_K,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
-        .quantize_row_q_dot       = quantize_row_q8_K,
-        .vec_dot_q                = ggml_vec_dot_q6_K_q8_K,
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q6_k,
+        .quantize_row_q           = quantize_row_q6_k,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_k_reference,
+        .quantize_row_q_dot       = quantize_row_q8_k,
+        .vec_dot_q                = ggml_vec_dot_q6_k_q8_k,
        .vec_dot_type             = GGML_TYPE_Q8_K,
    },
 };
@ -3506,12 +3506,12 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
    [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
    [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
-    [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
-    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
-    [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
-    [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
-    [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
-    [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
+    [GGML_TYPE_Q2_K] = sizeof(block_q2_k),
+    [GGML_TYPE_Q3_K] = sizeof(block_q3_k),
+    [GGML_TYPE_Q4_K] = sizeof(block_q4_k),
+    [GGML_TYPE_Q5_K] = sizeof(block_q5_k),
+    [GGML_TYPE_Q6_K] = sizeof(block_q6_k),
+    [GGML_TYPE_Q8_K] = sizeof(block_q8_k),
    [GGML_TYPE_I8]   = sizeof(int8_t),
    [GGML_TYPE_I16]  = sizeof(int16_t),
    [GGML_TYPE_I32]  = sizeof(int32_t),
@ -3528,12 +3528,12 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
    [GGML_TYPE_Q5_1] = "q5_1",
    [GGML_TYPE_Q8_0] = "q8_0",
    [GGML_TYPE_Q8_1] = "q8_1",
-    [GGML_TYPE_Q2_K] = "q2_K",
-    [GGML_TYPE_Q3_K] = "q3_K",
-    [GGML_TYPE_Q4_K] = "q4_K",
-    [GGML_TYPE_Q5_K] = "q5_K",
-    [GGML_TYPE_Q6_K] = "q6_K",
-    [GGML_TYPE_Q8_K] = "q8_K",
+    [GGML_TYPE_Q2_K] = "q2_k",
+    [GGML_TYPE_Q3_K] = "q3_k",
+    [GGML_TYPE_Q4_K] = "q4_k",
+    [GGML_TYPE_Q5_K] = "q5_k",
+    [GGML_TYPE_Q6_K] = "q6_k",
+    [GGML_TYPE_Q8_K] = "q8_k",
    [GGML_TYPE_I8]   = "i8",
    [GGML_TYPE_I16]  = "i16",
    [GGML_TYPE_I32]  = "i32",
@ -16185,32 +16185,32 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
        case GGML_TYPE_Q2_K:
            {
                GGML_ASSERT(start % QK_K == 0);
-                block_q2_K * block = (block_q2_K*)dst + start / QK_K;
-                result = ggml_quantize_q2_K(src + start, block, n, n, hist);
+                block_q2_k * block = (block_q2_k*)dst + start / QK_K;
+                result = ggml_quantize_q2_k(src + start, block, n, n, hist);
            } break;
        case GGML_TYPE_Q3_K:
            {
                GGML_ASSERT(start % QK_K == 0);
-                block_q3_K * block = (block_q3_K*)dst + start / QK_K;
-                result = ggml_quantize_q3_K(src + start, block, n, n, hist);
+                block_q3_k * block = (block_q3_k*)dst + start / QK_K;
+                result = ggml_quantize_q3_k(src + start, block, n, n, hist);
            } break;
        case GGML_TYPE_Q4_K:
            {
                GGML_ASSERT(start % QK_K == 0);
-                block_q4_K * block = (block_q4_K*)dst + start / QK_K;
-                result = ggml_quantize_q4_K(src + start, block, n, n, hist);
+                block_q4_k * block = (block_q4_k*)dst + start / QK_K;
+                result = ggml_quantize_q4_k(src + start, block, n, n, hist);
            } break;
        case GGML_TYPE_Q5_K:
            {
                GGML_ASSERT(start % QK_K == 0);
-                block_q5_K * block = (block_q5_K*)dst + start / QK_K;
-                result = ggml_quantize_q5_K(src + start, block, n, n, hist);
+                block_q5_k * block = (block_q5_k*)dst + start / QK_K;
+                result = ggml_quantize_q5_k(src + start, block, n, n, hist);
            } break;
        case GGML_TYPE_Q6_K:
            {
                GGML_ASSERT(start % QK_K == 0);
-                block_q6_K * block = (block_q6_K*)dst + start / QK_K;
-                result = ggml_quantize_q6_K(src + start, block, n, n, hist);
+                block_q6_k * block = (block_q6_k*)dst + start / QK_K;
+                result = ggml_quantize_q6_k(src + start, block, n, n, hist);
            } break;
        default:
            assert(false);