From 5c64a0952ee58b2d742ee84e8e3d43cce5d366db Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 7 Jun 2023 10:59:52 +0300 Subject: [PATCH 1/5] k-quants : allow to optionally disable at compile time (#1734) * k-quants : put behind optional compile flag LLAMA_K_QUANTS * build : enable k-quants by default --- CMakeLists.txt | 8 ++- Makefile | 37 ++++++---- ggml-cuda.cu | 120 ++++++++++++++++---------------- ggml.c | 107 ++++++++++++++++------------- ggml-quants-k.c => k_quants.c | 126 +++++++++++++++++----------------- ggml-quants-k.h => k_quants.h | 80 ++++++++++----------- 6 files changed, 250 insertions(+), 228 deletions(-) rename ggml-quants-k.c => k_quants.c (95%) rename ggml-quants-k.h => k_quants.h (52%) diff --git a/CMakeLists.txt b/CMakeLists.txt index da5913d69..456875f90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" OFF) +option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -226,6 +227,10 @@ if (LLAMA_METAL) ) endif() +if (LLAMA_K_QUANTS) + set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h) +endif() + if (LLAMA_CLBLAST) find_package(CLBlast) if (CLBlast_FOUND) @@ -396,11 +401,10 @@ endif() add_library(ggml OBJECT ggml.c ggml.h - ggml-quants-k.h - ggml-quants-k.c ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_SOURCES_METAL} + ${GGML_SOURCES_EXTRA} ) target_include_directories(ggml PUBLIC .) diff --git a/Makefile b/Makefile index 0205f1959..39265164b 100644 --- a/Makefile +++ b/Makefile @@ -121,6 +121,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),) endif endif +ifndef LLAMA_NO_K_QUANTS + CFLAGS += -DGGML_USE_K_QUANTS + OBJS += k_quants.o +endif + ifndef LLAMA_NO_ACCELERATE # Mac M1 - include Accelerate framework. # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time). @@ -140,7 +145,7 @@ ifdef LLAMA_OPENBLAS endif # LLAMA_OPENBLAS ifdef LLAMA_BLIS - CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis + CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis LDFLAGS += -lblis -L/usr/local/lib endif # LLAMA_BLIS @@ -212,6 +217,11 @@ ifneq ($(filter armv8%,$(UNAME_M)),) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif +ifdef LLAMA_NO_K_QUANTS +k_quants.o: k_quants.c k_quants.h + $(CC) $(CFLAGS) -c $< -o $@ +endif # LLAMA_NO_K_QUANTS + # # Print build information # @@ -231,10 +241,7 @@ $(info ) # Build library # -ggml.o: ggml.c ggml.h ggml-cuda.h ggml-quants-k.h - $(CC) $(CFLAGS) -c $< -o $@ - -ggml-quants-k.o: ggml-quants-k.c ggml-quants-k.h ggml.h ggml-cuda.h +ggml.o: ggml.c ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h @@ -243,7 +250,7 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c $< -o $@ -libllama.so: llama.o ggml.o ggml-quants-k.o $(OBJS) +libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: @@ -253,28 +260,28 @@ clean: # Examples # -main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS) +main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -quantize: examples/quantize/quantize.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS) +quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS) +perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS) +embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS) +save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS) +server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) build-info.h: $(wildcard .git/index) scripts/build-info.sh @@ -289,11 +296,11 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh # Tests # -benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o ggml-quants-k.o $(OBJS) +benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ./$@ -vdot: pocs/vdot/vdot.cpp ggml.o ggml-quants-k.o $(OBJS) +vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) .PHONY: tests clean diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c7008905e..b1e513bc9 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -110,24 +110,24 @@ typedef struct { uint8_t qs[QK_K/4]; // quants half d; // super-block scale for quantized scales half dmin; // super-block scale for quantized mins -} block_q2_k; -static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding"); +} block_q2_K; +static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); typedef struct { uint8_t hmask[QK_K/8]; uint8_t qs[QK_K/4]; // nibbles / quants uint8_t scales[3*QK_K/64]; half d; -} block_q3_k; -static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding"); +} block_q3_K; +static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding"); typedef struct { half d; // super-block scale for quantized scales half dmin; // super-block scale for quantized mins uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_k; -static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding"); +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); typedef struct { half d; // super-block scale for quantized scales @@ -135,16 +135,16 @@ typedef struct { uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_k; -static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding"); +} block_q5_K; +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); typedef struct { uint8_t ql[QK_K/2]; // quants, lower 4 bits uint8_t qh[QK_K/4]; // quants, upper 2 bits int8_t scales[QK_K/16]; // scales half d; // delta -} block_q6_k; -static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding"); +} block_q6_K; +static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); #define WARP_SIZE 32 @@ -299,7 +299,7 @@ static __device__ void dequantize_q8_0(const void * vx, const int ib, const int //================================== k-quants -static __global__ void dequantize_block_q2_k(const void * vx, float * yy) { +static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { const int i = blockIdx.x; const int tid = threadIdx.x; @@ -307,7 +307,7 @@ static __global__ void dequantize_block_q2_k(const void * vx, float * yy) { const int l = tid - 32*n; const int is = 8*n + l/16; - const block_q2_k * x = (const block_q2_k *) vx; + const block_q2_K * x = (const block_q2_K *) vx; const uint8_t q = x[i].qs[32*n + l]; float * y = yy + i*QK_K + 128*n; @@ -321,9 +321,9 @@ static __global__ void dequantize_block_q2_k(const void * vx, float * yy) { } -static __device__ void vec_dot_q2_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q2_k * x = (const block_q2_k *) vx; + const block_q2_K * x = (const block_q2_K *) vx; // if n is 0, we want to do the lower 128, else the upper 128, // covering y[l+0], y[l+32], y[l+64], y[l+96] and @@ -352,7 +352,7 @@ static __device__ void vec_dot_q2_k(const void * vx, const int ib, const int iqs } -static __global__ void dequantize_block_q3_k(const void * vx, float * yy) { +static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { int r = threadIdx.x/4; int i = blockIdx.x; @@ -362,7 +362,7 @@ static __global__ void dequantize_block_q3_k(const void * vx, float * yy) { int n = tid / 4; int j = tid - 4*n; - const block_q3_k * x = (const block_q3_k *) vx; + const block_q3_K * x = (const block_q3_K *) vx; uint8_t m = 1 << (4*n + j); int is = 8*n + 2*j + is0; @@ -383,9 +383,9 @@ static __global__ void dequantize_block_q3_k(const void * vx, float * yy) { } -static __device__ void vec_dot_q3_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q3_k * x = (const block_q3_k *) vx; + const block_q3_K * x = (const block_q3_K *) vx; const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; @@ -437,8 +437,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t } } -static __global__ void dequantize_block_q4_k(const void * vx, float * yy) { - const block_q4_k * x = (const block_q4_k *) vx; +static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { + const block_q4_K * x = (const block_q4_K *) vx; const int i = blockIdx.x; @@ -474,9 +474,9 @@ static __global__ void dequantize_block_q4_k(const void * vx, float * yy) { } } -static __device__ void vec_dot_q4_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q4_k * x = (const block_q4_k *) vx; + const block_q4_K * x = (const block_q4_K *) vx; // iqs is in 0...248 in steps of 8 => const int j = iqs / 64; // j is in 0...3 @@ -506,8 +506,8 @@ static __device__ void vec_dot_q4_k(const void * vx, const int ib, const int iqs } -static __global__ void dequantize_block_q5_k(const void * vx, float * yy) { - const block_q5_k * x = (const block_q5_k *) vx; +static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { + const block_q5_K * x = (const block_q5_K *) vx; const int i = blockIdx.x; @@ -539,9 +539,9 @@ static __global__ void dequantize_block_q5_k(const void * vx, float * yy) { y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; } -static __device__ void vec_dot_q5_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q5_k * x = (const block_q5_k *) vx; + const block_q5_K * x = (const block_q5_K *) vx; // iqs is in 0...248 in steps of 8 => const int j = iqs / 64; // j is in 0...3 @@ -576,8 +576,8 @@ static __device__ void vec_dot_q5_k(const void * vx, const int ib, const int iqs } -static __global__ void dequantize_block_q6_k(const void * vx, float * yy) { - const block_q6_k * x = (const block_q6_k *) vx; +static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { + const block_q6_K * x = (const block_q6_K *) vx; const int i = blockIdx.x; @@ -601,9 +601,9 @@ static __global__ void dequantize_block_q6_k(const void * vx, float * yy) { y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); } -static __device__ void vec_dot_q6_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q6_k * x = (const block_q6_k *) vx; + const block_q6_K * x = (const block_q6_K *) vx; const int ip = iqs / 128; // 0 or 1 const int il = (iqs - 128*ip)/8; // 0...15 @@ -804,29 +804,29 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q2_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q2_k<<>>(vx, y); + dequantize_block_q2_K<<>>(vx, y); } -static void dequantize_row_q3_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q3_k<<>>(vx, y); + dequantize_block_q3_K<<>>(vx, y); } -static void dequantize_row_q4_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q4_k<<>>(vx, y); + dequantize_block_q4_K<<>>(vx, y); } -static void dequantize_row_q5_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q5_k<<>>(vx, y); + dequantize_block_q5_K<<>>(vx, y); } -static void dequantize_row_q6_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q6_k<<>>(vx, y); + dequantize_block_q6_K<<>>(vx, y); } static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { @@ -869,35 +869,35 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f <<>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q2_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const int ny = 2; const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q2_k><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q3_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q3_k><<>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q4_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q4_k><<>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q5_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q5_k><<>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q6_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q6_k><<>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<>>(vx, y, dst, ncols); } static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -926,15 +926,15 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { case GGML_TYPE_Q8_0: return dequantize_row_q8_0_cuda; case GGML_TYPE_Q2_K: - return dequantize_row_q2_k_cuda; + return dequantize_row_q2_K_cuda; case GGML_TYPE_Q3_K: - return dequantize_row_q3_k_cuda; + return dequantize_row_q3_K_cuda; case GGML_TYPE_Q4_K: - return dequantize_row_q4_k_cuda; + return dequantize_row_q4_K_cuda; case GGML_TYPE_Q5_K: - return dequantize_row_q5_k_cuda; + return dequantize_row_q5_K_cuda; case GGML_TYPE_Q6_K: - return dequantize_row_q6_k_cuda; + return dequantize_row_q6_K_cuda; case GGML_TYPE_F16: return convert_fp16_to_fp32_cuda; default: @@ -1277,19 +1277,19 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q2_K: - dequantize_mul_mat_vec_q2_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q3_K: - dequantize_mul_mat_vec_q3_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q4_K: - dequantize_mul_mat_vec_q4_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q5_K: - dequantize_mul_mat_vec_q5_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q6_K: - dequantize_mul_mat_vec_q6_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_F16: convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); diff --git a/ggml.c b/ggml.c index 045768faf..34212b841 100644 --- a/ggml.c +++ b/ggml.c @@ -2,7 +2,10 @@ #define _GNU_SOURCE #include "ggml.h" -#include "ggml-quants-k.h" + +#ifdef GGML_USE_K_QUANTS +#include "k_quants.h" +#endif #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -1580,46 +1583,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_q = NULL, // TODO .vec_dot_type = GGML_TYPE_Q8_1, }, +#ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_k, - .quantize_row_q = quantize_row_q2_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q2_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K, + .quantize_row_q = quantize_row_q2_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q3_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_k, - .quantize_row_q = quantize_row_q3_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q3_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K, + .quantize_row_q = quantize_row_q3_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q4_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_k, - .quantize_row_q = quantize_row_q4_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q4_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K, + .quantize_row_q = quantize_row_q4_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q5_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_k, - .quantize_row_q = quantize_row_q5_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q5_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K, + .quantize_row_q = quantize_row_q5_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q6_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_k, - .quantize_row_q = quantize_row_q6_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q6_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K, + .quantize_row_q = quantize_row_q6_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, +#endif }; // For internal test use @@ -3499,12 +3504,14 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = QK5_1, [GGML_TYPE_Q8_0] = QK8_0, [GGML_TYPE_Q8_1] = QK8_1, +#ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = QK_K, [GGML_TYPE_Q3_K] = QK_K, [GGML_TYPE_Q4_K] = QK_K, [GGML_TYPE_Q5_K] = QK_K, [GGML_TYPE_Q6_K] = QK_K, [GGML_TYPE_Q8_K] = QK_K, +#endif [GGML_TYPE_I8] = 1, [GGML_TYPE_I16] = 1, [GGML_TYPE_I32] = 1, @@ -3520,12 +3527,14 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = sizeof(block_q5_1), [GGML_TYPE_Q8_0] = sizeof(block_q8_0), [GGML_TYPE_Q8_1] = sizeof(block_q8_1), - [GGML_TYPE_Q2_K] = sizeof(block_q2_k), - [GGML_TYPE_Q3_K] = sizeof(block_q3_k), - [GGML_TYPE_Q4_K] = sizeof(block_q4_k), - [GGML_TYPE_Q5_K] = sizeof(block_q5_k), - [GGML_TYPE_Q6_K] = sizeof(block_q6_k), - [GGML_TYPE_Q8_K] = sizeof(block_q8_k), +#ifdef GGML_USE_K_QUANTS + [GGML_TYPE_Q2_K] = sizeof(block_q2_K), + [GGML_TYPE_Q3_K] = sizeof(block_q3_K), + [GGML_TYPE_Q4_K] = sizeof(block_q4_K), + [GGML_TYPE_Q5_K] = sizeof(block_q5_K), + [GGML_TYPE_Q6_K] = sizeof(block_q6_K), + [GGML_TYPE_Q8_K] = sizeof(block_q8_K), +#endif [GGML_TYPE_I8] = sizeof(int8_t), [GGML_TYPE_I16] = sizeof(int16_t), [GGML_TYPE_I32] = sizeof(int32_t), @@ -3542,12 +3551,12 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = "q5_1", [GGML_TYPE_Q8_0] = "q8_0", [GGML_TYPE_Q8_1] = "q8_1", - [GGML_TYPE_Q2_K] = "q2_k", - [GGML_TYPE_Q3_K] = "q3_k", - [GGML_TYPE_Q4_K] = "q4_k", - [GGML_TYPE_Q5_K] = "q5_k", - [GGML_TYPE_Q6_K] = "q6_k", - [GGML_TYPE_Q8_K] = "q8_k", + [GGML_TYPE_Q2_K] = "q2_K", + [GGML_TYPE_Q3_K] = "q3_K", + [GGML_TYPE_Q4_K] = "q4_K", + [GGML_TYPE_Q5_K] = "q5_K", + [GGML_TYPE_Q6_K] = "q6_K", + [GGML_TYPE_Q8_K] = "q8_K", [GGML_TYPE_I8] = "i8", [GGML_TYPE_I16] = "i16", [GGML_TYPE_I32] = "i32", @@ -16249,36 +16258,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; result = ggml_quantize_q8_0(src + start, block, n, n, hist); } break; +#ifdef GGML_USE_K_QUANTS case GGML_TYPE_Q2_K: { GGML_ASSERT(start % QK_K == 0); - block_q2_k * block = (block_q2_k*)dst + start / QK_K; - result = ggml_quantize_q2_k(src + start, block, n, n, hist); + block_q2_K * block = (block_q2_K*)dst + start / QK_K; + result = ggml_quantize_q2_K(src + start, block, n, n, hist); } break; case GGML_TYPE_Q3_K: { GGML_ASSERT(start % QK_K == 0); - block_q3_k * block = (block_q3_k*)dst + start / QK_K; - result = ggml_quantize_q3_k(src + start, block, n, n, hist); + block_q3_K * block = (block_q3_K*)dst + start / QK_K; + result = ggml_quantize_q3_K(src + start, block, n, n, hist); } break; case GGML_TYPE_Q4_K: { GGML_ASSERT(start % QK_K == 0); - block_q4_k * block = (block_q4_k*)dst + start / QK_K; - result = ggml_quantize_q4_k(src + start, block, n, n, hist); + block_q4_K * block = (block_q4_K*)dst + start / QK_K; + result = ggml_quantize_q4_K(src + start, block, n, n, hist); } break; case GGML_TYPE_Q5_K: { GGML_ASSERT(start % QK_K == 0); - block_q5_k * block = (block_q5_k*)dst + start / QK_K; - result = ggml_quantize_q5_k(src + start, block, n, n, hist); + block_q5_K * block = (block_q5_K*)dst + start / QK_K; + result = ggml_quantize_q5_K(src + start, block, n, n, hist); } break; case GGML_TYPE_Q6_K: { GGML_ASSERT(start % QK_K == 0); - block_q6_k * block = (block_q6_k*)dst + start / QK_K; - result = ggml_quantize_q6_k(src + start, block, n, n, hist); + block_q6_K * block = (block_q6_K*)dst + start / QK_K; + result = ggml_quantize_q6_K(src + start, block, n, n, hist); } break; +#endif default: assert(false); } diff --git a/ggml-quants-k.c b/k_quants.c similarity index 95% rename from ggml-quants-k.c rename to k_quants.c index dec00d371..4d524494d 100644 --- a/ggml-quants-k.c +++ b/k_quants.c @@ -1,4 +1,4 @@ -#include "ggml-quants-k.h" +#include "k_quants.h" #include "ggml.h" #include @@ -272,7 +272,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k) { +void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -341,7 +341,7 @@ void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict } } -void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k) { +void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -374,26 +374,26 @@ void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int } } -void quantize_row_q2_k(const float * restrict x, void * restrict vy, int k) { - quantize_row_q2_k_reference(x, vy, k); +void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { + quantize_row_q2_K_reference(x, vy, k); } -size_t ggml_quantize_q2_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { const int nb = k / QK_K; // TODO - collect histograms - although, at a second thought, I don't really care about them (void)hist; for (int j = 0; j < nb; j += k) { - block_q2_k * restrict y = (block_q2_k *)dst + j/QK_K; - quantize_row_q2_k_reference(src + j, y, k); + block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; + quantize_row_q2_K_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q2_k)); + return (n/QK_K*sizeof(block_q2_K)); } //========================= 3-bit (de)-quantization -void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k) { +void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -469,7 +469,7 @@ void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict } } -void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); assert(QK_K == 256); const int nb = k / QK_K; @@ -520,26 +520,26 @@ void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int } } -void quantize_row_q3_k(const float * restrict x, void * restrict vy, int k) { - quantize_row_q3_k_reference(x, vy, k); +void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { + quantize_row_q3_K_reference(x, vy, k); } -size_t ggml_quantize_q3_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { const int nb = k / QK_K; // TODO - collect histograms - although, at a second thought, I don't really care about them (void)hist; for (int j = 0; j < nb; j += k) { - block_q3_k * restrict y = (block_q3_k *)dst + j/QK_K; - quantize_row_q3_k_reference(src + j, y, k); + block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; + quantize_row_q3_K_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q3_k)); + return (n/QK_K*sizeof(block_q3_K)); } // ====================== 4-bit (de)-quantization -void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k) { +void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -604,7 +604,7 @@ void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict } } -void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k) { +void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -630,26 +630,26 @@ void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int } } -void quantize_row_q4_k(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { assert(k % QK_K == 0); - block_q4_k * restrict y = vy; - quantize_row_q4_k_reference(x, y, k); + block_q4_K * restrict y = vy; + quantize_row_q4_K_reference(x, y, k); } -size_t ggml_quantize_q4_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { assert(k % QK_K == 0); const int nb = k / QK_K; (void)hist; // TODO: collect histograms for (int j = 0; j < nb; j += k) { - block_q4_k * restrict y = (block_q4_k *)dst + j/QK_K; - quantize_row_q4_k_reference(src + j, y, k); + block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; + quantize_row_q4_K_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q4_k)); + return (n/QK_K*sizeof(block_q4_K)); } // ====================== 5-bit (de)-quantization -void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k) { +void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -731,7 +731,7 @@ void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict } } -void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k) { +void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -759,26 +759,26 @@ void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int } } -void quantize_row_q5_k(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { assert(k % QK_K == 0); - block_q5_k * restrict y = vy; - quantize_row_q5_k_reference(x, y, k); + block_q5_K * restrict y = vy; + quantize_row_q5_K_reference(x, y, k); } -size_t ggml_quantize_q5_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { assert(k % QK_K == 0); const int nb = k / QK_K; (void)hist; for (int j = 0; j < nb; j += k) { - block_q5_k * restrict y = (block_q5_k *)dst + j/QK_K; - quantize_row_q5_k_reference(src + j, y, k); + block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; + quantize_row_q5_K_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q5_k)); + return (n/QK_K*sizeof(block_q5_K)); } // ====================== 6-bit (de)-quantization -void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k) { +void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -842,7 +842,7 @@ void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict } } -void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k) { +void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -875,28 +875,28 @@ void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int } } -void quantize_row_q6_k(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { assert(k % QK_K == 0); - block_q6_k * restrict y = vy; - quantize_row_q6_k_reference(x, y, k); + block_q6_K * restrict y = vy; + quantize_row_q6_K_reference(x, y, k); } -size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK_K == 0); const int nb = k / QK_K; (void)hist; // TODO for (int j = 0; j < nb; j += k) { - block_q6_k * restrict y = (block_q6_k *)dst + j/QK_K; - quantize_row_q6_k_reference(src + j, y, k); + block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; + quantize_row_q6_K_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q6_k)); + return (n/QK_K*sizeof(block_q6_K)); } //===================================== Q8_K ============================================== -void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k) { +void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -933,7 +933,7 @@ void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict } } -void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k) { +void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -944,8 +944,8 @@ void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int } } -void quantize_row_q8_k(const float * restrict x, void * restrict y, int k) { - quantize_row_q8_k_reference(x, y, k); +void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { + quantize_row_q8_K_reference(x, y, k); } //===================================== Dot ptoducts ================================= @@ -1002,10 +1002,10 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q2_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const block_q2_k * restrict x = vx; - const block_q8_k * restrict y = vy; + const block_q2_K * restrict x = vx; + const block_q8_K * restrict y = vy; const int nb = n / QK_K; @@ -1201,14 +1201,14 @@ void ggml_vec_dot_q2_k_q8_k(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q3_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - const block_q3_k * restrict x = vx; - const block_q8_k * restrict y = vy; + const block_q3_K * restrict x = vx; + const block_q8_K * restrict y = vy; const int nb = n / QK_K; @@ -1501,11 +1501,11 @@ void ggml_vec_dot_q3_k_q8_k(const int n, float * restrict s, const void * restri } -void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); - const block_q4_k * restrict x = vx; - const block_q8_k * restrict y = vy; + const block_q4_K * restrict x = vx; + const block_q8_K * restrict y = vy; const int nb = n / QK_K; @@ -1727,11 +1727,11 @@ void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); - const block_q5_k * restrict x = vx; - const block_q8_k * restrict y = vy; + const block_q5_K * restrict x = vx; + const block_q8_K * restrict y = vy; const int nb = n / QK_K; @@ -1974,11 +1974,11 @@ void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restri -void ggml_vec_dot_q6_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); - const block_q6_k * restrict x = vx; - const block_q8_k * restrict y = vy; + const block_q6_K * restrict x = vx; + const block_q8_K * restrict y = vy; const int nb = n / QK_K; diff --git a/ggml-quants-k.h b/k_quants.h similarity index 52% rename from ggml-quants-k.h rename to k_quants.h index d6f06013b..10a0baac7 100644 --- a/ggml-quants-k.h +++ b/k_quants.h @@ -22,8 +22,8 @@ typedef struct { uint8_t qs[QK_K/4]; // quants ggml_fp16_t d; // super-block scale for quantized scales ggml_fp16_t dmin; // super-block scale for quantized mins -} block_q2_k; -static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding"); +} block_q2_K; +static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); // 3-bit quantization // weight is represented as x = a * q @@ -34,8 +34,8 @@ typedef struct { uint8_t qs[QK_K/4]; // quants - low 2 bits uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits ggml_fp16_t d; // super-block scale -} block_q3_k; -static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding"); +} block_q3_K; +static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding"); // 4-bit quantization // 16 blocks of 32 elements each @@ -46,8 +46,8 @@ typedef struct { ggml_fp16_t dmin; // super-block scale for quantized mins uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_k; -static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding"); +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); // 5-bit quantization // 16 blocks of 32 elements each @@ -59,8 +59,8 @@ typedef struct { uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_k; -static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding"); +} block_q5_K; +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); // 6-bit quantization // weight is represented as x = a * q @@ -71,52 +71,52 @@ typedef struct { uint8_t qh[QK_K/4]; // quants, upper 2 bits int8_t scales[QK_K/16]; // scales, quantized with 8 bits ggml_fp16_t d; // super-block scale -} block_q6_k; -static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_k block size/padding"); +} block_q6_K; +static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding"); // This is only used for intermediate quantization and dot products typedef struct { float d; // delta int8_t qs[QK_K]; // quants int16_t bsums[QK_K/16]; // sum of quants in groups of 16 -} block_q8_k; -static_assert(sizeof(block_q8_k) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_k block size/padding"); +} block_q8_K; +static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding"); // Quantization -void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k); -void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k); -void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k); -void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k); -void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k); -void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k); +void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); +void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); +void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); +void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); +void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); +void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); -void quantize_row_q2_k(const float * restrict x, void * restrict y, int k); -void quantize_row_q3_k(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_k(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_k(const float * restrict x, void * restrict y, int k); -void quantize_row_q6_k(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_k(const float * restrict x, void * restrict y, int k); +void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); // Dequantization -void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k); -void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k); -void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k); -void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k); -void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k); -void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k); +void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); +void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); +void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); +void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k); +void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); +void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); // Dot product -void ggml_vec_dot_q2_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q3_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q6_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); // Quantization with histogram collection -size_t ggml_quantize_q2_k(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q3_k(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_k(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q5_k(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); From 0035858273ebe0694926bf4414d279f3e1cd109d Mon Sep 17 00:00:00 2001 From: johnson442 <56517414+johnson442@users.noreply.github.com> Date: Thu, 8 Jun 2023 08:02:48 +0100 Subject: [PATCH 2/5] k-quants : add missing compile definition to CMakeLists (#1748) --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 456875f90..41f5bb737 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -229,6 +229,7 @@ endif() if (LLAMA_K_QUANTS) set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h) + add_compile_definitions(GGML_USE_K_QUANTS) endif() if (LLAMA_CLBLAST) From 4161bdc04debb70bf5f275492b4d89fd9330087c Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Thu, 8 Jun 2023 10:08:23 +0300 Subject: [PATCH 3/5] metal : add Q4_K implementation (#1733) * Metal implementation for Q4_K Very slow for now: 42 ms / token, Q4_0 runs in 28 ms/token on my 30-core M2 Max GPU. * Optimizing Q4_K on metal The first token always takes longer, I guess because the metal kernel is being jit-compiled. So, using n = 128 to measure time. At this point Q4_K takes 29.5 ms / token compared to 27.2 ms / token for Q4_0. Quite a bit better than the initial attempt, but still not good enough. * Optimizing q4_K metal dot some more For n = 256 it is now 28.1 ms/token compared to 27 ms/token for q4_0. * Fix after merge with master --------- Co-authored-by: Iwan Kawrakow --- .clang-tidy | 18 ------ ggml-metal.m | 23 ++++++- ggml-metal.metal | 162 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 184 insertions(+), 19 deletions(-) delete mode 100644 .clang-tidy diff --git a/.clang-tidy b/.clang-tidy deleted file mode 100644 index 1a42b9abc..000000000 --- a/.clang-tidy +++ /dev/null @@ -1,18 +0,0 @@ ---- -Checks: > - bugprone-*, - -bugprone-easily-swappable-parameters, - -bugprone-implicit-widening-of-multiplication-result, - -bugprone-narrowing-conversions, - readability-*, - -readability-avoid-unconditional-preprocessor-if, - -readability-function-cognitive-complexity, - -readability-identifier-length, - -readability-implicit-bool-conversion, - -readability-magic-numbers, - -readability-uppercase-literal-suffix, - clang-analyzer-*, - -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, - performance-*, - portability-*, -FormatStyle: none diff --git a/ggml-metal.m b/ggml-metal.m index 0953af6a4..f2a637b7a 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -49,9 +49,11 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(diag_mask_inf); GGML_METAL_DECL_KERNEL(get_rows_f16); GGML_METAL_DECL_KERNEL(get_rows_q4_0); + GGML_METAL_DECL_KERNEL(get_rows_q4_k); GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32); GGML_METAL_DECL_KERNEL(rope); GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f32); @@ -133,9 +135,11 @@ struct ggml_metal_context * ggml_metal_init(void) { GGML_METAL_ADD_KERNEL(diag_mask_inf); GGML_METAL_ADD_KERNEL(get_rows_f16); GGML_METAL_ADD_KERNEL(get_rows_q4_0); + GGML_METAL_ADD_KERNEL(get_rows_q4_k); GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32); GGML_METAL_ADD_KERNEL(rope); GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f32); @@ -517,7 +521,20 @@ void ggml_metal_graph_compute( nth1 = 4; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32]; } break; - default: GGML_ASSERT(false && "not implemented"); + case GGML_TYPE_Q4_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 4; + nth1 = 16; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32]; + } break; + default: + { + fprintf(stderr, "Asserting on type %d\n",(int)src0t); + GGML_ASSERT(false && "not implemented"); + } }; @@ -540,6 +557,9 @@ void ggml_metal_graph_compute( if (src0t == GGML_TYPE_Q4_0) { [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src0t == GGML_TYPE_Q4_K) { + [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; @@ -555,6 +575,7 @@ void ggml_metal_graph_compute( switch (src0->type) { case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; + case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break; default: GGML_ASSERT(false && "not implemented"); } diff --git a/ggml-metal.metal b/ggml-metal.metal index a359bebe2..cbcd59ad4 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -503,3 +503,165 @@ kernel void kernel_cpy_f32_f32( dst_data[i00] = src[0]; } } + +//============================================ k-quants ====================================================== + +#define QK_K 256 + +typedef struct { + half d; // super-block scale for quantized scales + half dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_k; + +static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { + uchar4 r; + if (j < 4) { + r[0] = q[j+0] & 63; r[1] = q[j+4] & 63; + r[2] = q[j+1] & 63; r[3] = q[j+5] & 63; + } else { + r[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + r[1] = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + r[2] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4); + r[3] = (q[j+5] >> 4) | ((q[j+1] >> 6) << 4); + } + return r; +} + +static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const float d = x[i].d; + const float min = x[i].dmin; + + device const uint8_t * q = x[i].qs; + device const uint8_t * scales = x[i].scales; + + int is = 0; + for (int j = 0; j < QK_K; j += 64) { + const uchar4 sc = get_scale_min_k4(is, scales); + const float d1 = d * sc[0]; const float m1 = min * sc[1]; + const float d2 = d * sc[2]; const float m2 = min * sc[3]; + for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1; + for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2; + q += 32; is += 2; + } + + } +} + +kernel void kernel_get_rows_q4_k( + device const void * src0, + device const int * src1, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb1, + uint tpig[[thread_position_in_grid]]) { + const int i = tpig; + const int r = ((device int32_t *) src1)[i]; + + dequantize_row_q4_k( + (device const block_q4_k *) ((device char *) src0 + r*nb01), + (device float *) ((device char *) dst + i*nb1), ne00); +} + +kernel void kernel_mul_mat_q4_k_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + threadgroup float * sum [[threadgroup(0)]], + uint2 tgpig[[threadgroup_position_in_grid]], + uint2 tpig[[thread_position_in_grid]], // we don't use this for now + uint2 tpitg[[thread_position_in_threadgroup]], + uint2 tptg[[threads_per_threadgroup]]) { + + const int nb = ne00/QK_K; + + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + + device const block_q4_k * x = (device const block_q4_k *) src0 + r0*nb; + device const float * yy = (device const float *) src1 + r1*ne10; + + const uint nth = tptg.x*tptg.y; + const uint ith = tptg.y*tpitg.x + tpitg.y; + + const int tid = tpitg.y; // 0...16 + const int il = tid/4; // 0...3 + const int ir = tid%4; // 0...3 + const int n = 8; + const int is = 2*il; + + sum[ith] = 0.0f; + + float sumf = 0; + for (int i = tpitg.x; i < nb; i += tptg.x) { + + device const uint8_t * q = (x + i)->qs + 32*il + n*ir; + device const float * y = yy + i*QK_K + 64*il + n*ir; + device const uint8_t * scales = (x + i)->scales; + + const float dall = (float)((x + i)->d); + const float dmin = (float)((x + i)->dmin); + + const uchar4 sc = get_scale_min_k4(is, scales); + + float4 s = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < n; ++l) { + s[0] += y[l+ 0] * (q[l] & 0xF); s[1] += y[l+ 0]; + s[2] += y[l+32] * (q[l] >> 4); s[3] += y[l+32]; + } + sumf += dall * (s[0] * sc[0] + s[2] * sc[2]) - dmin * (s[1] * sc[1] + s[3] * sc[3]); + + } + sum[ith] = sumf; + + // + // Accumulate the sum from all threads in the threadgroup + // This version is slightly faster than the commented out one below, + // which I copy-pasted from ggerganov's q4_0 dot product for metal. + // + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith%4 == 0) { + for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith%16 == 0) { + for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith == 0) { + for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; + dst[r1*ne0 + r0] = sum[0]; + } + + //// accumulate the sum from all threads in the threadgroup + //threadgroup_barrier(mem_flags::mem_threadgroup); + //for (uint i = nth/2; i > 0; i /= 2) { + // if (ith < i) { + // sum[ith] += sum[ith + i]; + // } + // threadgroup_barrier(mem_flags::mem_threadgroup); + //} + + //if (ith == 0) { + // dst[r1*ne0 + r0] = sum[0]; + //} +} From 53aba3f393f2e02a78ddaba2e934893a8bbf3246 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 8 Jun 2023 10:09:08 +0300 Subject: [PATCH 4/5] clang-tidy : restore dot file from accidental deletion --- .clang-tidy | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .clang-tidy diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..1a42b9abc --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,18 @@ +--- +Checks: > + bugprone-*, + -bugprone-easily-swappable-parameters, + -bugprone-implicit-widening-of-multiplication-result, + -bugprone-narrowing-conversions, + readability-*, + -readability-avoid-unconditional-preprocessor-if, + -readability-function-cognitive-complexity, + -readability-identifier-length, + -readability-implicit-bool-conversion, + -readability-magic-numbers, + -readability-uppercase-literal-suffix, + clang-analyzer-*, + -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, + performance-*, + portability-*, +FormatStyle: none From b50b570ed9d699d3d126d72fc02de92926bcd937 Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Thu, 8 Jun 2023 00:12:28 -0700 Subject: [PATCH 5/5] ggml : fix fprintf warnings (#1720) --- ggml.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/ggml.c b/ggml.c index 34212b841..567dbc1e1 100644 --- a/ggml.c +++ b/ggml.c @@ -14729,12 +14729,12 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-12s %8d %8d %d %d %d %16zu %16zu %16zu %16zu %16p %32s\n", + fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, - (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3], - nb[0], nb[1], nb[2], nb[3], + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], tensor->data, tensor->name); } @@ -14743,13 +14743,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %d %d %d %d %16zu %16zu %16zu %16zu %8d %16p %32s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, - (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3], - nb[0], nb[1], nb[2], nb[3], + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], tensor->n_tasks, tensor->data, tensor->name); @@ -14772,11 +14772,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { FILE * fout = stdout; fprintf(fout, "\n"); - fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); - fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); - fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); - fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); - fprintf(fout, "%-16s %8d\n", "eval", (int) size_eval); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval); // header fprintf(fout, "\n");