From 75ef7619a258a95d26f8d68cbc81216bd4cee419 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 20 May 2024 11:52:10 +0200 Subject: [PATCH] add q4_1 q5_0 q5_1 support --- ggml-cuda/fattn-tile-f16.cu | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ggml-cuda/fattn-tile-f16.cu b/ggml-cuda/fattn-tile-f16.cu index c6925c2a6..90746a711 100644 --- a/ggml-cuda/fattn-tile-f16.cu +++ b/ggml-cuda/fattn-tile-f16.cu @@ -330,6 +330,18 @@ void launch_fattn_tile_f16_V_type(ggml_backend_cuda_context & ctx, ggml_tensor * launch_fattn_tile_f16_64_128< cols_per_block, parallel_blocks, type_k, qkk, qrk, dequantize_k, block_q4_0, QK4_0, QR4_0, dequantize_q4_0>(ctx, dst); break; + case GGML_TYPE_Q4_1: + launch_fattn_tile_f16_64_128< + cols_per_block, parallel_blocks, type_k, qkk, qrk, dequantize_k, block_q4_1, QK4_1, QR4_1, dequantize_q4_1>(ctx, dst); + break; + case GGML_TYPE_Q5_0: + launch_fattn_tile_f16_64_128< + cols_per_block, parallel_blocks, type_k, qkk, qrk, dequantize_k, block_q5_0, QK5_0, QR5_0, dequantize_q5_0>(ctx, dst); + break; + case GGML_TYPE_Q5_1: + launch_fattn_tile_f16_64_128< + cols_per_block, parallel_blocks, type_k, qkk, qrk, dequantize_k, block_q5_1, QK5_1, QR5_1, dequantize_q5_1>(ctx, dst); + break; case GGML_TYPE_Q8_0: launch_fattn_tile_f16_64_128< cols_per_block, parallel_blocks, type_k, qkk, qrk, dequantize_k, block_q8_0, QK8_0, QR8_0, dequantize_q8_0>(ctx, dst); @@ -352,6 +364,15 @@ void launch_fattn_tile_f16_K_type(ggml_backend_cuda_context & ctx, ggml_tensor * case GGML_TYPE_Q4_0: launch_fattn_tile_f16_V_type(ctx, dst); break; + case GGML_TYPE_Q4_1: + launch_fattn_tile_f16_V_type(ctx, dst); + break; + case GGML_TYPE_Q5_0: + launch_fattn_tile_f16_V_type(ctx, dst); + break; + case GGML_TYPE_Q5_1: + launch_fattn_tile_f16_V_type(ctx, dst); + break; case GGML_TYPE_Q8_0: launch_fattn_tile_f16_V_type(ctx, dst); break;