From 70d26ac3883009946e737525506fa88f52727132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 23 Jul 2023 17:49:06 +0200 Subject: [PATCH 1/3] Fix __dp4a documentation (#2348) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c9fe6187b..a0e0ea2e0 100644 --- a/README.md +++ b/README.md @@ -401,7 +401,7 @@ Building the program with BLAS support may lead to some performance improvements | Option | Legal values | Default | Description | |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 7.0/Turing/RTX 2000 or higher). Does not affect k-quants. | + | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. | From 4f06592cc6b83979e4b442e8cb97b3948c857188 Mon Sep 17 00:00:00 2001 From: IgnacioFDM Date: Sun, 23 Jul 2023 17:31:17 -0300 Subject: [PATCH 2/3] Add gqa parameter support to the server (#2351) * Add gqa parameter support to the server * Change help from stderr to stdout --- examples/server/server.cpp | 82 ++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f442f2b56..4ad0ba9ec 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -601,47 +601,48 @@ struct llama_server_context static void server_print_usage(const char *argv0, const gpt_params ¶ms, const server_params &sparams) { - fprintf(stderr, "usage: %s [options]\n", argv0); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); - fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); - fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); + fprintf(stdout, "usage: %s [options]\n", argv0); + fprintf(stdout, "\n"); + fprintf(stdout, "options:\n"); + fprintf(stdout, " -h, --help show this help message and exit\n"); + fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); + fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa); + fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); + fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); + fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); if (llama_mlock_supported()) { - fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } if (llama_mmap_supported()) { - fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); - fprintf(stderr, " number of layers to store in VRAM\n"); - fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); - fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); - fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); + fprintf(stdout, " -ngl N, --n-gpu-layers N\n"); + fprintf(stdout, " number of layers to store in VRAM\n"); + fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); + fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); + fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); #endif - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, " -a ALIAS, --alias ALIAS\n"); - fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n"); - fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); - fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); - fprintf(stderr, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); - fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); - fprintf(stderr, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); - fprintf(stderr, "\n"); + fprintf(stdout, " -m FNAME, --model FNAME\n"); + fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); + fprintf(stdout, " -a ALIAS, --alias ALIAS\n"); + fprintf(stdout, " set an alias for the model, will be added as `model` field in completion response\n"); + fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + fprintf(stdout, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); + fprintf(stdout, " --port PORT port to listen (default (default: %d)\n", sparams.port); + fprintf(stdout, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); + fprintf(stdout, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + fprintf(stdout, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); + fprintf(stdout, "\n"); } static void server_params_parse(int argc, char **argv, server_params &sparams, @@ -724,9 +725,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_ctx = std::stoi(argv[i]); } + else if (arg == "-gqa" || arg == "--gqa") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_gqa = std::stoi(argv[i]); + } else if (arg == "--rope-freq-base") { - if (++i >= argc) { + if (++i >= argc) + { invalid_param = true; break; } @@ -734,7 +745,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } else if (arg == "--rope-freq-scale") { - if (++i >= argc) { + if (++i >= argc) + { invalid_param = true; break; } From 2f9cf974a066ac0e03fbb235d834b01b0164d743 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Mon, 24 Jul 2023 00:19:47 +0300 Subject: [PATCH 3/3] Some more Q4_K and Q5_K speedup on CUDA (#2346) * Faster Q5_K on CUDA * Small Q5_K improvement on older GPUs * Spped up Q4_K on CUDA GTX1660: 29.5 ms/t -> 25.6 ms/t RTX4080: 8.40 ms/t -> 8.25 ms/t * Spped up Q4_K on CUDA GTX1660: 36.7 ms/t -> 35.6 ms/t RTX4080: 9.8 ms/t -> 9.5 ms/t * Address PR comments * Add some comments to satisfy PR reviewer --------- Co-authored-by: Iwan Kawrakow --- ggml-cuda.cu | 114 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 84 insertions(+), 30 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 6fb55d838..6823adc6c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1073,10 +1073,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, uint16_t aux[4]; const uint8_t * sc = (const uint8_t *)aux; + uint16_t q16[8]; + const uint8_t * q4 = (const uint8_t *)q16; + for (int i = ix; i < num_blocks_per_row; i += 2) { const uint8_t * ql1 = x[i].qs + q_offset; - const uint8_t * ql2 = ql1 + 64; const uint8_t * qh = x[i].qh + l0; const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; @@ -1092,15 +1094,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, float4 sum = {0.f, 0.f, 0.f, 0.f}; float smin = 0; + const uint16_t * q1 = (const uint16_t *)ql1; + const uint16_t * q2 = q1 + 32; + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[8] & 0x0f0f; + q16[2] = (q1[0] >> 4) & 0x0f0f; + q16[3] = (q1[8] >> 4) & 0x0f0f; + q16[4] = q2[0] & 0x0f0f; + q16[5] = q2[8] & 0x0f0f; + q16[6] = (q2[0] >> 4) & 0x0f0f; + q16[7] = (q2[8] >> 4) & 0x0f0f; for (int l = 0; l < n; ++l) { - sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) - + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0)); - sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) - + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0)); - sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) - + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0)); - sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) - + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0)); + sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) + + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0)); + sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) + + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) + + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0)); + sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) + + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0)); smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; } @@ -1554,7 +1566,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q4_K * bq4_K = (const block_q4_K *) vbq; - const int bq8_offset = QR4_K * (iqs / QI8_1); // 0, 2, 4, 6 + // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * (iqs / (QI8_1/2)); float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -1562,7 +1575,14 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( const float d = bq4_K->d; const float dmin = bq4_K->dmin; - const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]); + // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 + // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 + // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 + // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 + + const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4)); + const int v1 = q4[0]; + const int v2 = q4[4]; const uint16_t * scales = (const uint16_t *)bq4_K->scales; uint16_t aux[2]; @@ -1580,13 +1600,19 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( for (int i = 0; i < QR4_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); const float d8i = bq8i->d; + const int * q8 = (const int *)bq8i->qs + (iqs%4); + const int ui1 = q8[0]; + const int ui2 = q8[4]; - const int vi = (v >> (4*i)) & 0x0F0F0F0F; + const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F; + const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F; - sumf_d += d8i * (__dp4a(vi, ui, 0) * sc[i]); // SIMD dot product - sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m[i]); // multiply constant part of q4_K with sum of q8_1 values + const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); + + sumf_d += d8i * (dot1 * sc[i]); + sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values } return d*sumf_d - dmin*sumf_m; @@ -1601,7 +1627,9 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q5_K * bq5_K = (const block_q5_K *) vbq; - const int bq8_offset = QR5_K * (iqs / QI8_1); + const int bq8_offset = QR5_K * (iqs / (QI8_1/2)); + const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4)); + const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4)); float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -1609,28 +1637,48 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( const float d = bq5_K->d; const float dmin = bq5_K->dmin; - const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]); + const int vl1 = ql[0]; + const int vl2 = ql[4]; - const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset; + const int vh1 = qh[0] >> bq8_offset; + const int vh2 = qh[4] >> bq8_offset; + + const uint16_t * scales = (const uint16_t *)bq5_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; for (int i = 0; i < QR5_K; ++i) { - const int isc = bq8_offset + i; - - uint8_t sc, m; - get_scale_min_k4(isc, bq5_K->scales, sc, m); const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); const float d8i = bq8i->d; + const int * q8 = (const int *)bq8i->qs + (iqs%4); + const int ui1 = q8[0]; + const int ui2 = q8[4]; - const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F; + const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F; - const int vih = ((vh >> i) << 4) & 0x10101010; + const int vih1 = ((vh1 >> i) << 4) & 0x10101010; + const int vih2 = ((vh2 >> i) << 4) & 0x10101010; - const int vi = vil | vih; + const int vi1 = vil1 | vih1; + const int vi2 = vil2 | vih2; + + const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); + + sumf_d += d8i * (dot1 * sc[i]); + sumf_m += d8i * (dot2 * m[i]); - sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product - sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values } return d*sumf_d - dmin*sumf_m; @@ -2306,7 +2354,10 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per + // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales + // is better amortized. + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); } @@ -2315,7 +2366,10 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q + // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per + // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales + // is better amortized. + mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); }