From 553fd4d4b5f3314f338be8006f62a4fdf7670186 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 12 May 2023 15:40:53 +0200 Subject: [PATCH 01/10] Add clang-tidy reviews to CI (#1407) --- .clang-tidy | 18 ++++++++++++++++++ .github/workflows/tidy-post.yml | 20 ++++++++++++++++++++ .github/workflows/tidy-review.yml | 23 +++++++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 .clang-tidy create mode 100644 .github/workflows/tidy-post.yml create mode 100644 .github/workflows/tidy-review.yml diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..1a42b9abc --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,18 @@ +--- +Checks: > + bugprone-*, + -bugprone-easily-swappable-parameters, + -bugprone-implicit-widening-of-multiplication-result, + -bugprone-narrowing-conversions, + readability-*, + -readability-avoid-unconditional-preprocessor-if, + -readability-function-cognitive-complexity, + -readability-identifier-length, + -readability-implicit-bool-conversion, + -readability-magic-numbers, + -readability-uppercase-literal-suffix, + clang-analyzer-*, + -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, + performance-*, + portability-*, +FormatStyle: none diff --git a/.github/workflows/tidy-post.yml b/.github/workflows/tidy-post.yml new file mode 100644 index 000000000..a58da0cd6 --- /dev/null +++ b/.github/workflows/tidy-post.yml @@ -0,0 +1,20 @@ +name: clang-tidy review post comments + +on: + workflow_run: + workflows: ["clang-tidy-review"] + types: + - completed + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: ZedThree/clang-tidy-review/post@v0.13.0 + # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup + with: + # adjust options as necessary + lgtm_comment_body: '' + annotations: false + max_comments: 25 diff --git a/.github/workflows/tidy-review.yml b/.github/workflows/tidy-review.yml new file mode 100644 index 000000000..a4bc8d976 --- /dev/null +++ b/.github/workflows/tidy-review.yml @@ -0,0 +1,23 @@ +name: clang-tidy-review + +on: + pull_request: + branches: + - master + +jobs: + clang-tidy-review: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: ZedThree/clang-tidy-review@v0.13.0 + id: review + with: + lgtm_comment_body: '' + build_dir: build + cmake_command: cmake . 
-B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on + split_workflow: true + + - uses: ZedThree/clang-tidy-review/upload@v0.13.0 From 773ee249fb6c14f791ac39f6ec05536f40dedc54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 12 May 2023 16:34:55 +0200 Subject: [PATCH 02/10] CLI args use - instead of _, backwards compatible (#1416) --- examples/common.cpp | 56 ++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index f3085b08e..80e35d2e9 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -91,9 +91,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { bool escape_prompt = false; std::string arg; gpt_params default_params; + const std::string arg_prefix = "--"; for (int i = 1; i < argc; i++) { arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } if (arg == "-s" || arg == "--seed") { #if defined(GGML_USE_CUBLAS) @@ -141,27 +145,27 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { if (params.prompt.back() == '\n') { params.prompt.pop_back(); } - } else if (arg == "-n" || arg == "--n_predict") { + } else if (arg == "-n" || arg == "--n-predict") { if (++i >= argc) { invalid_param = true; break; } params.n_predict = std::stoi(argv[i]); - } else if (arg == "--top_k") { + } else if (arg == "--top-k") { if (++i >= argc) { invalid_param = true; break; } params.top_k = std::stoi(argv[i]); - } else if (arg == "-c" || arg == "--ctx_size") { + } else if (arg == "-c" || arg == "--ctx-size") { if (++i >= argc) { invalid_param = true; break; } params.n_ctx = std::stoi(argv[i]); - } else if (arg == "--memory_f32") { + } else if (arg == "--memory-f32") { params.memory_f16 = false; - } else if (arg == "--top_p") { + } else if (arg == "--top-p") { if (++i >= argc) { invalid_param = true; break; @@ -185,25 +189,25 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.typical_p = std::stof(argv[i]); - } else if (arg == "--repeat_last_n") { + } else if (arg == "--repeat-last-n") { if (++i >= argc) { invalid_param = true; break; } params.repeat_last_n = std::stoi(argv[i]); - } else if (arg == "--repeat_penalty") { + } else if (arg == "--repeat-penalty") { if (++i >= argc) { invalid_param = true; break; } params.repeat_penalty = std::stof(argv[i]); - } else if (arg == "--frequency_penalty") { + } else if (arg == "--frequency-penalty") { if (++i >= argc) { invalid_param = true; break; } params.frequency_penalty = std::stof(argv[i]); - } else if (arg == "--presence_penalty") { + } else if (arg == "--presence-penalty") { if (++i >= argc) { invalid_param = true; break; @@ -215,19 +219,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.mirostat = std::stoi(argv[i]); - } else if (arg == "--mirostat_lr") { + } else if (arg == "--mirostat-lr") { if (++i >= argc) { invalid_param = true; break; } params.mirostat_eta = std::stof(argv[i]); - } else if (arg == "--mirostat_ent") { + } else if (arg == "--mirostat-ent") { if (++i >= argc) { invalid_param = true; break; } params.mirostat_tau = std::stof(argv[i]); - } else if (arg == "-b" || arg == "--batch_size") { + } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; break; @@ -310,7 +314,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - } else if (arg == 
"--n_parts") { + } else if (arg == "--n-parts") { if (++i >= argc) { invalid_param = true; break; @@ -384,31 +388,31 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); fprintf(stderr, " -f FNAME, --file FNAME\n"); fprintf(stderr, " prompt file to start generation.\n"); - fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); - fprintf(stderr, " --top_k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); - fprintf(stderr, " --top_p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); + fprintf(stderr, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); + fprintf(stderr, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); + fprintf(stderr, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); fprintf(stderr, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); fprintf(stderr, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); - fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); - fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); - fprintf(stderr, " --presence_penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); - fprintf(stderr, " --frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); + fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); + fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); + fprintf(stderr, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); + fprintf(stderr, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); fprintf(stderr, " --mirostat N use Mirostat sampling.\n"); fprintf(stderr, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); fprintf(stderr, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); - fprintf(stderr, " --mirostat_lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); - fprintf(stderr, " --mirostat_ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); + fprintf(stderr, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); + fprintf(stderr, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); fprintf(stderr, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n"); fprintf(stderr, " i.e. 
`--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); - fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); fprintf(stderr, " --no-penalize-nl do not penalize newline token\n"); - fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n"); + fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value\n"); fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp); - fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n"); - fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stderr, " --n-parts N number of model parts (default: -1 = determine from dimensions)\n"); + fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); if (llama_mlock_supported()) { From fb62f924336c9746da9976c6ab3c2e6460258d54 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 May 2023 21:44:20 +0300 Subject: [PATCH 03/10] llama : fix --mtest option (close #1414) --- examples/main/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index bd1c4ab55..8543414dd 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -121,7 +121,7 @@ int main(int argc, char ** argv) { // uncomment the "used_mem" line in llama.cpp to see the results if (params.mem_test) { { - const std::vector tmp(params.n_batch, 0); + const std::vector tmp(params.n_batch, llama_token_bos()); llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); } From 699b1ad7fe6f7b9e41d3cb41e61a8cc3ea5fc6b5 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 13 May 2023 09:01:15 +0300 Subject: [PATCH 04/10] opencl : fix kernels for the new formats (#1422) * Fix OpenCL kernels for the new formats * Fix Q5_0 alignment issues. --- ggml-opencl.c | 209 ++++++++++++++++++++++++-------------------------- 1 file changed, 100 insertions(+), 109 deletions(-) diff --git a/ggml-opencl.c b/ggml-opencl.c index 0e6e6770f..31ab13b25 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -12,109 +12,129 @@ #define MULTILINE_QUOTE(...) 
#__VA_ARGS__ const char * clblast_dequant = MULTILINE_QUOTE( +typedef uchar uint8_t; +typedef int int32_t; +typedef uint uint32_t; + +constant uint QK4_0 = 32; struct block_q4_0 { float d; - uchar qs[16]; + uint8_t qs[QK4_0 / 2]; }; -__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*32 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - +constant uint QK4_1 = 32; struct block_q4_1 { float d; float m; - uchar qs[16]; + uint8_t qs[QK4_1 / 2]; }; -__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - const float m = blocks[i].m; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*32 + l*2; - result[index + 0] = (vi & 0xf) * d + m; - result[index + 1] = (vi >> 4) * d + m; -} - -struct block_q5_0 +constant uint QK5_0 = 32; +struct __attribute__ ((packed)) block_q5_0 { - float d; - uint qh; - uchar qs[16]; + half d; + uint32_t qh; + uint8_t qs[QK5_0 / 2]; }; -__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - - const uchar vi = blocks[i].qs[l]; - - const uint l2 = l * 2; - - const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; - const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; - - const uint index = i*32 + l2; - result[index + 0] = (((vi & 0xf) | vh0) - 16)*d; - result[index + 1] = (((vi >> 4) | vh1) - 16)*d; -} - +constant uint QK5_1 = 32; struct block_q5_1 { - ushort d; - ushort m; - uint qh; - uchar qs[16]; + half d; + half m; + uint32_t qh; + uint8_t qs[QK5_1 / 2]; }; -__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = vload_half(0, (__global half*) &blocks[i].d); - const float m = vload_half(0, (__global half*) &blocks[i].m); - - const uchar vi = blocks[i].qs[l]; - - const uint l2 = l * 2; - - const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; - const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; - - const uint index = i*32 + l2; - result[index + 0] = ((vi & 0xf) | vh0)*d + m; - result[index + 1] = ((vi >> 4) | vh1)*d + m; -} - +constant uint QK8_0 = 32; struct block_q8_0 { float d; - char qs[32]; + uint8_t qs[QK8_0]; }; -__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - result[i*32 + l] = blocks[i].qs[l] * blocks[i].d; +__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { + constant uint qk = QK4_0; + + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = x[i].d; + + const int x0 = (x[i].qs[j] & 0xf) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; +} + +__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { + constant uint qk = QK4_1; + + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = x[i].d; 
+ const float m = x[i].m; + + const int x0 = (x[i].qs[j] & 0xf); + const int x1 = (x[i].qs[j] >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; +} + +__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { + constant uint qk = QK5_0; + + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = vload_half(0, (__global half*) &x[i].d); + + uint32_t qh = x[i].qh; + + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; +} + +__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { + constant uint qk = QK5_1; + + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = vload_half(0, (__global half*) &x[i].d); + const float m = vload_half(0, (__global half*) &x[i].m); + + uint32_t qh = x[i].qh; + + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int x0 = (x[i].qs[j] & 0xf) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; +} + +__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { + constant uint qk = QK8_0; + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = x[i].d; + y[i*qk + j] = x[i].qs[j]*d; } ); @@ -128,20 +148,6 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f } \ } while (0) -#define QK5_0 32 -typedef struct { - ggml_fp16_t d; // delta - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_0 / 2]; // nibbles / quants -} block_q5_0; - - -typedef struct { - float d; // delta - uint32_t qh; // 5-th bit of quants - uint8_t qs[QK5_0 / 2]; // nibbles / quants -} cl_block_q5_0; - static cl_platform_id platform; static cl_device_id device; static cl_context context; @@ -252,7 +258,6 @@ void ggml_cl_sgemm_wrapper( cl_kernel kernel; size_t global = n * k, local, size_qb; bool dequant; - cl_block_q5_0* cl_host_b; switch (btype) { case GGML_TYPE_F32: @@ -274,18 +279,7 @@ void ggml_cl_sgemm_wrapper( dequant = true; kernel = kernel_q5_0; local = 16; - // For some reason OpenCL seems to be incapable of working with structs of size 22. - // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU... - // TODO Find the reason, fix and remove workaround. 
- const block_q5_0* b = (const block_q5_0*) host_b; - cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32); - for (size_t i = 0; i < global / 32; i++) { - cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d); - memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t)); - memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2); - } - host_b = (const float*) cl_host_b; - size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32; + size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32; break; case GGML_TYPE_Q5_1: dequant = true; @@ -364,7 +358,4 @@ void ggml_cl_sgemm_wrapper( clWaitForEvents(1, &ev_c); clReleaseEvent(ev_sgemm); clReleaseEvent(ev_c); - if (btype == GGML_TYPE_Q5_0) { - free((void*) cl_host_b); - } } From 738ace394a6f8cf0174e90a97185d9e512c0e200 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 13 May 2023 09:08:52 +0300 Subject: [PATCH 05/10] llama : free ggml context in set / copy state data (close #1425) --- llama.cpp | 48 ++++++++++++++++++++++++++++-------------------- llama.h | 2 +- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0a47faa9d..f52671b67 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2450,8 +2450,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) { } // Copies the state to the specified destination address -size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) { - uint8_t * out = dest; +size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { + uint8_t * out = dst; // copy rng { @@ -2511,7 +2511,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) { if (kv_size) { const size_t elt_size = ggml_element_size(kv_self.k); + char buffer[4096]; + ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true }); ggml_cgraph gf{}; gf.n_threads = 1; @@ -2535,10 +2537,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); ggml_graph_compute(cpy_ctx, &gf); + + ggml_free(cpy_ctx); } } - const size_t written = out - dest; + const size_t written = out - dst; const size_t max_size = llama_get_state_size(ctx); LLAMA_ASSERT(written <= max_size); @@ -2548,15 +2552,15 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) { // Sets the state reading from the specified source address size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { - const uint8_t * in = src; + const uint8_t * inp = src; // set rng { size_t rng_size; char rng_buf[LLAMA_MAX_RNG_STATE]; - memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size); - memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE; + memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size); + memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE; std::stringstream rng_ss; rng_ss.str(std::string(&rng_buf[0], rng_size)); @@ -2570,30 +2574,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { size_t logits_cap; size_t logits_size; - memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap); - memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size); + memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap); + memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); LLAMA_ASSERT(ctx->logits.capacity() == logits_cap); if (logits_size) { 
ctx->logits.resize(logits_size); - memcpy(ctx->logits.data(), in, logits_size * sizeof(float)); + memcpy(ctx->logits.data(), inp, logits_size * sizeof(float)); } - in += logits_cap * sizeof(float); + inp += logits_cap * sizeof(float); } // set embeddings { size_t embedding_size; - memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size); + memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size); LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size); if (embedding_size) { - memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float)); - in += embedding_size * sizeof(float); + memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float)); + inp += embedding_size * sizeof(float); } } @@ -2608,25 +2612,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { size_t kv_size; int kv_ntok; - memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size); - memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok); + memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); + memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok); if (kv_size) { LLAMA_ASSERT(kv_self.buf.size == kv_size); const size_t elt_size = ggml_element_size(kv_self.k); + char buffer[4096]; + ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true }); ggml_cgraph gf{}; gf.n_threads = 1; ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); - kin3d->data = (void *) in; - in += ggml_nbytes(kin3d); + kin3d->data = (void *) inp; + inp += ggml_nbytes(kin3d); ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); - vin3d->data = (void *) in; - in += ggml_nbytes(vin3d); + vin3d->data = (void *) inp; + inp += ggml_nbytes(vin3d); ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, n_embd, kv_ntok, n_layer, @@ -2639,12 +2645,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); ggml_graph_compute(cpy_ctx, &gf); + + ggml_free(cpy_ctx); } ctx->model.kv_self.n = kv_ntok; } - const size_t nread = in - src; + const size_t nread = inp - src; const size_t max_size = llama_get_state_size(ctx); LLAMA_ASSERT(nread <= max_size); diff --git a/llama.h b/llama.h index 1a65cd589..ca05645b9 100644 --- a/llama.h +++ b/llama.h @@ -134,7 +134,7 @@ extern "C" { // Copies the state to the specified destination address. // Destination needs to have allocated enough memory. 
// Returns the number of bytes copied - LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest); + LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); // Set the state reading from the specified address // Returns the number of bytes read From cdd5350892b1d4e521e930c77341f858fcfcd433 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 13 May 2023 09:12:44 +0300 Subject: [PATCH 06/10] readme : update Q4_0 perplexities I think these were affected by the removal of the `round` during quantization --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index eef95a99c..1d84a5e6d 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** -- Qauntization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405) +- Quantization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405) - [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
@@ -333,12 +333,12 @@ Several quantization methods are supported. They differ in the resulting model d | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:| -| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 | +| 7B | perplexity | 5.9066 | 6.1565 | 6.0910 | 5.9862 | 5.9481 | 5.9069 | | 7B | file size | 13.0G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G | | 7B | ms/tok @ 4th | 128 | 50 | 54 | 75 | 83 | 75 | | 7B | ms/tok @ 8th | 123 | 44 | 52 | 53 | 58 | 72 | | 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 | -| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 | +| 13B | perplexity | 5.2543 | 5.3860 | 5.3607 | 5.2856 | 5.2706 | 5.2548 | | 13B | file size | 25.0G | 7.6G | 9.1G | 8.4G | 9.1G | 14G | | 13B | ms/tok @ 4th | 239 | 93 | 101 | 150 | 164 | 141 | | 13B | ms/tok @ 8th | 240 | 81 | 96 | 96 | 104 | 136 | From 6456a4eb9f9c1f4de8f6908ef100daa78802ad00 Mon Sep 17 00:00:00 2001 From: Rinne Date: Sat, 13 May 2023 15:24:20 +0800 Subject: [PATCH 07/10] embedding : remove unused code (#1426) --- examples/embedding/embedding.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index e4b729128..bb3fd50a9 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -56,9 +56,6 @@ int main(int argc, char ** argv) { // tokenize the prompt auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); - // determine newline token - auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); - if (params.verbose_prompt) { fprintf(stderr, "\n"); fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); From 0cd22e190aeaef867fa5db025b4d274f2fcfdcf6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 13 May 2023 11:23:15 +0300 Subject: [PATCH 08/10] llama : fix various warnings --- .gitignore | 1 + llama.cpp | 85 +++++++++++++++++++++++++++++++++--------------------- 2 files changed, 53 insertions(+), 33 deletions(-) diff --git a/.gitignore b/.gitignore index f5023e304..d231f3ff8 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ build-debug/ build-release/ build-static/ build-cublas/ +build-opencl/ build-no-accel/ build-sanitize-addr/ build-sanitize-thread/ diff --git a/llama.cpp b/llama.cpp index f52671b67..e564de7c8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -50,49 +50,49 @@ static const size_t MB = 1024*1024; static const std::map & MEM_REQ_SCRATCH0() { - static std::map _MEM_REQ_SCRATCH0 = { + static std::map k_sizes = { { MODEL_7B, 512ull * MB }, { MODEL_13B, 512ull * MB }, { MODEL_30B, 512ull * MB }, { MODEL_65B, 1024ull * MB }, }; - return _MEM_REQ_SCRATCH0; + return k_sizes; } static const std::map & MEM_REQ_SCRATCH1() { - static std::map _MEM_REQ_SCRATCH1 = { + static std::map k_sizes = { { MODEL_7B, 512ull * MB }, { MODEL_13B, 512ull * MB }, { MODEL_30B, 512ull * MB }, { MODEL_65B, 1024ull * MB }, }; - return _MEM_REQ_SCRATCH1; + return k_sizes; } // 2*n_embd*n_ctx*n_layer*sizeof(float16) static const std::map & MEM_REQ_KV_SELF() { - static std::map _MEM_REQ_KV_SELF = { + static std::map k_sizes = { { MODEL_7B, 1026ull * MB }, { MODEL_13B, 1608ull * MB }, { MODEL_30B, 3124ull * MB }, { MODEL_65B, 5120ull * MB }, }; - return _MEM_REQ_KV_SELF; + return k_sizes; } // this is mostly needed for temporary mul_mat buffers to dequantize the data // not actually needed if BLAS is disabled static const std::map & MEM_REQ_EVAL() { - static std::map 
_MEM_REQ_EVAL = { + static std::map k_sizes = { { MODEL_7B, 768ull * MB }, { MODEL_13B, 1024ull * MB }, { MODEL_30B, 1280ull * MB }, { MODEL_65B, 1536ull * MB }, }; - return _MEM_REQ_EVAL; + return k_sizes; } // default hparams (LLaMA 7B) @@ -586,12 +586,12 @@ struct llama_model_loader { std::unique_ptr mapping; llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) { - auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map); + auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map); file_loaders.emplace_back(first_file); uint32_t n_parts = vocab_only ? 1 : guess_n_parts(); for (uint32_t i = 1; i < n_parts; i++) { std::string fname = fname_base + "." + std::to_string(i); - auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map); + auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map); file_loaders.emplace_back(ith_file); if (ith_file->hparams != first_file->hparams) { throw format("llama.cpp: hparams inconsistent between files"); @@ -638,7 +638,7 @@ struct llama_model_loader { } } - struct ggml_tensor * get_tensor(const std::string & name, std::vector ne) { + struct ggml_tensor * get_tensor(const std::string & name, const std::vector & ne) { auto it = tensors_map.name_to_idx.find(name); if (it == tensors_map.name_to_idx.end()) { throw format("llama.cpp: tensor '%s' is missing from model", name.c_str()); @@ -667,7 +667,7 @@ struct llama_model_loader { return tensor; } - void done_getting_tensors() { + void done_getting_tensors() const { if (num_ggml_tensors_created != tensors_map.tensors.size()) { throw std::string("llama.cpp: file contained more tensors than expected"); } @@ -934,7 +934,8 @@ static void llama_model_load_internal( auto & ctx = model.ctx; - size_t ctx_size, mmapped_size; + size_t ctx_size; + size_t mmapped_size; ml->calc_sizes(&ctx_size, &mmapped_size); fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0); @@ -1074,7 +1075,7 @@ static bool llama_eval_internal( const auto & model = lctx.model; const auto & hparams = model.hparams; - auto & kv_self = model.kv_self; + const auto & kv_self = model.kv_self; LLAMA_ASSERT(!!kv_self.ctx); @@ -1318,7 +1319,7 @@ static bool llama_eval_internal( } // extract embeddings - if (lctx.embedding.size()) { + if (!lctx.embedding.empty()) { auto & embedding_out = lctx.embedding; embedding_out.resize(n_embd); @@ -1369,6 +1370,8 @@ struct llama_sp_symbol { size_t n; }; +static_assert(std::is_trivially_copyable::value, "llama_sp_symbol is not trivially copyable"); + struct llama_sp_bigram { struct comparator { bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) { @@ -1401,7 +1404,7 @@ struct llama_tokenizer { sym.prev = index - 1; sym.next = offs == text.size() ? -1 : index + 1; index++; - symbols_.emplace_back(std::move(sym)); + symbols_.emplace_back(sym); } // seed the work queue with all possible 2-character tokens. 
@@ -1492,7 +1495,7 @@ static std::vector llama_tokenize(const llama_vocab & vocab, co llama_tokenizer tokenizer(vocab); std::vector output; - if (text.size() == 0) { + if (text.empty()) { return output; } @@ -1728,7 +1731,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat const int64_t t_start_sample_us = ggml_time_us(); for (size_t i = 0; i < candidates->size; ++i) { - auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id); + const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id); if (token_iter == last_tokens + last_tokens_size) { continue; } @@ -1872,7 +1875,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da const int64_t t_start_sample_us = ggml_time_us(); // Find max element - auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { + auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { return a.logit < b.logit; }); @@ -1925,7 +1928,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s nthread = std::thread::hardware_concurrency(); } - std::unique_ptr model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false, + std::unique_ptr model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false, /*vocab_only*/ false)); llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype); @@ -1979,7 +1982,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else if (tensor.type == GGML_TYPE_F16) { f32_conv_buf.resize(nelements * sizeof(float)); f32_data = (float *) f32_conv_buf.addr; - auto f16_data = (const ggml_fp16_t *) tensor.data; + const auto * f16_data = (const ggml_fp16_t *) tensor.data; for (size_t i = 0; i < nelements; i++) { f32_data[i] = ggml_fp16_to_fp32(f16_data[i]); } @@ -2010,21 +2013,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s size_t first = counter; counter += chunk_size; if (first >= nelements) { if (!local_hist.empty()) { - for (int j=0; j %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); @@ -2222,7 +2235,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model); model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false)); - size_t ctx_size, mmapped_size; + size_t ctx_size; + size_t mmapped_size; model_loader->calc_sizes(&ctx_size, &mmapped_size); base_buf.resize(ctx_size); @@ -2261,8 +2275,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); } - std::string name(length, 0); - fin.read(&name[0], length); + std::string name; + { + char buf[1024]; + fin.read(buf, length); + name = std::string(buf, length); + } // check for lora suffix and get the type of tensor const std::string lora_suffix = ".lora"; @@ -2277,7 +2295,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * base_name.erase(pos); // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); - if (model_tensors.find(base_name.data()) == model_tensors.end()) { + if 
(model_tensors.find(base_name) == model_tensors.end()) { fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); return 1; } @@ -2379,8 +2397,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * lora_tensors.clear(); n_tensors++; - if (n_tensors % 4 == 0) + if (n_tensors % 4 == 0) { fprintf(stderr, "."); + } } } @@ -2409,7 +2428,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) { return ctx->model.kv_self.n; } -#define LLAMA_MAX_RNG_STATE 64*1024 +#define LLAMA_MAX_RNG_STATE (64*1024) void llama_set_rng_seed(struct llama_context * ctx, int seed) { if (seed < 0) { @@ -2668,7 +2687,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi const uint32_t magic = file.read_u32(); const uint32_t version = file.read_u32(); - if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) { + if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); return false; } From ac0cd259d54be7e45ffa2c7b2508812cf4f83ccf Mon Sep 17 00:00:00 2001 From: 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com> Date: Sat, 13 May 2023 10:43:33 +0200 Subject: [PATCH 09/10] Adding SSE instructions to ggml_vec_dot_q4_0_q8_0 (#1413) --- ggml.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 134 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 096ccacfb..4eccd417a 100644 --- a/ggml.c +++ b/ggml.c @@ -472,7 +472,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // quantization // -#if __AVX__ || __AVX2__ || __AVX512F__ +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) // multiply int8_t, add results pairwise twice static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { // Get absolute values of x vectors @@ -485,6 +485,7 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { return _mm_madd_epi16(ones, dot); } +#if __AVX__ || __AVX2__ || __AVX512F__ // horizontally add 8 floats static inline float hsum_float_8(const __m256 x) { __m128 res = _mm256_extractf128_ps(x, 1); @@ -596,7 +597,19 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) return _mm_packus_epi16( bytes1, bytes2); } #endif +#elif defined(__SSSE3__) +// horizontally add 4x4 floats +static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { + __m128 res_0 =_mm_hadd_ps(a, b); + __m128 res_1 =_mm_hadd_ps(c, d); + __m128 res =_mm_hadd_ps(res_0, res_1); + res =_mm_hadd_ps(res, res); + res =_mm_hadd_ps(res, res); + + return _mm_cvtss_f32(res); +} #endif // __AVX__ || __AVX2__ || __AVX512F__ +#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) #if __ARM_NEON @@ -2129,6 +2142,126 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * } *s = hsum_float_8(acc); +#elif defined(__SSSE3__) + // set constants + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + // Initialize accumulator with zeros + __m128 acc_0 = _mm_setzero_ps(); + __m128 acc_1 = _mm_setzero_ps(); + __m128 acc_2 = _mm_setzero_ps(); + __m128 acc_3 = _mm_setzero_ps(); + + // First round without accumulation + { + _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute 
combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[0].d ), _mm_set1_ps( y[0].d ) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[1].d ), _mm_set1_ps( y[1].d ) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + acc_0 = _mm_mul_ps( d_0_1, p0 ); + acc_1 = _mm_mul_ps( d_0_1, p1 ); + acc_2 = _mm_mul_ps( d_2_3, p2 ); + acc_3 = _mm_mul_ps( d_2_3, p3 ); + } + + // Main loop + for (int i = 2; i < nb; i+=2) { + _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[i].d ), _mm_set1_ps( y[i].d ) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[i + 1].d ), _mm_set1_ps( y[i + 1].d ) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); + __m128 p1_d = 
_mm_mul_ps( d_0_1, p1 ); + __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); + __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); + + // Acummulate + acc_0 = _mm_add_ps(p0_d, acc_0); + acc_1 = _mm_add_ps(p1_d, acc_1); + acc_2 = _mm_add_ps(p2_d, acc_2); + acc_3 = _mm_add_ps(p3_d, acc_3); + } + + *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); #else // scalar float sumf = 0.0; From f048af0230ad5381bb20b9e3c1467ec6fc7debdf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 13 May 2023 11:54:33 +0300 Subject: [PATCH 10/10] ggml : sync alibi fix from ggml repo --- ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 4eccd417a..b42ca03fd 100644 --- a/ggml.c +++ b/ggml.c @@ -8553,7 +8553,7 @@ static void ggml_compute_forward_alibi_f32( m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); } - pdst[0] = (j+1) * m_k + src[0]; + pdst[0] = i * m_k + src[0]; } } } @@ -8615,7 +8615,7 @@ static void ggml_compute_forward_alibi_f16( } // we return F32 - pdst[0] = (j+1) * m_k + GGML_FP16_TO_FP32(src[0]); + pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]); } } }
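
---
Editor's note (not part of the patches above): a minimal, hypothetical sketch of the behavioral change in PATCH 10. After the fix, the ALiBi bias added to each attention score scales with the column index i (the key position) times the per-head slope m_k, instead of the row index (j+1). Function and variable names below are illustrative only, not ggml API.

/* sketch: apply the corrected ALiBi bias to one row of attention scores */
#include <stdio.h>

static void alibi_row(const float * src, float * dst, int n_cols, float m_k) {
    for (int i = 0; i < n_cols; i++) {
        dst[i] = src[i] + i * m_k;   /* was (j+1) * m_k before the fix */
    }
}

int main(void) {
    const float scores[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    float biased[4];
    alibi_row(scores, biased, 4, 0.25f);
    for (int i = 0; i < 4; i++) {
        printf("%.3f\n", biased[i]);
    }
    return 0;
}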