Merge 'origin/master' into hipblas

2023-05-13 13:12:52 +03:00 · 2023-05-13 13:12:52 +03:00 · 2956630a3d
commit 2956630a3d
parent 0fe6384755 f048af0230
12 changed files with 413 additions and 199 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@ -0,0 +1,18 @@
+---
+Checks: >
+    bugprone-*,
+    -bugprone-easily-swappable-parameters,
+    -bugprone-implicit-widening-of-multiplication-result,
+    -bugprone-narrowing-conversions,
+    readability-*,
+    -readability-avoid-unconditional-preprocessor-if,
+    -readability-function-cognitive-complexity,
+    -readability-identifier-length,
+    -readability-implicit-bool-conversion,
+    -readability-magic-numbers,
+    -readability-uppercase-literal-suffix,
+    clang-analyzer-*,
+    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
+    performance-*,
+    portability-*,
+FormatStyle: none
--- a/.github/workflows/tidy-post.yml
+++ b/.github/workflows/tidy-post.yml
@ -0,0 +1,20 @@
+name: clang-tidy review post comments
+
+on:
+  workflow_run:
+    workflows: ["clang-tidy-review"]
+    types:
+      - completed
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: ZedThree/clang-tidy-review/post@v0.13.0
+        # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
+        with:
+          # adjust options as necessary
+          lgtm_comment_body: ''
+          annotations: false
+          max_comments: 25
--- a/.github/workflows/tidy-review.yml
+++ b/.github/workflows/tidy-review.yml
@ -0,0 +1,23 @@
+name: clang-tidy-review
+
+on:
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  clang-tidy-review:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: ZedThree/clang-tidy-review@v0.13.0
+      id: review
+      with:
+        lgtm_comment_body: ''
+        build_dir: build
+        cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
+        split_workflow: true
+
+    - uses: ZedThree/clang-tidy-review/upload@v0.13.0
--- a/.gitignore
+++ b/.gitignore
@ -16,6 +16,7 @@ build-debug/
 build-release/
 build-static/
 build-cublas/
+build-opencl/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
--- a/README.md
+++ b/README.md
@ -9,7 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

- Qauntization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
+- Quantization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
 - [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)

 <details>
@ -333,12 +333,12 @@ Several quantization methods are supported. They differ in the resulting model d

 | Model | Measure      | F16    | Q4_0   | Q4_1   | Q5_0   | Q5_1   | Q8_0   |
 |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
-|    7B | perplexity   | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
+|    7B | perplexity   | 5.9066 | 6.1565 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
 |    7B | file size    |  13.0G |   4.0G |   4.8G |   4.4G |   4.8G |   7.1G |
 |    7B | ms/tok @ 4th |    128 |     50 |     54 |     75 |     83 |     75 |
 |    7B | ms/tok @ 8th |    123 |     44 |     52 |     53 |     58 |     72 |
 |    7B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.5 |    6.0 |    9.0 |
-|   13B | perplexity   | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
+|   13B | perplexity   | 5.2543 | 5.3860 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
 |   13B | file size    |  25.0G |   7.6G |   9.1G |   8.4G |   9.1G |    14G |
 |   13B | ms/tok @ 4th |    239 |     93 |    101 |    150 |    164 |    141 |
 |   13B | ms/tok @ 8th |    240 |     81 |     96 |     96 |    104 |    136 |
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -91,9 +91,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    bool escape_prompt = false;
    std::string arg;
    gpt_params default_params;
+    const std::string arg_prefix = "--";

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }

        if (arg == "-s" || arg == "--seed") {
 #if defined(GGML_USE_CUBLAS)
@ -141,27 +145,27 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
-        } else if (arg == "-n" || arg == "--n_predict") {
+        } else if (arg == "-n" || arg == "--n-predict") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_predict = std::stoi(argv[i]);
-        } else if (arg == "--top_k") {
+        } else if (arg == "--top-k") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.top_k = std::stoi(argv[i]);
-        } else if (arg == "-c" || arg == "--ctx_size") {
+        } else if (arg == "-c" || arg == "--ctx-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "--memory_f32") {
+        } else if (arg == "--memory-f32") {
            params.memory_f16 = false;
-        } else if (arg == "--top_p") {
+        } else if (arg == "--top-p") {
            if (++i >= argc) {
                invalid_param = true;
                break;
@ -185,25 +189,25 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.typical_p = std::stof(argv[i]);
-        } else if (arg == "--repeat_last_n") {
+        } else if (arg == "--repeat-last-n") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.repeat_last_n = std::stoi(argv[i]);
-        } else if (arg == "--repeat_penalty") {
+        } else if (arg == "--repeat-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.repeat_penalty = std::stof(argv[i]);
-        } else if (arg == "--frequency_penalty") {
+        } else if (arg == "--frequency-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.frequency_penalty = std::stof(argv[i]);
-        } else if (arg == "--presence_penalty") {
+        } else if (arg == "--presence-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
@ -215,19 +219,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.mirostat = std::stoi(argv[i]);
-        } else if (arg == "--mirostat_lr") {
+        } else if (arg == "--mirostat-lr") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.mirostat_eta = std::stof(argv[i]);
-        } else if (arg == "--mirostat_ent") {
+        } else if (arg == "--mirostat-ent") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.mirostat_tau = std::stof(argv[i]);
-        } else if (arg == "-b" || arg == "--batch_size") {
+        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
@ -310,7 +314,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-        } else if (arg == "--n_parts") {
+        } else if (arg == "--n-parts") {
            if (++i >= argc) {
                invalid_param = true;
                break;
@ -384,31 +388,31 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
    fprintf(stderr, "                        prompt file to start generation.\n");
-    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
-    fprintf(stderr, "  --top_k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    fprintf(stderr, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
+    fprintf(stderr, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    fprintf(stderr, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
    fprintf(stderr, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
    fprintf(stderr, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stderr, "  --presence_penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    fprintf(stderr, "  --frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stderr, "  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    fprintf(stderr, "  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
    fprintf(stderr, "  --mirostat N          use Mirostat sampling.\n");
    fprintf(stderr, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
    fprintf(stderr, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    fprintf(stderr, "  --mirostat_lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    fprintf(stderr, "  --mirostat_ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    fprintf(stderr, "  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    fprintf(stderr, "  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
    fprintf(stderr, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
    fprintf(stderr, "                        modifies the likelihood of token appearing in the completion,\n");
    fprintf(stderr, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
    fprintf(stderr, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
+    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value\n");
    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
-    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stderr, "  --n-parts N           number of model parts (default: -1 = determine from dimensions)\n");
+    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    if (llama_mlock_supported()) {
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -56,9 +56,6 @@ int main(int argc, char ** argv) {
    // tokenize the prompt
    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
    if (params.verbose_prompt) {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -121,7 +121,7 @@ int main(int argc, char ** argv) {
    // uncomment the "used_mem" line in llama.cpp to see the results
    if (params.mem_test) {
        {
-            const std::vector<llama_token> tmp(params.n_batch, 0);
+            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
        }

--- a/ggml-opencl.c
+++ b/ggml-opencl.c
@ -12,109 +12,129 @@
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
 const char * clblast_dequant = MULTILINE_QUOTE(

+typedef uchar uint8_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+constant uint QK4_0 = 32;
 struct block_q4_0
 {
    float d;
-    uchar qs[16];
+    uint8_t qs[QK4_0 / 2];
 };

-__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*32 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
-
+constant uint QK4_1 = 32;
 struct block_q4_1
 {
    float d;
    float m;
-    uchar qs[16];
+    uint8_t qs[QK4_1 / 2];
 };

-__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-    const float m = blocks[i].m;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*32 + l*2;
-    result[index + 0] = (vi & 0xf) * d + m;
-    result[index + 1] = (vi >> 4) * d + m;
-}
-
-struct block_q5_0
+constant uint QK5_0 = 32;
+struct __attribute__ ((packed)) block_q5_0
 {
-    float d;
-    uint qh;
-    uchar qs[16];
+    half d;
+    uint32_t qh;
+    uint8_t qs[QK5_0 / 2];
 };

-__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint l2 = l * 2;
-
-    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
-    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
-
-    const uint index = i*32 + l2;
-    result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
-    result[index + 1] = (((vi >>  4) | vh1) - 16)*d;
-}
-
+constant uint QK5_1 = 32;
 struct block_q5_1
 {
-    ushort d;
-    ushort m;
-    uint qh;
-    uchar qs[16];
+    half d;
+    half m;
+    uint32_t qh;
+    uint8_t qs[QK5_1 / 2];
 };

-__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = vload_half(0, (__global half*) &blocks[i].d);
-    const float m = vload_half(0, (__global half*) &blocks[i].m);
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint l2 = l * 2;
-
-    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
-    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
-
-    const uint index = i*32 + l2;
-    result[index + 0] = ((vi & 0xf) | vh0)*d + m;
-    result[index + 1] = ((vi >>  4) | vh1)*d + m;
-}
-
+constant uint QK8_0 = 32;
 struct block_q8_0
 {
    float d;
-    char qs[32];
+    uint8_t qs[QK8_0];
 };

-__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);

-    result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
+__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
+    constant uint qk = QK4_0;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+
+    const int x0 = (x[i].qs[j] & 0xf) - 8;
+    const int x1 = (x[i].qs[j] >>  4) - 8;
+
+    y[i*qk + j + 0   ] = x0*d;
+    y[i*qk + j + qk/2] = x1*d;
+}
+
+__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
+    constant uint qk = QK4_1;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+    const float m = x[i].m;
+
+    const int x0 = (x[i].qs[j] & 0xf);
+    const int x1 = (x[i].qs[j] >>  4);
+
+    y[i*qk + j + 0   ] = x0*d + m;
+    y[i*qk + j + qk/2] = x1*d + m;
+}
+
+__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
+    constant uint qk = QK5_0;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = vload_half(0, (__global half*) &x[i].d);
+
+    uint32_t qh = x[i].qh;
+
+    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
+
+    const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
+    const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;
+
+    y[i*qk + j + 0   ] = x0*d;
+    y[i*qk + j + qk/2] = x1*d;
+}
+
+__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
+    constant uint qk = QK5_1;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = vload_half(0, (__global half*) &x[i].d);
+    const float m = vload_half(0, (__global half*) &x[i].m);
+
+    uint32_t qh = x[i].qh;
+
+    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
+
+    const int x0 = (x[i].qs[j] & 0xf) | xh_0;
+    const int x1 = (x[i].qs[j] >>  4) | xh_1;
+
+    y[i*qk + j + 0   ] = x0*d + m;
+    y[i*qk + j + qk/2] = x1*d + m;
+}
+
+__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
+    constant uint qk = QK8_0;
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+    y[i*qk + j] = x[i].qs[j]*d;
 }

 );
@ -128,20 +148,6 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
        }                                                                                       \
    } while (0)

-#define QK5_0 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-
-
-typedef struct {
-    float d;                // delta
-    uint32_t qh;          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} cl_block_q5_0;
-
 static cl_platform_id platform;
 static cl_device_id device;
 static cl_context context;
@ -252,7 +258,6 @@ void ggml_cl_sgemm_wrapper(
    cl_kernel kernel;
    size_t global = n * k, local, size_qb;
    bool dequant;
-    cl_block_q5_0* cl_host_b;

    switch (btype) {
    case GGML_TYPE_F32:
@ -274,18 +279,7 @@ void ggml_cl_sgemm_wrapper(
        dequant = true;
        kernel = kernel_q5_0;
        local = 16;
-        // For some reason OpenCL seems to be incapable of working with structs of size 22.
-        // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
-        // TODO Find the reason, fix and remove workaround.
-        const block_q5_0* b = (const block_q5_0*) host_b;
-        cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
-        for (size_t i = 0; i < global / 32; i++) {
-            cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
-            memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
-            memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
-        }
-        host_b = (const float*) cl_host_b;
-        size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
+        size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
        break;
    case GGML_TYPE_Q5_1:
        dequant = true;
@ -364,7 +358,4 @@ void ggml_cl_sgemm_wrapper(
    clWaitForEvents(1, &ev_c);
    clReleaseEvent(ev_sgemm);
    clReleaseEvent(ev_c);
-    if (btype == GGML_TYPE_Q5_0) {
-        free((void*) cl_host_b);
-    }
 }
--- a/ggml.c
+++ b/ggml.c
@ -472,7 +472,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 // quantization
 //

-#if __AVX__ || __AVX2__ || __AVX512F__
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 // multiply int8_t, add results pairwise twice
 static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
    // Get absolute values of x vectors
@ -485,6 +485,7 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
    return _mm_madd_epi16(ones, dot);
 }

+#if __AVX__ || __AVX2__ || __AVX512F__
 // horizontally add 8 floats
 static inline float hsum_float_8(const __m256 x) {
    __m128 res = _mm256_extractf128_ps(x, 1);
@ -596,7 +597,19 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
    return _mm_packus_epi16( bytes1, bytes2);
 }
 #endif
+#elif defined(__SSSE3__)
+// horizontally add 4x4 floats
+static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
+    __m128 res_0 =_mm_hadd_ps(a, b);
+    __m128 res_1 =_mm_hadd_ps(c, d);
+    __m128 res =_mm_hadd_ps(res_0, res_1);
+    res =_mm_hadd_ps(res, res);
+    res =_mm_hadd_ps(res, res);
+
+    return _mm_cvtss_f32(res);
+}
 #endif // __AVX__ || __AVX2__ || __AVX512F__
+#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

 #if __ARM_NEON

@ -2129,6 +2142,126 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
    }

    *s = hsum_float_8(acc);
+#elif defined(__SSSE3__)
+    // set constants
+    const __m128i lowMask = _mm_set1_epi8(0xF);
+    const __m128i off = _mm_set1_epi8(8);
+
+    // Initialize accumulator with zeros
+    __m128 acc_0 = _mm_setzero_ps();
+    __m128 acc_1 = _mm_setzero_ps();
+    __m128 acc_2 = _mm_setzero_ps();
+    __m128 acc_3 = _mm_setzero_ps();
+
+    // First round without accumulation
+    {
+        _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 0 and 1
+        const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[0].d ), _mm_set1_ps( y[0].d ) );
+
+        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
+
+        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
+        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
+        bx_0 = _mm_sub_epi8(bx_0, off);
+        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
+
+        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
+        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16));
+        bx_1 = _mm_sub_epi8(bx_1, off);
+        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
+
+        _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 2 and 3
+        const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[1].d ), _mm_set1_ps( y[1].d ) );
+
+        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
+
+        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
+        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs);
+        bx_2 = _mm_sub_epi8(bx_2, off);
+        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
+
+        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
+        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16));
+        bx_3 = _mm_sub_epi8(bx_3, off);
+        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
+
+        // Convert int32_t to float
+        __m128 p0 = _mm_cvtepi32_ps(i32_0);
+        __m128 p1 = _mm_cvtepi32_ps(i32_1);
+        __m128 p2 = _mm_cvtepi32_ps(i32_2);
+        __m128 p3 = _mm_cvtepi32_ps(i32_3);
+
+        // Apply the scale
+        acc_0 = _mm_mul_ps( d_0_1, p0 );
+        acc_1 = _mm_mul_ps( d_0_1, p1 );
+        acc_2 = _mm_mul_ps( d_2_3, p2 );
+        acc_3 = _mm_mul_ps( d_2_3, p3 );
+    }
+
+    // Main loop
+    for (int i = 2; i < nb; i+=2) {
+        _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 0 and 1
+        const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[i].d ), _mm_set1_ps( y[i].d ) );
+
+        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
+
+        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
+        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
+        bx_0 = _mm_sub_epi8(bx_0, off);
+        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
+
+        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
+        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
+        bx_1 = _mm_sub_epi8(bx_1, off);
+        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
+
+        _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 2 and 3
+        const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[i + 1].d ), _mm_set1_ps( y[i + 1].d ) );
+
+        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
+
+        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
+        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs);
+        bx_2 = _mm_sub_epi8(bx_2, off);
+        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
+
+        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
+        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16));
+        bx_3 = _mm_sub_epi8(bx_3, off);
+        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
+
+        // Convert int32_t to float
+        __m128 p0 = _mm_cvtepi32_ps(i32_0);
+        __m128 p1 = _mm_cvtepi32_ps(i32_1);
+        __m128 p2 = _mm_cvtepi32_ps(i32_2);
+        __m128 p3 = _mm_cvtepi32_ps(i32_3);
+
+        // Apply the scale
+        __m128 p0_d = _mm_mul_ps( d_0_1, p0 );
+        __m128 p1_d = _mm_mul_ps( d_0_1, p1 );
+        __m128 p2_d = _mm_mul_ps( d_2_3, p2 );
+        __m128 p3_d = _mm_mul_ps( d_2_3, p3 );
+
+        // Acummulate
+        acc_0 = _mm_add_ps(p0_d, acc_0);
+        acc_1 = _mm_add_ps(p1_d, acc_1);
+        acc_2 = _mm_add_ps(p2_d, acc_2);
+        acc_3 = _mm_add_ps(p3_d, acc_3);
+    }
+
+    *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
 #else
    // scalar
    float sumf = 0.0;
@ -8420,7 +8553,7 @@ static void ggml_compute_forward_alibi_f32(
                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
                }

-                pdst[0] = (j+1) * m_k + src[0];
+                pdst[0] = i * m_k + src[0];
            }
        }
    }
@ -8482,7 +8615,7 @@ static void ggml_compute_forward_alibi_f16(
                }

                // we return F32
-                pdst[0] = (j+1) * m_k + GGML_FP16_TO_FP32(src[0]);
+                pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
            }
        }
    }
--- a/llama.cpp
+++ b/llama.cpp
@ -50,49 +50,49 @@ static const size_t MB = 1024*1024;

 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
-    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+    static std::map<e_model, size_t> k_sizes = {
        { MODEL_7B,    512ull * MB },
        { MODEL_13B,   512ull * MB },
        { MODEL_30B,   512ull * MB },
        { MODEL_65B,  1024ull * MB },
    };
-    return _MEM_REQ_SCRATCH0;
+    return k_sizes;
 }

 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
-    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+    static std::map<e_model, size_t> k_sizes = {
        { MODEL_7B,    512ull * MB },
        { MODEL_13B,   512ull * MB },
        { MODEL_30B,   512ull * MB },
        { MODEL_65B,  1024ull * MB },
    };
-    return _MEM_REQ_SCRATCH1;
+    return k_sizes;
 }

 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
-    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+    static std::map<e_model, size_t> k_sizes = {
        { MODEL_7B,   1026ull * MB },
        { MODEL_13B,  1608ull * MB },
        { MODEL_30B,  3124ull * MB },
        { MODEL_65B,  5120ull * MB },
    };
-    return _MEM_REQ_KV_SELF;
+    return k_sizes;
 }

 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
-    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+    static std::map<e_model, size_t> k_sizes = {
        { MODEL_7B,   768ull * MB },
        { MODEL_13B, 1024ull * MB },
        { MODEL_30B, 1280ull * MB },
        { MODEL_65B, 1536ull * MB },
    };
-    return _MEM_REQ_EVAL;
+    return k_sizes;
 }

 // default hparams (LLaMA 7B)
@ -586,12 +586,12 @@ struct llama_model_loader {
    std::unique_ptr<llama_mmap> mapping;

    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
        file_loaders.emplace_back(first_file);
        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
        for (uint32_t i = 1; i < n_parts; i++) {
            std::string fname = fname_base + "." + std::to_string(i);
-            auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
            file_loaders.emplace_back(ith_file);
            if (ith_file->hparams != first_file->hparams) {
                throw format("llama.cpp: hparams inconsistent between files");
@ -638,7 +638,7 @@ struct llama_model_loader {
        }
    }

-    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
        auto it = tensors_map.name_to_idx.find(name);
        if (it == tensors_map.name_to_idx.end()) {
            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@ -667,7 +667,7 @@ struct llama_model_loader {
        return tensor;
    }

-    void done_getting_tensors() {
+    void done_getting_tensors() const {
        if (num_ggml_tensors_created != tensors_map.tensors.size()) {
            throw std::string("llama.cpp: file contained more tensors than expected");
        }
@ -934,7 +934,8 @@ static void llama_model_load_internal(

    auto & ctx = model.ctx;

-    size_t ctx_size, mmapped_size;
+    size_t ctx_size;
+    size_t mmapped_size;
    ml->calc_sizes(&ctx_size, &mmapped_size);
    fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);

@ -1074,7 +1075,7 @@ static bool llama_eval_internal(
    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;

-    auto & kv_self = model.kv_self;
+    const auto & kv_self = model.kv_self;

    LLAMA_ASSERT(!!kv_self.ctx);

@ -1318,7 +1319,7 @@ static bool llama_eval_internal(
    }

    // extract embeddings
-    if (lctx.embedding.size()) {
+    if (!lctx.embedding.empty()) {
        auto & embedding_out = lctx.embedding;

        embedding_out.resize(n_embd);
@ -1369,6 +1370,8 @@ struct llama_sp_symbol {
    size_t n;
 };

+static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
 struct llama_sp_bigram {
    struct comparator {
        bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@ -1401,7 +1404,7 @@ struct llama_tokenizer {
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
-            symbols_.emplace_back(std::move(sym));
+            symbols_.emplace_back(sym);
        }

        // seed the work queue with all possible 2-character tokens.
@ -1492,7 +1495,7 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
    llama_tokenizer tokenizer(vocab);
    std::vector<llama_vocab::id> output;

-    if (text.size() == 0) {
+    if (text.empty()) {
        return output;
    }

@ -1728,7 +1731,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
    const int64_t t_start_sample_us = ggml_time_us();

    for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
        if (token_iter == last_tokens + last_tokens_size) {
            continue;
        }
@ -1872,7 +1875,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
    const int64_t t_start_sample_us = ggml_time_us();

    // Find max element
-    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
        return a.logit < b.logit;
    });

@ -1925,7 +1928,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        nthread = std::thread::hardware_concurrency();
    }

-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                            /*vocab_only*/ false));
    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);

@ -1979,7 +1982,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            } else if (tensor.type == GGML_TYPE_F16) {
                f32_conv_buf.resize(nelements * sizeof(float));
                f32_data = (float *) f32_conv_buf.addr;
-                auto f16_data = (const ggml_fp16_t *) tensor.data;
+                const auto * f16_data = (const ggml_fp16_t *) tensor.data;
                for (size_t i = 0; i < nelements; i++) {
                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
                }
@ -2010,21 +2013,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                        size_t first = counter; counter += chunk_size;
                        if (first >= nelements) {
                            if (!local_hist.empty()) {
-                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                for (int j=0; j<int(local_hist.size()); ++j) {
+                                    hist_cur[j] += local_hist[j];
+                                }
                                new_size += local_size;
                            }
                            break;
                        }
                        lock.unlock();
                        size_t last = std::min(nelements, first + chunk_size);
-                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        if (local_hist.empty()) {
+                            local_hist.resize(hist_cur.size(), 0);
+                        }
                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                    }
                };
-                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
-                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                if ((int) workers.size() < nthread_use - 1) {
+                    workers.resize(nthread_use - 1);
+                }
+                for (int it = 0; it < nthread_use - 1; ++it) {
+                    workers[it] = std::thread(compute);
+                }
                compute();
-                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+                for (int it = 0; it < nthread_use - 1; ++it) {
+                    workers[it].join();
+                }
            }

            printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@ -2222,7 +2235,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));

-        size_t ctx_size, mmapped_size;
+        size_t ctx_size;
+        size_t mmapped_size;
        model_loader->calc_sizes(&ctx_size, &mmapped_size);
        base_buf.resize(ctx_size);

@ -2261,8 +2275,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
        }

-        std::string name(length, 0);
-        fin.read(&name[0], length);
+        std::string name;
+        {
+            char buf[1024];
+            fin.read(buf, length);
+            name = std::string(buf, length);
+        }

        // check for lora suffix and get the type of tensor
        const std::string lora_suffix = ".lora";
@ -2277,7 +2295,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
        base_name.erase(pos);
        // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());

-        if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+        if (model_tensors.find(base_name) == model_tensors.end()) {
            fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
            return 1;
        }
@ -2379,8 +2397,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
            lora_tensors.clear();

            n_tensors++;
-            if (n_tensors % 4 == 0)
+            if (n_tensors % 4 == 0) {
                fprintf(stderr, ".");
+            }
        }
    }

@ -2409,7 +2428,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
    return ctx->model.kv_self.n;
 }

-#define LLAMA_MAX_RNG_STATE 64*1024
+#define LLAMA_MAX_RNG_STATE (64*1024)

 void llama_set_rng_seed(struct llama_context * ctx, int seed) {
    if (seed < 0) {
@ -2450,8 +2469,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 }

 // Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
-    uint8_t * out = dest;
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+    uint8_t * out = dst;

    // copy rng
    {
@ -2511,7 +2530,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

        if (kv_size) {
            const size_t elt_size = ggml_element_size(kv_self.k);
+
            char buffer[4096];
+
            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
            ggml_cgraph gf{};
            gf.n_threads = 1;
@ -2535,10 +2556,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
            ggml_graph_compute(cpy_ctx, &gf);
+
+            ggml_free(cpy_ctx);
        }
    }

-    const size_t written  = out - dest;
+    const size_t written  = out - dst;
    const size_t max_size = llama_get_state_size(ctx);

    LLAMA_ASSERT(written <= max_size);
@ -2548,15 +2571,15 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

 // Sets the state reading from the specified source address
 size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t * in = src;
+    const uint8_t * inp = src;

    // set rng
    {
        size_t rng_size;
        char   rng_buf[LLAMA_MAX_RNG_STATE];

-        memcpy(&rng_size,   in, sizeof(rng_size));    in += sizeof(rng_size);
-        memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+        memcpy(&rng_size,   inp, sizeof(rng_size));    inp += sizeof(rng_size);
+        memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;

        std::stringstream rng_ss;
        rng_ss.str(std::string(&rng_buf[0], rng_size));
@ -2570,30 +2593,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
        size_t logits_cap;
        size_t logits_size;

-        memcpy(&logits_cap,  in, sizeof(logits_cap));  in += sizeof(logits_cap);
-        memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+        memcpy(&logits_cap,  inp, sizeof(logits_cap));  inp += sizeof(logits_cap);
+        memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

        LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);

        if (logits_size) {
            ctx->logits.resize(logits_size);
-            memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
        }

-        in += logits_cap * sizeof(float);
+        inp += logits_cap * sizeof(float);
    }

    // set embeddings
    {
        size_t embedding_size;

-        memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+        memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);

        LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);

        if (embedding_size) {
-            memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
-            in += embedding_size * sizeof(float);
+            memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+            inp += embedding_size * sizeof(float);
        }
    }

@ -2608,25 +2631,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
        size_t kv_size;
        int kv_ntok;

-        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
-        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+        memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+        memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);

        if (kv_size) {
            LLAMA_ASSERT(kv_self.buf.size == kv_size);

            const size_t elt_size = ggml_element_size(kv_self.k);
+
            char buffer[4096];
+
            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
            ggml_cgraph gf{};
            gf.n_threads = 1;

            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
-            kin3d->data = (void *) in;
-            in += ggml_nbytes(kin3d);
+            kin3d->data = (void *) inp;
+            inp += ggml_nbytes(kin3d);

            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
-            vin3d->data = (void *) in;
-            in += ggml_nbytes(vin3d);
+            vin3d->data = (void *) inp;
+            inp += ggml_nbytes(vin3d);

            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
                n_embd, kv_ntok, n_layer,
@ -2639,12 +2664,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
            ggml_graph_compute(cpy_ctx, &gf);
+
+            ggml_free(cpy_ctx);
        }

        ctx->model.kv_self.n = kv_ntok;
    }

-    const size_t nread    = in - src;
+    const size_t nread    = inp - src;
    const size_t max_size = llama_get_state_size(ctx);

    LLAMA_ASSERT(nread <= max_size);
@ -2660,7 +2687,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
        const uint32_t magic   = file.read_u32();
        const uint32_t version = file.read_u32();

-        if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
            return false;
        }
--- a/llama.h
+++ b/llama.h
@ -134,7 +134,7 @@ extern "C" {
    // Copies the state to the specified destination address.
    // Destination needs to have allocated enough memory.
    // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);

    // Set the state reading from the specified address
    // Returns the number of bytes read