From a17a2683d8fdb899ba497d0c28ccafb28c62efb6 Mon Sep 17 00:00:00 2001 From: tslmy Date: Thu, 6 Jul 2023 09:17:50 -0700 Subject: [PATCH 01/24] alpaca.sh : update model file name (#2074) The original file name, `ggml-alpaca-7b-q4.bin`, implied the first-generation GGML. After the breaking changes (mentioned in https://github.com/ggerganov/llama.cpp/issues/382), `llama.cpp` now requires GGML V3. Those model files are named `*ggmlv3*.bin`. We should change the example to a model file that actually works, so that it is more likely to run out of the box for more people, and fewer people waste time downloading the old Alpaca model. --- examples/alpaca.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/alpaca.sh b/examples/alpaca.sh index aef207f36..8d2bae691 100755 --- a/examples/alpaca.sh +++ b/examples/alpaca.sh @@ -7,7 +7,7 @@ cd `dirname $0` cd .. -./main -m ./models/ggml-alpaca-7b-q4.bin \ +./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \ --color \ -f ./prompts/alpaca.txt \ --ctx_size 2048 \ From 36680f6e40e4440c3ec3385d0b7e5ca8bb6c37f7 Mon Sep 17 00:00:00 2001 From: Judd Date: Fri, 7 Jul 2023 00:23:49 +0800 Subject: [PATCH 02/24] convert : update for baichuan (#2081) 1. guess n_layers; 2. relax warnings on context size; 3. add a note that its derivations are also supported. Co-authored-by: Judd --- README.md | 2 +- convert.py | 6 ++++++ examples/embedding/embedding.cpp | 2 +- examples/main/main.cpp | 2 +- examples/perplexity/perplexity.cpp | 2 +- examples/server/README.md | 2 +- 6 files changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 32f17c2d1..863aef123 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ as the main playground for developing new features for the [ggml](https://github - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy) - [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b) - [X] [WizardLM](https://github.com/nlpxucan/WizardLM) -- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) +- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft)) **Bindings:** diff --git a/convert.py b/convert.py index 142692776..66509b99c 100644 --- a/convert.py +++ b/convert.py @@ -154,9 +154,15 @@ class Params: # try transformer naming first if "model.layers.0.self_attn.q_proj.weight" in model: n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming + n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) else: n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) + if n_layer < 1: + raise Exception("failed to guess 'n_layer'. 
This model is unknown or unsupported.\n" + "Suggestion: provide 'config.json' of the model in the same directory containing model files.") + n_head=n_embd // 128 # guessed return Params( diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 2b7eb39c5..03e801c2a 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -18,7 +18,7 @@ int main(int argc, char ** argv) { params.embedding = true; if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 3a171925b..0f6391acb 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -85,7 +85,7 @@ int main(int argc, char ** argv) { } if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); } else if (params.n_ctx < 8) { fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index dd54ed3c4..fd4b03cb2 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -130,7 +130,7 @@ int main(int argc, char ** argv) { params.n_batch = std::min(params.n_batch, params.n_ctx); if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); } diff --git a/examples/server/README.md b/examples/server/README.md index c5139c16b..ad9b6bb08 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -7,7 +7,7 @@ Command line options: - `--threads N`, `-t N`: Set the number of threads to use during computation. - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). - `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. -- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ for other models; for example, Baichuan models were built with a context of 4096. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. 
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. From dfd9fce6d65599bf33df43e616e85aa639bdae4c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 19:41:31 +0300 Subject: [PATCH 03/24] ggml : fix restrict usage --- ggml.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ggml.h b/ggml.h index 24ca8ae22..d0710c555 100644 --- a/ggml.h +++ b/ggml.h @@ -1514,9 +1514,15 @@ extern "C" { // Internal types and functions exposed for tests and benchmarks // - typedef void (*ggml_to_float_t)(const void * x, float * y, int k); - typedef void (*ggml_from_float_t)(const float * x, void * y, int k); - typedef void (*ggml_vec_dot_t)(const int n, float * s, const void * x, const void * y); +#ifdef __cplusplus +// restrict not standard in C++ +#define GGML_RESTRICT +#else +#define GGML_RESTRICT restrict +#endif + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); + typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); typedef struct { ggml_to_float_t to_float; From 481f793acc3882a09d45d8d2c3076ad3d1c60cfc Mon Sep 17 00:00:00 2001 From: Howard Su Date: Fri, 7 Jul 2023 11:34:18 +0800 Subject: [PATCH 04/24] Fix opencl by wrap #if-else-endif with \n (#2086) --- ggml-opencl.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index fa0bdbefb..eb214a836 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -653,13 +653,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
const int in = tid - step*im; // 0...15 or 0...7 -#if K_QUANTS_PER_ITERATION == 1 +\n#if K_QUANTS_PER_ITERATION == 1\n const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 const int is = 0; -#else + +\n#else\n + const int l0 = 4 * in; // 0, 4, 8, ..., 28 const int is = in / 4; -#endif + +\n#endif\n + const int ql_offset = 64*im + l0; const int qh_offset = 32*im + l0; const int s_offset = 8*im + is; @@ -676,7 +680,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, const float d = vload_half(0, &x[i].d); -#if K_QUANTS_PER_ITERATION == 1 +\n#if K_QUANTS_PER_ITERATION == 1\n float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) @@ -686,7 +690,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); tmp[16 * ix + tid] += sum; -#else +\n#else\n float sum = 0; for (int l = 0; l < 4; ++l) { sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) @@ -695,7 +699,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); } tmp[16 * ix + tid] += sum; -#endif +\n#endif\n } From 3e08ae99ceb143d67f9273fda47541e9d98ff23f Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 7 Jul 2023 16:12:49 +0300 Subject: [PATCH 05/24] convert.py: add mapping for safetensors bf16 (#1598) Fixes #1473 --- convert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert.py b/convert.py index 66509b99c..7a2705e5c 100644 --- a/convert.py +++ b/convert.py @@ -828,6 +828,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: SAFETENSORS_DATA_TYPES: Dict[str, DataType] = { + 'BF16': DT_BF16, 'F16': DT_F16, 'F32': DT_F32, 'I32': DT_I32, From 72421402834141df6cbdcf595fe46dbd11874dce Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 7 Jul 2023 18:36:37 +0300 Subject: [PATCH 06/24] ggml : remove sched_yield() call in ggml_graph_compute_thread() (#2134) --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index d257c3d65..4ba7ac931 100644 --- a/ggml.c +++ b/ggml.c @@ -16042,7 +16042,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // wait for other threads to finish const int last = node_n; do { - sched_yield(); + //sched_yield(); node_n = atomic_load(&state->shared->node_n); } while (node_n == last); } From 1d656d6360359cfdaaf5d64ed9690047b600dbcb Mon Sep 17 00:00:00 2001 From: Qingyou Meng Date: Sat, 8 Jul 2023 00:24:01 +0800 Subject: [PATCH 07/24] ggml : change ggml_graph_compute() API to not require context (#1999) * ggml_graph_compute: deprecate using ggml_context, try resolve issue #287 * rewrite: no longer consider backward compitability; plan and make_plan * minor: rename ctx as plan; const * remove ggml_graph_compute from tests/test-grad0.c, but current change breaks backward * add static ggml_graph_compute_sugar() * minor: update comments * reusable buffers * ggml : more consistent naming + metal fixes * ggml : fix docs * tests : disable grad / opt + minor naming changes * ggml : add ggml_graph_compute_with_ctx() - backwards compatible API - deduplicates a lot of copy-paste * ci : enable 
test-grad0 * examples : factor out plan allocation into a helper function * llama : factor out plan stuff into a helper function * ci : fix env * llama : fix duplicate symbols + refactor example benchmark * ggml : remove obsolete assert + refactor n_tasks section * ggml : fix indentation in switch * llama : avoid unnecessary bool * ggml : remove comments from source file and match order in header --------- Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 13 +- examples/baby-llama/baby-llama.cpp | 24 +- examples/benchmark/benchmark-matmult.cpp | 29 +- examples/metal/metal.cpp | 3 +- .../train-text-from-scratch.cpp | 27 +- ggml-metal.h | 6 +- ggml-metal.m | 11 +- ggml.c | 760 +++++++++--------- ggml.h | 36 +- llama.cpp | 54 +- tests/CMakeLists.txt | 2 +- tests/test-grad0.c | 37 +- tests/test-opt.c | 18 +- 13 files changed, 571 insertions(+), 449 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 12481e8be..a576139ef 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,7 +16,9 @@ on: paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + GGML_NLOOP: 3 + GGML_NITER: 1 jobs: ubuntu-focal-make: @@ -64,7 +66,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -99,7 +101,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 macOS-latest-make: runs-on: macos-latest @@ -147,10 +149,11 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 windows-latest-cmake: runs-on: windows-latest + env: OPENBLAS_VERSION: 0.3.23 OPENCL_VERSION: 2023.04.17 @@ -249,7 +252,7 @@ jobs: if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible run: | cd build - ctest -C Release --verbose + ctest -C Release --verbose --timeout 900 - name: Get commit hash id: commit diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 212f54d32..4965881ec 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) { return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? 
(rnd->max) : r); } +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + struct ggml_tensor * randomize_tensor( struct ggml_tensor * tensor, int ndims, @@ -1569,6 +1580,8 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; + std::vector work_buffer; + for (int ex=0; ex & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + float tensor_sum_elements(const ggml_tensor * tensor) { float sum = 0; if (tensor->type==GGML_TYPE_F32) { @@ -159,13 +170,14 @@ int main(int argc, char ** argv) { // printf("Creating compute graph\n"); struct ggml_cgraph gf = ggml_build_forward(m11xm2); - gf.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf.n_threads); + printf("n_threads=%i\n", benchmark_params.n_threads); TENSOR_DUMP(m11); TENSOR_DUMP(m2); - ggml_graph_compute(ctx, &gf); + std::vector work_buffer; + + ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads); TENSOR_DUMP(gf.nodes[0]); @@ -187,7 +199,6 @@ int main(int argc, char ** argv) { // printf("Creating compute graph\n"); struct ggml_cgraph gf31 = ggml_build_forward(q31); - gf31.n_threads=benchmark_params.n_threads; // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); @@ -199,8 +210,7 @@ int main(int argc, char ** argv) { //printf("Creating compute graph\n"); struct ggml_cgraph gf32 = ggml_build_forward(q32); - gf32.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf31.n_threads); + printf("n_threads=%i\n", benchmark_params.n_threads); const int dimx = sizex; const int dimy = sizey; @@ -221,14 +231,15 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); - ggml_graph_compute(ctx, &gf31); + ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads); + long long int stop = ggml_time_us(); long long int usec = stop-start; double gflops = (double)(flops_per_matrix)/usec/1000.0; gflops_sum += gflops; printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n", i, - gf31.n_threads, + benchmark_params.n_threads, sizex, sizey, sizez, flops_per_matrix, usec,gflops); @@ -253,7 +264,7 @@ int main(int argc, char ** argv) { } // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute(ctx, &gf32); + ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads); } printf("\n"); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp index cdfe4bfe9..7438defde 100644 --- a/examples/metal/metal.cpp +++ b/examples/metal/metal.cpp @@ -35,10 +35,9 @@ int main(int argc, char ** argv) { struct ggml_context * ctx_eval = NULL; struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); - gf.n_threads = 1; // this allocates all Metal resources and memory buffers - auto * ctx_metal = ggml_metal_init(); + auto * ctx_metal = ggml_metal_init(1); const size_t max_size_data = ggml_get_max_tensor_size(ctx_data); const size_t 
max_size_eval = ggml_get_max_tensor_size(ctx_eval); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index c50eeb343..b96fdcdc4 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -60,6 +60,17 @@ float frand_uniform(struct random_uniform_distribution * rnd) { return rnd->rd(rnd->gen); } +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier switch (tensor->n_dims) { @@ -1426,11 +1437,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( gf->n_nodes = 0; gf->n_leafs = 0; - gf->work_size = 0; gf->perf_runs = 0; gf->perf_cycles = 0; gf->perf_time_us = 0; - gf->work = NULL; const auto & hparams = model->hparams; //const int n_ctx = hparams.n_ctx; @@ -3162,6 +3171,7 @@ int main(int argc, char ** argv) { printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx)); // ggml_print_tensor_objects(model.ctx); + // TODO: use std::vector intead of "new" size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); uint8_t * compute_addr = new uint8_t[compute_size]; @@ -3183,6 +3193,8 @@ int main(int argc, char ** argv) { GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); } + std::vector work_buffer; + printf("%s: begin training\n", __func__); for (int ex = 0; ex < params.n_examples; ++ex) { @@ -3217,9 +3229,6 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; - // ggml_cgraph gf = {}; - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); @@ -3248,7 +3257,7 @@ int main(int argc, char ** argv) { *gb = ggml_build_backward(ctx0, gf, true); } - ggml_graph_compute(ctx0, gf); + ggml_graph_compute_helper(work_buffer, gf, params.n_threads); size_t used_mem_before_opt = ggml_used_mem(ctx0); @@ -3272,7 +3281,7 @@ int main(int argc, char ** argv) { model.train_samples += n_batch; model.train_tokens += n_batch * n_tokens; - ggml_graph_compute(ctx0, gf); + ggml_graph_compute_helper(work_buffer, gf, params.n_threads); float error_after_opt = ggml_get_f32_1d(loss, 0); @@ -3354,13 +3363,12 @@ int main(int argc, char ** argv) { struct ggml_context * ctx0 = ggml_init(cparams); ggml_cgraph gf = {}; - gf.n_threads = params.n_threads; int n_past = 0; struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); ggml_build_forward_expand(&gf, logits); - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(work_buffer, &gf, params.n_threads); //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); @@ -3386,6 +3394,7 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; delete[] compute_buf_1; + llama_free(lctx); 
llama_free_model(lmodel); ggml_free(model.ctx); diff --git a/ggml-metal.h b/ggml-metal.h index b9e50ac74..928f1705c 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -34,9 +34,13 @@ extern "C" { struct ggml_metal_context; -struct ggml_metal_context * ggml_metal_init(void); +// number of command buffers to use +struct ggml_metal_context * ggml_metal_init(int n_cb); void ggml_metal_free(struct ggml_metal_context * ctx); +// set the number of command buffers to use +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); + // creates a mapping between a host memory buffer and a device memory buffer // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute // - the mapping is used during computation to determine the arguments of the compute kernels diff --git a/ggml-metal.m b/ggml-metal.m index fd69c41fe..3f15f791f 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -25,6 +25,8 @@ struct ggml_metal_buffer { }; struct ggml_metal_context { + int n_cb; + float * logits; id device; @@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal"; @implementation GGMLMetalClass @end -struct ggml_metal_context * ggml_metal_init(void) { +struct ggml_metal_context * ggml_metal_init(int n_cb) { fprintf(stderr, "%s: allocating\n", __func__); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + ctx->n_cb = n_cb; ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; @@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { free(ctx); } +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { + ctx->n_cb = n_cb; +} + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -354,7 +361,7 @@ void ggml_metal_graph_compute( // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel - const int n_cb = gf->n_threads; + const int n_cb = ctx->n_cb; NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; diff --git a/ggml.c b/ggml.c index 4ba7ac931..55b0aff03 100644 --- a/ggml.c +++ b/ggml.c @@ -4583,14 +4583,13 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.src0 =*/ NULL, /*.src1 =*/ NULL, /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, /*.data =*/ (data == NULL && !ctx->no_alloc) ? 
(void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - /*.pad =*/ { 0 }, + /*.padding =*/ { 0 }, }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads @@ -10718,8 +10717,6 @@ static void ggml_compute_forward_mul_mat( float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - assert(ne00 % 32 == 0); - for (int64_t ic = 0; ic < ne11; ++ic) { vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size)); } @@ -15772,9 +15769,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, - /*.work_size =*/ 0, - /*.work =*/ NULL, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -15945,12 +15939,13 @@ void clear_numa_thread_affinity(void) {} #endif struct ggml_compute_state_shared { - struct ggml_cgraph * cgraph; + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; - int n_threads; + const int n_threads; // synchronization primitives atomic_int n_active; // num active threads @@ -15974,9 +15969,13 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - struct ggml_cgraph * cgraph = state->shared->cgraph; - const int n_threads = state->shared->n_threads; + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int * n_tasks_arr = cplan->n_tasks; + const int n_threads = state->shared->n_threads; + set_numa_thread_affinity(state->ith, n_threads); int node_n = -1; @@ -15989,15 +15988,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; if (node_n != -1) { /* FINALIZE */ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; if (GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = node->n_tasks; + params.nth = n_tasks_arr[node_n]; ggml_compute_forward(¶ms, node); ggml_graph_compute_perf_stats_node(node, state->shared); } @@ -16008,11 +16007,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; state->shared->perf_node_start_cycles = ggml_perf_cycles(); state->shared->perf_node_start_time_us = ggml_perf_time_us(); - params.nth = node->n_tasks; + params.nth = n_tasks; /* INIT */ if (GGML_OP_HAS_INIT[node->op]) { @@ -16020,7 +16020,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); } - if (node->n_tasks == 1) { + if (n_tasks == 1) { // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) 
params.type = GGML_TASK_COMPUTE; @@ -16052,16 +16052,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; struct ggml_compute_params params = { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; - if (state->ith < node->n_tasks) { + if (state->ith < n_tasks) { ggml_compute_forward(¶ms, node); } } @@ -16069,11 +16070,364 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; +struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { + if (n_threads <= 0) { + n_threads = GGML_DEFAULT_N_THREADS; + } + + size_t work_size = 0; + + struct ggml_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_cplan)); + + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + int n_tasks = 1; + + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + n_tasks = n_threads; + + size_t cur = 0; + if (ggml_is_quantized(node->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ACC: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_ABS: + case GGML_OP_SGN: + case GGML_OP_NEG: + case GGML_OP_STEP: + case GGML_OP_TANH: + case GGML_OP_ELU: + case GGML_OP_RELU: + { + n_tasks = 1; + } break; + case GGML_OP_MUL: + case GGML_OP_GELU: + case GGML_OP_GELU_QUICK: + case GGML_OP_SILU: + case GGML_OP_SILU_BACK: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = ggml_nrows(node->src0); + //const int nr1 = ggml_nrows(node->src1); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + + size_t cur = 0; + const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; + +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } else +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the 
threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); + } else +#endif +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src0->type != GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + } + } else +#endif + if (node->src1->type != vec_dot_type) { + cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; + } else { + cur = 0; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SCALE: + { + n_tasks = 1; + } break; + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + case GGML_OP_DIAG_MASK_ZERO: + { + n_tasks = 1; + } break; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CONV_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src0->ne[3] == 1); + GGML_ASSERT(node->src1->ne[2] == 1); + GGML_ASSERT(node->src1->ne[3] == 1); + + size_t cur = 0; + const int nk = node->src0->ne[0]; + + if (node->src0->type == GGML_TYPE_F16 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_2D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src1->ne[3] == 1); + + const int64_t ne00 = node->src0->ne[0]; // W + const int64_t ne01 = node->src0->ne[1]; // H + const int64_t ne02 = node->src0->ne[2]; // C + const int64_t ne03 = node->src0->ne[3]; // N + + const int64_t ne10 = node->src1->ne[0]; // W + const int64_t ne11 = node->src1->ne[1]; // H + const int64_t ne12 = node->src1->ne[2]; // C + + const int64_t nk = ne00*ne01; + + UNUSED(ne02); + UNUSED(ne03); + UNUSED(nk); + + size_t cur = 0; + + if (node->src0->type == GGML_TYPE_F16 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)* (ne10*ne11*ne12); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; + + size_t cur = 0; + + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } + + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } + + work_size = 
MAX(work_size, cur); + } break; + case GGML_OP_FLASH_FF: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + } + + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; + + size_t cur = 0; + + const int64_t D = node->src0->ne[0]; + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: + { + n_tasks = 1; + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src0->ne[0]*n_tasks); + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks; + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + } + + cplan.n_tasks[i] = n_tasks; + } + + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } + + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; + + return cplan; +} + +void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { + { + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); + + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (cgraph->nodes[i]->op != GGML_OP_NONE) { + GGML_ASSERT(cplan->n_tasks[i] > 0); + } + } + } + + const int n_threads = cplan->n_threads; struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, + /*.cgraph_plan =*/ cplan, /*.perf_node_start_cycles =*/ 0, /*.perf_node_start_time_us =*/ 0, /*.n_threads =*/ n_threads, @@ -16082,336 +16436,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); - // initialize tasks + work buffer - { - size_t work_size = 0; - - // thread scheduling for the different operations - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - { - node->n_tasks = n_threads; - - size_t cur = 0; - if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ADD: - 
case GGML_OP_ADD1: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ACC: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SUB: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - case GGML_OP_ARGMAX: - case GGML_OP_REPEAT: - case GGML_OP_REPEAT_BACK: - case GGML_OP_ABS: - case GGML_OP_SGN: - case GGML_OP_NEG: - case GGML_OP_STEP: - case GGML_OP_TANH: - case GGML_OP_ELU: - case GGML_OP_RELU: - { - node->n_tasks = 1; - } break; - case GGML_OP_MUL: - case GGML_OP_GELU: - case GGML_OP_GELU_QUICK: - case GGML_OP_SILU: - case GGML_OP_SILU_BACK: - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_RMS_NORM_BACK: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: - { - node->n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); - - size_t cur = 0; - const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; - -#if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } - else -#elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else -#endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - if (node->src0->type != GGML_TYPE_F32) { - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } - } else -#endif - if (node->src1->type != vec_dot_type) { - cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; - } else { - cur = 0; - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SCALE: - { - node->n_tasks = 1; - } break; - case GGML_OP_SET: - case GGML_OP_CONT: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_GET_ROWS: - case GGML_OP_GET_ROWS_BACK: - case GGML_OP_DIAG: - case GGML_OP_DIAG_MASK_ZERO: - { - node->n_tasks = 1; - } break; - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: - case GGML_OP_SOFT_MAX_BACK: - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_ALIBI: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CLAMP: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CONV_1D: - { - node->n_tasks = n_threads; - - GGML_ASSERT(node->src0->ne[3] == 1); - GGML_ASSERT(node->src1->ne[2] == 1); - 
GGML_ASSERT(node->src1->ne[3] == 1); - - size_t cur = 0; - const int nk = node->src0->ne[0]; - - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CONV_2D: - { - node->n_tasks = n_threads; - - GGML_ASSERT(node->src1->ne[3] == 1); - - const int64_t ne00 = node->src0->ne[0]; // W - const int64_t ne01 = node->src0->ne[1]; // H - const int64_t ne02 = node->src0->ne[2]; // C - const int64_t ne03 = node->src0->ne[3]; // N - - const int64_t ne10 = node->src1->ne[0]; // W - const int64_t ne11 = node->src1->ne[1]; // H - const int64_t ne12 = node->src1->ne[2]; // C - - const int64_t nk = ne00*ne01; - - UNUSED(ne02); - UNUSED(ne03); - UNUSED(nk); - - size_t cur = 0; - - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)* (ne10*ne11*ne12); - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } - - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_FF: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } - - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN_BACK: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - const int64_t D = node->src0->ne[0]; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } - - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_WIN_PART: - case GGML_OP_WIN_UNPART: - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: 
- case GGML_OP_MAP_CUSTOM1: - case GGML_OP_MAP_CUSTOM2: - case GGML_OP_MAP_CUSTOM3: - { - node->n_tasks = 1; - } break; - case GGML_OP_CROSS_ENTROPY_LOSS: - { - node->n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CROSS_ENTROPY_LOSS_BACK: - { - node->n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_NONE: - { - node->n_tasks = 1; - } break; - case GGML_OP_COUNT: - { - GGML_ASSERT(false); - } break; - } - } - - if (cgraph->work != NULL && work_size > cgraph->work_size) { - GGML_ASSERT(false); // TODO: better handling - } - - if (work_size > 0 && cgraph->work == NULL) { - cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); - - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); - cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); - } - } - // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { @@ -16473,6 +16497,17 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } +void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + + struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size); + GGML_ASSERT(buf); + + cplan.work_data = buf->data; + + ggml_graph_compute(cgraph, &cplan); +} + struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i]; @@ -16511,14 +16546,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, tensor->data, tensor->name); } @@ -17254,9 +17288,6 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_cgraph * gb) { GGML_ASSERT(ggml_is_scalar(f)); - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - // these will store the parameters we want to optimize struct ggml_tensor * ps[GGML_MAX_PARAMS]; @@ -17303,7 +17334,8 @@ static enum ggml_opt_result ggml_opt_adam( // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17383,7 +17415,8 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); @@ -17505,7 +17538,8 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params->n_threads); ggml_opt_get_grad(np, ps, g); @@ -17573,9 +17607,6 @@ static enum 
ggml_opt_result ggml_opt_lbfgs( } } - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - const int m = params.lbfgs.m; // these will store the parameters we want to optimize @@ -17627,7 +17658,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); ggml_opt_get_grad(np, ps, g); diff --git a/ggml.h b/ggml.h index d0710c555..ab84bef68 100644 --- a/ggml.h +++ b/ggml.h @@ -65,7 +65,7 @@ // ggml_set_f32(a, 3.0f); // ggml_set_f32(b, 4.0f); // -// ggml_graph_compute(ctx0, &gf); +// ggml_graph_compute_with_ctx(ctx, &gf, n_threads); // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); // @@ -418,9 +418,6 @@ extern "C" { struct ggml_tensor * src1; struct ggml_tensor * opt[GGML_MAX_OPT]; - // thread scheduling - int n_tasks; - // performance int perf_runs; int64_t perf_cycles; @@ -432,19 +429,27 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[4]; + char padding[8]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + + int n_threads; + + // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes + int n_tasks[GGML_MAX_NODES]; + }; + // computation graph struct ggml_cgraph { int n_nodes; int n_leafs; - int n_threads; - - size_t work_size; - struct ggml_tensor * work; struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; @@ -1290,15 +1295,22 @@ extern "C" { GGML_API void ggml_set_param( struct ggml_context * ctx, - struct ggml_tensor * tensor); + struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); diff --git a/llama.cpp b/llama.cpp index 02afdeb14..ee6ec0920 100644 --- a/llama.cpp +++ b/llama.cpp @@ -79,6 +79,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } +// +// ggml helpers +// + +static void ggml_graph_compute_helper(std::vector & buf, 
ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +// +// memory sizes +// + static const std::map & MEM_REQ_SCRATCH0() { static std::map k_sizes = { @@ -321,6 +340,9 @@ struct llama_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; + // reusable buffer for `struct ggml_graph_plan.work_data` + std::vector work_buffer; + // memory buffers used to evaluate the model // TODO: move in llama_state llama_ctx_buffer buf_compute; @@ -758,7 +780,6 @@ struct llama_model_loader { }; - // // kv cache // @@ -1265,7 +1286,7 @@ static bool llama_eval_internal( const float * embd, const int n_tokens, const int n_past, - const int n_threads, + int n_threads, const char * cgraph_fname) { LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); @@ -1306,10 +1327,11 @@ static bool llama_eval_internal( struct ggml_context * ctx0 = ggml_init(params); + ggml_cgraph gf = {}; + // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -1593,6 +1615,7 @@ static bool llama_eval_internal( #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { + ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_get_tensor (lctx.ctx_metal, cur); } else { @@ -1612,10 +1635,10 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); } #else - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); #endif if (cgraph_fname) { @@ -2575,8 +2598,8 @@ void llama_free_model(struct llama_model * model) { } struct llama_context * llama_new_context_with_model( - struct llama_model * model, - struct llama_context_params params) { + struct llama_model * model, + struct llama_context_params params) { if (!model) { return nullptr; @@ -2645,7 +2668,7 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_METAL if (params.n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers - ctx->ctx_metal = ggml_metal_init(); + ctx->ctx_metal = ggml_metal_init(1); void * data_ptr = NULL; size_t data_size = 0; @@ -2802,6 +2825,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // read tensors and apply bool warned = false; int n_tensors = 0; + + std::vector work_buffer; + while (true) { int32_t n_dims; int32_t length; @@ -2966,8 +2992,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const } struct ggml_cgraph gf = ggml_build_forward(r); - gf.n_threads = n_threads; - ggml_graph_compute(lora_ctx, &gf); + + ggml_graph_compute_helper(work_buffer, &gf, n_threads); // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3120,7 +3146,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - gf.n_threads = 1; 
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kout3d->data = out; @@ -3140,7 +3165,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - ggml_graph_compute(cpy_ctx, &gf); + ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } @@ -3226,7 +3251,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - gf.n_threads = 1; ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kin3d->data = (void *) inp; @@ -3246,7 +3270,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - ggml_graph_compute(cpy_ctx, &gf); + ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4171c126c..1acf050a7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp) llama_add_test(test-quantize-perf.cpp) llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) -# llama_add_test(test-grad0.c) # SLOW +llama_add_test(test-grad0.c) # SLOW # llama_add_test(test-opt.c) # SLOW diff --git a/tests/test-grad0.c b/tests/test-grad0.c index a3e25214b..da4001ce5 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -10,6 +10,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#pragma GCC diagnostic ignored "-Wdouble-promotion" + #define MAX_NARGS 3 #undef MIN @@ -49,7 +51,7 @@ float frand(void) { int irand(int n) { if (n == 0) return 0; - else return rand()%n; + return rand()%n; } void get_random_dims(int64_t * dims, int ndims) { @@ -159,12 +161,14 @@ struct ggml_tensor * get_random_tensor_int( float get_element(const struct ggml_tensor * t, int idx) { if (t->type == GGML_TYPE_F32) { return ((float *)t->data)[idx]; - } else if (t->type == GGML_TYPE_I32) { - return ((int32_t *)t->data)[idx]; - } else { - assert(false); - return INFINITY; } + + if (t->type == GGML_TYPE_I32) { + return ((int32_t *)t->data)[idx]; + } + + assert(false); + return INFINITY; } void set_element(struct ggml_tensor * t, int idx, float value) { @@ -215,15 +219,14 @@ bool check_gradient( } struct ggml_cgraph gf = ggml_build_forward (f); - gf.n_threads = n_threads; - struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); - gb.n_threads = n_threads; - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx0, &gb); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot"); @@ -236,15 +239,16 @@ bool check_gradient( const float xm = x0 - eps; const float xp = x0 + eps; set_element(x[i], k, xp); - ggml_graph_compute(ctx0, &gf); + + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); const float f0 = ggml_get_f32_1d(f, 0); set_element(x[i], k, xm); - ggml_graph_compute(ctx0, &gf); + + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); const float f1 = 
ggml_get_f32_1d(f, 0); - const float g0 = (f0 - f1)/(2.0f*eps); set_element(x[i], k, x0); @@ -252,12 +256,13 @@ bool check_gradient( // compute gradient using backward graph ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx0, &gb); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); const float g1 = get_element(x[i]->grad, k); const float error_abs = fabsf(g0 - g1); - const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0; + const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0; if (error_abs > max_error_abs || error_rel > max_error_rel) { printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n", diff --git a/tests/test-opt.c b/tests/test-opt.c index d001615ee..e928a7df7 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -7,6 +7,7 @@ #define MAX_NARGS 2 +#pragma GCC diagnostic ignored "-Wdouble-promotion" // // logging @@ -33,7 +34,7 @@ #define GGML_PRINT(...) printf(__VA_ARGS__) -float frand() { +float frand(void) { return (float)rand()/(float)RAND_MAX; } @@ -114,7 +115,7 @@ void set_element(struct ggml_tensor * t, int idx, float value) { ((float *)t->data)[idx] = value; } -int main(int argc, const char ** argv) { +int main(void) { struct ggml_init_params params = { .mem_size = 1024*1024*1024, .mem_buffer = NULL, @@ -137,10 +138,11 @@ int main(int argc, const char ** argv) { struct ggml_tensor * d = ggml_sub(ctx, c, ab); struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d)); - struct ggml_cgraph ge = ggml_build_forward(e); - ggml_graph_reset (&ge); - ggml_graph_compute(ctx, &ge); + ggml_graph_reset(&ge); + + ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1); + const float fe = ggml_get_f32_1d(e, 0); printf("%s: e = %.4f\n", __func__, fe); @@ -148,8 +150,10 @@ int main(int argc, const char ** argv) { ggml_opt(ctx, opt_params, e); - ggml_graph_reset (&ge); - ggml_graph_compute(ctx, &ge); + ggml_graph_reset(&ge); + + ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1); + const float fe_opt = ggml_get_f32_1d(e, 0); printf("%s: original e = %.4f\n", __func__, fe); printf("%s: optimized e = %.4f\n", __func__, fe_opt); From a7e20edf2266169ccd97a4eb949a593d628fbd64 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 7 Jul 2023 21:23:57 +0300 Subject: [PATCH 08/24] ci : switch threads to 1 (#2138) --- .github/workflows/build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a576139ef..f6a2dd6da 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,6 +19,7 @@ env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GGML_NLOOP: 3 GGML_NITER: 1 + GGML_N_THREADS: 1 jobs: ubuntu-focal-make: From 84525e7962bee0abef91108948bbf7f7bfdcf421 Mon Sep 17 00:00:00 2001 From: dylan Date: Fri, 7 Jul 2023 11:25:25 -0700 Subject: [PATCH 09/24] docker : add support for CUDA in docker (#1461) Co-authored-by: canardleteer Co-authored-by: Georgi Gerganov --- .devops/full-cuda.Dockerfile | 33 +++++++++++++++++++++++++++++++++ .devops/main-cuda.Dockerfile | 32 ++++++++++++++++++++++++++++++++ Makefile | 8 +++++++- README.md | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 .devops/full-cuda.Dockerfile create mode 100644 .devops/main-cuda.Dockerfile diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile new file mode 100644 index 000000000..e5fcb37d6 --- /dev/null +++ b/.devops/full-cuda.Dockerfile @@ -0,0 +1,33 @@ +ARG 
UBUNTU_VERSION=22.04 + +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=11.7.1 + +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +ARG CUDA_DOCKER_ARCH=all + +RUN apt-get update && \ + apt-get install -y build-essential python3 python3-pip + +COPY requirements.txt requirements.txt + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} +# Enable cuBLAS +ENV LLAMA_CUBLAS=1 + +RUN make + +ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile new file mode 100644 index 000000000..30c01196a --- /dev/null +++ b/.devops/main-cuda.Dockerfile @@ -0,0 +1,32 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=11.7.1 +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# Target the CUDA runtime image +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +ARG CUDA_DOCKER_ARCH=all + +RUN apt-get update && \ + apt-get install -y build-essential + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} +# Enable cuBLAS +ENV LLAMA_CUBLAS=1 + +RUN make + +FROM ${BASE_CUDA_RUN_CONTAINER} as runtime + +COPY --from=build /app/main /main + +ENTRYPOINT [ "/main" ] diff --git a/Makefile b/Makefile index 71415664b..6068cbe7b 100644 --- a/Makefile +++ b/Makefile @@ -163,7 +163,12 @@ ifdef LLAMA_CUBLAS LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib OBJS += ggml-cuda.o NVCC = nvcc - NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native + NVCCFLAGS = --forward-unknown-to-host-compiler +ifdef CUDA_DOCKER_ARCH + NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) +else + NVCCFLAGS += -arch=native +endif # CUDA_DOCKER_ARCH ifdef LLAMA_CUDA_FORCE_DMMV NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV endif # LLAMA_CUDA_FORCE_DMMV @@ -187,6 +192,7 @@ ifdef LLAMA_CUDA_KQUANTS_ITER else NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif + ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@ endif # LLAMA_CUBLAS diff --git a/README.md b/README.md index 863aef123..7953fd3a0 100644 --- a/README.md +++ b/README.md @@ -731,6 +731,38 @@ or with a light image: docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` +### Docker With CUDA + +Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container. + +#### Building Locally + +```bash +docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . +docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile . +``` + +You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. 
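For instance, assuming the host driver supports the default CUDA version and you only need a single GPU architecture, the build arguments can be overridden at build time (the architecture value below is only a placeholder, pick the one matching your card):

```bash
# hypothetical example: pin the image to one GPU architecture instead of building a fat binary
docker build --build-arg CUDA_DOCKER_ARCH=sm_86 -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
```

`CUDA_VERSION` and `UBUNTU_VERSION` can be overridden with additional `--build-arg` flags in the same way.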
+ +The defaults are: + +- `CUDA_VERSION` set to `11.7.1` +- `CUDA_DOCKER_ARCH` set to `all` + +The resulting images, are essentially the same as the non-CUDA images: + +1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. +2. `local/llama.cpp:light-cuda`: This image only includes the main executable file. + +#### Usage + +After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag. + +```bash +docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +``` + ### Contributing - Contributors can open PRs From 061f5f8d2109bb7adcbd40f1b456d887c5a1df25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 8 Jul 2023 00:25:15 +0200 Subject: [PATCH 10/24] CUDA: add __restrict__ to mul mat vec kernels (#2140) --- ggml-cuda.cu | 53 +++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7965ff741..ec41e3524 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -59,8 +59,8 @@ typedef float2 dfloat2; #endif //GGML_CUDA_DMMV_F16 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); -typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); -typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v); +typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream); +typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); typedef void (*ggml_cuda_op_t)( @@ -131,7 +131,7 @@ typedef struct { } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding"); -typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs); +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs); //================================= k-quants @@ -407,7 +407,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in //================================== k-quants -static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) { const int i = blockIdx.x; const block_q2_K * x = (const block_q2_K *) vx; @@ -440,7 +440,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { } -static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) { const int i = blockIdx.x; const block_q3_K * x = (const block_q3_K *) vx; @@ -504,7 +504,7 @@ static inline __device__ void 
get_scale_min_k4(int j, const uint8_t * q, uint8_t } #endif -static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) { const block_q4_K * x = (const block_q4_K *) vx; const int i = blockIdx.x; @@ -544,7 +544,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { #endif } -static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) { const block_q5_K * x = (const block_q5_K *) vx; const int i = blockIdx.x; @@ -590,7 +590,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { #endif } -static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) { const block_q6_K * x = (const block_q6_K *) vx; const int i = blockIdx.x; @@ -634,7 +634,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { #endif } -static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -742,7 +742,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nrows) return; @@ -846,7 +846,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nrows) return; @@ -949,7 +949,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) { +static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) { const int row = blockIdx.x; const int num_blocks_per_row = ncols / QK_K; @@ -1053,7 +1053,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -1171,7 +1171,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, 
v.y = x[ib + iqs + 1]; } -static __global__ void quantize_q8_1(const float * x, void * vy, const int k) { +static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { @@ -1207,7 +1207,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) { } template -static __global__ void dequantize_block(const void * vx, float * y, const int k) { +static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) { const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; if (i >= k) { @@ -1227,7 +1227,7 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k) y[iybs + iqs + y_offset] = v.y; } -static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; @@ -1252,7 +1252,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons #endif // __CUDA_ARCH__ >= 600 } -static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; @@ -1277,7 +1277,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons #endif // __CUDA_ARCH__ >= 600 } -static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; @@ -1312,7 +1312,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons #endif // __CUDA_ARCH__ >= 600 } -static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; @@ -1346,7 +1346,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons #endif // __CUDA_ARCH__ >= 600 } -static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; @@ -1366,7 +1366,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons } template -static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) { 
+static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row >= nrows) { @@ -1404,7 +1404,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d } template -static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) { +static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { // qk = quantized weights per x block // qr = number of quantized weights per data value in x block const int row = blockIdx.y*blockDim.y + threadIdx.y; @@ -1471,7 +1471,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, } } -static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) { +static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) { const half * x = (const half *) vx; const int row_x = blockDim.y*blockIdx.y + threadIdx.y; @@ -1518,7 +1518,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl } static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous - const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int row_stride_x, const int channel_stride_x) { const half * x = (const half *) vx; @@ -2355,10 +2355,7 @@ inline void ggml_cuda_op_mul_mat_vec( src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0; - // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6. - // However, they have bad performance with Pascal cards. - // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q. - const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented; + const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented; #endif if (use_mul_mat_vec_q) { From 64639555ff93c8ead2b80becb49cc6b60aeac240 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 8 Jul 2023 20:01:44 +0200 Subject: [PATCH 11/24] Fixed OpenLLaMA 3b CUDA mul_mat_vec_q (#2144) --- ggml-cuda.cu | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index ec41e3524..fd36f179b 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -208,6 +208,7 @@ typedef struct { static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); #define WARP_SIZE 32 +#define MATRIX_ROW_PADDING 256 // last row of quant. 
matrices is a multiple of this to avoid out-of-bounds memory accesses #define CUDA_ADD_BLOCK_SIZE 256 #define CUDA_MUL_BLOCK_SIZE 256 @@ -1171,7 +1172,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, v.y = x[ib + iqs + 1]; } -static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int k) { +static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { @@ -1180,10 +1181,10 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest block_q8_1 * y = (block_q8_1 *) vy; - const int ib = i / QK8_0; // block index - const int iqs = i % QK8_0; // quant index + const int ib = i / QK8_1; // block index + const int iqs = i % QK8_1; // quant index - const float xi = x[i]; + const float xi = i < ndata ? x[i] : 0.0f; float amax = fabsf(xi); float sum = xi; @@ -1714,9 +1715,9 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con rms_norm_f32<<>>(x, dst, ncols); } -static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) { +static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - quantize_q8_1<<>>(x, vy, k); + quantize_q8_1<<>>(x, vy, ndata, k); } static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -2359,9 +2360,11 @@ inline void ggml_cuda_op_mul_mat_vec( #endif if (use_mul_mat_vec_q) { + int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1; + padded_row_size -= padded_row_size % MATRIX_ROW_PADDING; size_t as; - void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as); - quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main); + void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as); + quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main); switch (src0->type) { case GGML_TYPE_Q4_0: @@ -3105,7 +3108,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { int nrows = ggml_nrows(tensor); + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_backend backend = tensor->backend; struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); @@ -3134,11 +3141,24 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { int64_t nrows_split = row_high - row_low; const size_t offset_split = row_low*nb1; - const size_t size = ggml_nbytes_split(tensor, nrows_split); + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; - void * buf; + // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) + * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + } + + char * buf; CUDA_CHECK(cudaMalloc(&buf, size)); - void * buf_host = (char*)data + offset_split; + char * buf_host = (char*)data + offset_split; + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); + } + 
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice); From 2492a53fd0d8372ecc67f49f07b581905175eea8 Mon Sep 17 00:00:00 2001 From: rankaiyx Date: Sun, 9 Jul 2023 15:38:42 +0800 Subject: [PATCH 12/24] readme : add more docs indexes (#2127) * Update README.md to add more docs indexes * Update README.md to add more docs indexes --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7953fd3a0..318632989 100644 --- a/README.md +++ b/README.md @@ -783,5 +783,10 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m / ### Docs -- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) +- [main](./examples/main/README.md) +- [server](./examples/server/README.md) +- [embd-input](./examples/embd-input/README.md) +- [jeopardy](./examples/jeopardy/README.md) +- [BLIS](./docs/BLIS.md) - [Performance troubleshooting](./docs/token_generation_performance_tips.md) +- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) From 3bbc1a11f04a9adc0d0e08c2940ba4d2978755ab Mon Sep 17 00:00:00 2001 From: clyang Date: Sun, 9 Jul 2023 16:12:20 +0800 Subject: [PATCH 13/24] ggml : fix buidling with Intel MKL but ask for "cblas.h" issue (#2104) (#2115) * Fix buidling with Intel MKL but ask for "cblas.h" issue * Use angle brackets to indicate the system library --- CMakeLists.txt | 3 +++ ggml.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index a2404548f..eed7b1b7b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -217,6 +217,9 @@ if (LLAMA_BLAS) message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") add_compile_options(${BLAS_LINKER_FLAGS}) add_compile_definitions(GGML_USE_OPENBLAS) + if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) + add_compile_definitions(GGML_BLAS_USE_MKL) + endif() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) diff --git a/ggml.c b/ggml.c index 55b0aff03..c10877a76 100644 --- a/ggml.c +++ b/ggml.c @@ -247,7 +247,11 @@ inline static void* ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif #elif defined(GGML_USE_OPENBLAS) +#if defined(GGML_BLAS_USE_MKL) +#include +#else #include +#endif #elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) From 18780e0a5e17348236230bbe891901b9b5718709 Mon Sep 17 00:00:00 2001 From: JackJollimore <130917767+JackJollimore@users.noreply.github.com> Date: Sun, 9 Jul 2023 05:20:43 -0300 Subject: [PATCH 14/24] readme : update Termux instructions (#2147) The file pathing is significant when running models inside of Termux on Android devices. llama.cpp performance is improved with loading a .bin from the $HOME directory. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 318632989..daa71c2b9 100644 --- a/README.md +++ b/README.md @@ -695,7 +695,7 @@ export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle. -Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script. +Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script. 
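For example, a minimal run might look like this (the model file name is a placeholder, use whichever quantized `.bin` you placed there):

```bash
./main -m ~/llama.cpp/models/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 128
```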
### Docker From db4047ad5cd8eae04db3b2efe0245e69a376601a Mon Sep 17 00:00:00 2001 From: Nigel Bosch Date: Sun, 9 Jul 2023 03:56:18 -0500 Subject: [PATCH 15/24] main : escape prompt prefix/suffix (#2151) --- examples/common.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/common.cpp b/examples/common.cpp index 3278a0643..93159c6df 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -418,6 +418,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { if (escape_prompt) { process_escapes(params.prompt); + process_escapes(params.input_prefix); + process_escapes(params.input_suffix); } return true; From 1d1630996920f889cdc08de26cebf2415958540e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 9 Jul 2023 05:59:53 -0300 Subject: [PATCH 16/24] llama : remove "first token must be BOS" restriction (#2153) --- llama.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index ee6ec0920..a491f1c7e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1291,12 +1291,6 @@ static bool llama_eval_internal( LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); - // enforce that the first token is BOS - if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) { - fprintf(stderr, "%s: first token must be BOS\n", __func__); - return false; - } - const int64_t t_start_us = ggml_time_us(); const int N = n_tokens; From 5656d10599bd756dc0f17284e418e704200b43f3 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Mon, 10 Jul 2023 11:49:56 -0400 Subject: [PATCH 17/24] mpi : add support for distributed inference via MPI (#2099) * MPI support, first cut * fix warnings, update README * fixes * wrap includes * PR comments * Update CMakeLists.txt * Add GH workflow, fix test * Add info to README * mpi : trying to move more MPI stuff into ggml-mpi (WIP) (#2099) * mpi : add names for layer inputs + prep ggml_mpi_graph_compute() * mpi : move all MPI logic into ggml-mpi Not tested yet * mpi : various fixes - communication now works but results are wrong * mpi : fix output tensor after MPI compute (still not working) * mpi : fix inference * mpi : minor * Add OpenMPI to GH action * [mpi] continue-on-error: true * mpi : fix after master merge * [mpi] Link MPI C++ libraries to fix OpenMPI * tests : fix new llama_backend API * [mpi] use MPI_INT32_T * mpi : factor out recv / send in functions and reuse * mpi : extend API to allow usage with outer backends (e.g. 
Metal) --------- Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 34 ++++ .gitignore | 1 + CMakeLists.txt | 24 +++ Makefile | 9 ++ README.md | 39 +++++ examples/embd-input/embd-input-lib.cpp | 2 +- examples/embedding/embedding.cpp | 4 +- examples/main/main.cpp | 4 +- examples/perplexity/perplexity.cpp | 4 +- examples/quantize/quantize.cpp | 4 +- examples/server/server.cpp | 4 +- examples/simple/simple.cpp | 4 +- ggml-metal.m | 1 + ggml-mpi.c | 216 +++++++++++++++++++++++++ ggml-mpi.h | 39 +++++ llama.cpp | 98 +++++++---- llama.h | 4 +- tests/test-tokenizer-0.cpp | 4 + 18 files changed, 460 insertions(+), 35 deletions(-) create mode 100644 ggml-mpi.c create mode 100644 ggml-mpi.h diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f6a2dd6da..b6e21b4ec 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -104,6 +104,40 @@ jobs: cd build ctest --verbose --timeout 900 + ubuntu-latest-cmake-mpi: + runs-on: ubuntu-latest + + continue-on-error: true + + strategy: + matrix: + mpi_library: [mpich, libopenmpi-dev] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v1 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential ${{ matrix.mpi_library }} + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake -DLLAMA_MPI=ON .. + cmake --build . --config Release + + - name: Test + id: cmake_test + run: | + cd build + ctest --verbose + macOS-latest-make: runs-on: macos-latest diff --git a/.gitignore b/.gitignore index 4fccec31b..faec869e0 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ build-static/ build-cublas/ build-opencl/ build-metal/ +build-mpi/ build-no-accel/ build-sanitize-addr/ build-sanitize-thread/ diff --git a/CMakeLists.txt b/CMakeLists.txt index eed7b1b7b..cf6cd34f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,6 +75,7 @@ option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" OFF) +option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) @@ -308,6 +309,28 @@ if (LLAMA_METAL) ) endif() +if (LLAMA_MPI) + cmake_minimum_required(VERSION 3.10) + find_package(MPI) + if (MPI_C_FOUND) + message(STATUS "MPI found") + set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h) + add_compile_definitions(GGML_USE_MPI) + add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS}) + set(cxx_flags ${cxx_flags} -Wno-cast-qual) + set(c_flags ${c_flags} -Wno-cast-qual) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES}) + set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS}) + # Even if you're only using the C header, C++ programs may bring in MPI + # C++ functions, so more linkage is needed + if (MPI_CXX_FOUND) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES}) + endif() + else() + message(WARNING "MPI not found") + endif() +endif() + if (LLAMA_CLBLAST) find_package(CLBlast) if (CLBlast_FOUND) @@ -476,6 +499,7 @@ add_library(ggml OBJECT ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_SOURCES_METAL} + ${GGML_SOURCES_MPI} ${GGML_SOURCES_EXTRA} ) diff --git a/Makefile b/Makefile index 6068cbe7b..f887ed67e 100644 --- a/Makefile +++ b/Makefile @@ -147,6 +147,15 @@ ifndef LLAMA_NO_ACCELERATE endif endif # LLAMA_NO_ACCELERATE +ifdef 
LLAMA_MPI + CFLAGS += -DGGML_USE_MPI -Wno-cast-qual + CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual + OBJS += ggml-mpi.o + +ggml-mpi.o: ggml-mpi.c ggml-mpi.h + $(CC) $(CFLAGS) -c $< -o $@ +endif # LLAMA_MPI + ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas LDFLAGS += -lopenblas diff --git a/README.md b/README.md index daa71c2b9..63457b6ae 100644 --- a/README.md +++ b/README.md @@ -268,6 +268,45 @@ Any value larger than 0 will offload the computation to the GPU. For example: ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1 ``` +### MPI Build + +MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine. + +First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc). + +Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically): + +- Using `make`: + + ```bash + make CC=mpicc CXX=mpicxx LLAMA_MPI=1 + ``` + +- Using `CMake`: + + ```bash + cmake -S . -B build -DLLAMA_MPI=ON + ``` + +Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines. + +Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost". + +Here is an example hostfile: + +``` +192.168.0.1:2 +malvolio.local:1 +``` + +The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive. + +Finally, you're ready to run a computation using `mpirun`: + +```bash +mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 +``` + ### BLAS Build Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. 
There are currently three different implementations of it: diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 5fa4942be..26563821a 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -34,7 +34,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) { } fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - llama_init_backend(params.numa); + llama_backend_init(params.numa); llama_model * model; llama_context * ctx; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 03e801c2a..5192d6df5 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -35,7 +35,7 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } - llama_init_backend(params.numa); + llama_backend_init(params.numa); llama_model * model; llama_context * ctx; @@ -93,5 +93,7 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); + llama_backend_free(); + return 0; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 0f6391acb..07d8fc6ac 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -105,7 +105,7 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } - llama_init_backend(params.numa); + llama_backend_init(params.numa); llama_model * model; llama_context * ctx; @@ -671,5 +671,7 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); + llama_backend_free(); + return 0; } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index fd4b03cb2..7e120ff12 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } - llama_init_backend(params.numa); + llama_backend_init(params.numa); llama_model * model; llama_context * ctx; @@ -172,5 +172,7 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); + llama_backend_free(); + return 0; } diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 1eb0f75d6..797d2f0c5 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -180,7 +180,7 @@ int main(int argc, char ** argv) { usage(argv[0]); } - llama_init_backend(false); + llama_backend_init(false); // parse command line arguments const std::string fname_inp = argv[arg_idx]; @@ -257,5 +257,7 @@ int main(int argc, char ** argv) { printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0); } + llama_backend_free(); + return 0; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2cbfc0018..296c5d646 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1079,7 +1079,7 @@ int main(int argc, char **argv) params.model_alias = params.model; } - llama_init_backend(params.numa); + llama_backend_init(params.numa); LOG_INFO("build info", {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); @@ -1309,5 +1309,7 @@ int main(int argc, char **argv) return 1; } + llama_backend_free(); + return 0; } diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 2d913cebb..aa2c4352d 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -66,7 +66,7 @@ int main(int argc, char ** argv) // Init LLM : //--------------------------------- - llama_init_backend(params.numa); + llama_backend_init(params.numa); llama_model * model; 
llama_context * ctx; @@ -173,6 +173,8 @@ int main(int argc, char ** argv) llama_free( ctx ); llama_free_model( model ); + llama_backend_free(); + return 0; } diff --git a/ggml-metal.m b/ggml-metal.m index 3f15f791f..6473644c2 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -450,6 +450,7 @@ void ggml_metal_graph_compute( //} switch (dst->op) { + case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_TRANSPOSE: diff --git a/ggml-mpi.c b/ggml-mpi.c new file mode 100644 index 000000000..872e808de --- /dev/null +++ b/ggml-mpi.c @@ -0,0 +1,216 @@ +#include "ggml-mpi.h" + +#include "ggml.h" + +#include + +#include +#include + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#define UNUSED GGML_UNUSED + +struct ggml_mpi_context { + int rank; + int size; +}; + +void ggml_mpi_backend_init(void) { + MPI_Init(NULL, NULL); +} + +void ggml_mpi_backend_free(void) { + MPI_Finalize(); +} + +struct ggml_mpi_context * ggml_mpi_init(void) { + struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context)); + + MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank); + MPI_Comm_size(MPI_COMM_WORLD, &ctx->size); + + return ctx; +} + +void ggml_mpi_free(struct ggml_mpi_context * ctx) { + free(ctx); +} + +int ggml_mpi_rank(struct ggml_mpi_context * ctx) { + return ctx->rank; +} + +void ggml_mpi_eval_init( + struct ggml_mpi_context * ctx_mpi, + int * n_tokens, + int * n_past, + int * n_threads) { + UNUSED(ctx_mpi); + + // synchronize the worker node parameters with the root node + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Bcast(n_tokens, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(n_past, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD); +} + +static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { + struct ggml_tensor * t = ggml_graph_get_tensor(gf, name); + if (t == NULL) { + fprintf(stderr, "%s: tensor %s not found\n", __func__, name); + return -1; + } + + for (int i = 0; i < gf->n_nodes; i++) { + if (gf->nodes[i] == t) { + return i; + } + } + + fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name); + return -1; +} + +static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) { + MPI_Datatype mpi_type; + + switch (t->type) { + case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break; + case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break; + default: GGML_ASSERT(false && "not implemented"); + } + + const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD); + GGML_ASSERT(retval == MPI_SUCCESS); +} + +static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) { + MPI_Datatype mpi_type; + + switch (t->type) { + case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break; + case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break; + default: GGML_ASSERT(false && "not implemented"); + } + + MPI_Status status; UNUSED(status); + + const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + GGML_ASSERT(retval == MPI_SUCCESS); +} + +// TODO: there are many improvements that can be done to this implementation +void ggml_mpi_graph_compute_pre( + struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers) { + const int mpi_rank = ctx_mpi->rank; + const int mpi_size = ctx_mpi->size; + + struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens"); + if (inp_tokens == NULL) { + fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__); + return; + } + + struct ggml_tensor * inp0 = 
ggml_graph_get_tensor(gf, "layer_inp_0"); + if (inp0 == NULL) { + fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__); + return; + } + + GGML_ASSERT(inp0 == gf->nodes[0]); + + // distribute the compute graph into slices across the MPI nodes + // + // the main node (0) processes the last layers + the remainder of the compute graph + // and is responsible to pass the input tokens to the first node (1) + // + // node 1: [( 0) * n_per_node, ( 1) * n_per_node) + // node 2: [( 1) * n_per_node, ( 2) * n_per_node) + // ... + // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node) + // node 0: [(n-1) * n_per_node, n_nodes) + // + if (mpi_rank > 0) { + if (mpi_rank == 1) { + // the first node (1) receives the input tokens from the main node (0) + ggml_mpi_tensor_recv(inp_tokens, 0); + } else { + // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph) + ggml_mpi_tensor_recv(inp0, mpi_rank - 1); + } + } else if (mpi_size > 1) { + // node 0 sends the input tokens to node 1 + ggml_mpi_tensor_send(inp_tokens, 1); + + // recv the output data from the last node + ggml_mpi_tensor_recv(inp0, mpi_size - 1); + } + + { + const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; + + const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1; + + const int il0 = (mpi_idx + 0) * n_per_node; + const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node); + + char name_l0[GGML_MAX_NAME]; + char name_l1[GGML_MAX_NAME]; + + snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0); + snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1); + + const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0); + const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes; + + if (idx_l0 < 0 || idx_l1 < 0) { + fprintf(stderr, "%s: layer input nodes not found\n", __func__); + return; + } + + // attach the input data to all nodes that need it + // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below) + for (int i = idx_l0; i < idx_l1; i++) { + if (gf->nodes[i]->src0 == gf->nodes[idx_l0]) { + gf->nodes[i]->src0 = inp0; + } + if (gf->nodes[i]->src1 == gf->nodes[idx_l0]) { + gf->nodes[i]->src1 = inp0; + } + } + + // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph + for (int i = 1; i < idx_l1 - idx_l0; i++) { + gf->nodes[i] = gf->nodes[idx_l0 + i]; + gf->grads[i] = gf->grads[idx_l0 + i]; + } + + // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node + if (mpi_idx != 0) { + gf->nodes[0]->op = GGML_OP_NONE; + } + + gf->n_nodes = idx_l1 - idx_l0; + + //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1); + } +} + +void ggml_mpi_graph_compute_post( + struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers) { + UNUSED(n_layers); + + const int mpi_rank = ctx_mpi->rank; + const int mpi_size = ctx_mpi->size; + + // send the output data to the next node + if (mpi_rank > 0) { + ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size); + } +} diff --git a/ggml-mpi.h b/ggml-mpi.h new file mode 100644 index 000000000..eda119d44 --- /dev/null +++ b/ggml-mpi.h @@ -0,0 +1,39 @@ +#pragma once + +struct ggml_context; +struct ggml_tensor; +struct ggml_cgraph; + +#ifdef __cplusplus +extern "C" { +#endif + +struct ggml_mpi_context; + +void ggml_mpi_backend_init(void); +void ggml_mpi_backend_free(void); + +struct ggml_mpi_context * 
ggml_mpi_init(void); +void ggml_mpi_free(struct ggml_mpi_context * ctx); + +int ggml_mpi_rank(struct ggml_mpi_context * ctx); + +void ggml_mpi_eval_init( + struct ggml_mpi_context * ctx_mpi, + int * n_tokens, + int * n_past, + int * n_threads); + +void ggml_mpi_graph_compute_pre( + struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers); + +void ggml_mpi_graph_compute_post( + struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers); + +#ifdef __cplusplus +} +#endif diff --git a/llama.cpp b/llama.cpp index a491f1c7e..ad7283faf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -19,6 +19,9 @@ #ifdef GGML_USE_METAL #include "ggml-metal.h" #endif +#ifdef GGML_USE_MPI +#include "ggml-mpi.h" +#endif #ifdef GGML_USE_K_QUANTS #ifndef QK_K #ifdef GGML_QKK_64 @@ -352,6 +355,10 @@ struct llama_context { ggml_metal_context * ctx_metal = NULL; #endif +#ifdef GGML_USE_MPI + ggml_mpi_context * ctx_mpi = NULL; +#endif + int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; @@ -870,7 +877,7 @@ bool llama_mlock_supported() { return llama_mlock::SUPPORTED; } -void llama_init_backend(bool numa) { +void llama_backend_init(bool numa) { ggml_time_init(); // needed to initialize f16 tables @@ -883,6 +890,16 @@ void llama_init_backend(bool numa) { if (numa) { ggml_numa_init(); } + +#ifdef GGML_USE_MPI + ggml_mpi_backend_init(); +#endif +} + +void llama_backend_free() { +#ifdef GGML_USE_MPI + ggml_mpi_backend_free(); +#endif } int64_t llama_time_us() { @@ -1284,13 +1301,17 @@ static bool llama_eval_internal( llama_context & lctx, const llama_token * tokens, const float * embd, - const int n_tokens, - const int n_past, + int n_tokens, + int n_past, int n_threads, const char * cgraph_fname) { LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); +#ifdef GGML_USE_MPI + ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); +#endif + const int64_t t_start_us = ggml_time_us(); const int N = n_tokens; @@ -1331,11 +1352,16 @@ static bool llama_eval_internal( struct ggml_tensor * inpL; if (tokens) { - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_set_name(embd, "embd"); - memcpy(embd->data, tokens, N*ggml_element_size(embd)); - inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + ggml_set_name(inp_tokens, "inp_tokens"); + + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); } @@ -1353,18 +1379,20 @@ static bool llama_eval_internal( offload_func_t offload_func_v = llama_nop; #ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers; - } + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers; + } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers; + } #endif // GGML_USE_CUBLAS for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); + offload_func_t offload_func = llama_nop; #ifdef 
GGML_USE_CUBLAS @@ -1571,7 +1599,6 @@ static bool llama_eval_internal( // input for next layer inpL = cur; - } lctx.use_buf(ctx0, 0); @@ -1579,7 +1606,6 @@ static bool llama_eval_internal( // used at the end to optionally extract the embeddings struct ggml_tensor * embeddings = NULL; - // norm { cur = ggml_rms_norm(ctx0, inpL); @@ -1594,7 +1620,6 @@ static bool llama_eval_internal( embeddings = cur; } - // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); ggml_set_name(cur, "result_output"); @@ -1607,6 +1632,10 @@ static bool llama_eval_internal( // run the computation ggml_build_forward_expand(&gf, cur); +#if GGML_USE_MPI + ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer); +#endif + #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); @@ -1635,6 +1664,15 @@ static bool llama_eval_internal( ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); #endif +#if GGML_USE_MPI + ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer); +#endif + + // update kv token count + lctx.kv_self.n = n_past + N; + + struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1]; + if (cgraph_fname) { ggml_graph_export(&gf, cgraph_fname); } @@ -1650,23 +1688,17 @@ static bool llama_eval_internal( // ggml_graph_dump_dot(&gf, NULL, "llama.dot"); //} - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N); - - // update kv token count - lctx.kv_self.n = n_past + N; - // extract logits { auto & logits_out = lctx.logits; if (lctx.logits_all) { logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N); + memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); } else { // return result for just the last token logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); } } @@ -2697,6 +2729,18 @@ struct llama_context * llama_new_context_with_model( } #endif +#ifdef GGML_USE_MPI + ctx->ctx_mpi = ggml_mpi_init(); + + if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { + // Enter a blocking eval loop with dummy input, letting rank=0 drive the process + const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos()); + while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; + llama_backend_free(); + exit(1); + } +#endif + return ctx; } diff --git a/llama.h b/llama.h index c1e7dab9f..686463aa2 100644 --- a/llama.h +++ b/llama.h @@ -158,7 +158,9 @@ extern "C" { // Initialize the llama + ggml backend // If numa is true, use NUMA optimizations // Call once at the start of the program - LLAMA_API void llama_init_backend(bool numa); + LLAMA_API void llama_backend_init(bool numa); + // Call once at the end of the program - currently only used for MPI + LLAMA_API void llama_backend_free(); LLAMA_API int64_t llama_time_us(); diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 20abe7100..87fde1645 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -31,6 +31,8 @@ int main(int argc, char **argv) { llama_model * model; llama_context * ctx; + llama_backend_init(false); + // load the vocab { auto lparams = llama_context_default_params(); @@ -97,5 +99,7 @@ int main(int argc, char **argv) { llama_free_model(model); llama_free(ctx); + llama_backend_free(); + return 0; } From bbef28218fe827265716b66977719b9ee2b21165 Mon Sep 17 00:00:00 2001 From: LostRuins 
<39025047+LostRuins@users.noreply.github.com> Date: Tue, 11 Jul 2023 22:01:08 +0800 Subject: [PATCH 18/24] Possible solution to allow K-quants on models with n_vocab!=32000 (#2148) * This allows LLAMA models that were previously incompatible with K quants to function mostly as normal. This happens when a model has a vocab != 32000, e.g 32001 which means it's not divisible by 256 or 64. Since the problematic dimensions only apply for `tok_embeddings.weight` and `output.weight` (dimentions 4096 x n_vocab), we can simply quantize these layers to Q8_0 whereas the majority of the hidden layers are still K-quanted since they have compatible dimensions. * Fix indentation Co-authored-by: Georgi Gerganov * As an alternative, to avoid failing on Metal due to lack of Q8_0 support, instead quantize tok_embeddings.weight to Q4_0 and retain output.weight as F16. This results in a net gain of about 55mb for a 7B model compared to previous approach, but should minimize adverse impact to model quality. --------- Co-authored-by: Georgi Gerganov --- llama.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index ad7283faf..08ec21ab6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2454,15 +2454,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS + bool convert_incompatible_tensor = false; if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K || quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) { int nx = tensor.ne.at(0); int ny = tensor.ne.at(1); if (nx % QK_K != 0 || ny % QK_K != 0) { - fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K); - fprintf(stderr, "This is required to be able to use k-quants for now!\n"); - fprintf(stderr, "========================================================================================\n\n"); - throw std::runtime_error("Unsupported tensor size encountered\n"); + fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); + convert_incompatible_tensor = true; } } if (tensor.name == "output.weight") { @@ -2490,6 +2489,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } + if (convert_incompatible_tensor) { + if (tensor.name == "output.weight") { + new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. + fprintf(stderr, "F16 will be used for this tensor instead.\n"); + } else if (tensor.name == "tok_embeddings.weight") { + new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. 
+ fprintf(stderr, "Q4_0 will be used for this tensor instead.\n"); + } else { + throw std::runtime_error("Unsupported tensor size encountered\n"); + } + } #endif float * f32_data; From 2347463201a9f4159ae95b737e1544dd300569c8 Mon Sep 17 00:00:00 2001 From: Howard Su Date: Tue, 11 Jul 2023 22:37:01 +0800 Subject: [PATCH 19/24] Support using mmap when applying LoRA (#2095) * Support using mmap when applying LoRA * Fix Linux * Update comment to reflect the support lora with mmap --- examples/common.cpp | 3 +-- examples/main/README.md | 2 +- examples/server/README.md | 2 +- examples/server/server.cpp | 3 +-- llama-util.h | 6 +++--- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 93159c6df..fad16887d 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -267,7 +267,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.lora_adapter = argv[i]; - params.use_mmap = false; } else if (arg == "--lora-base") { if (++i >= argc) { invalid_param = true; @@ -499,7 +498,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n"); - fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stderr, " --lora FNAME apply LoRA adapter\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); diff --git a/examples/main/README.md b/examples/main/README.md index 375386130..04b8d5404 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. -- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. 
diff --git a/examples/server/README.md b/examples/server/README.md index ad9b6bb08..3691abd74 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -16,7 +16,7 @@ Command line options: - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. -- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 296c5d646..4114343ff 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -632,7 +632,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stderr, " -a ALIAS, --alias ALIAS\n"); fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n"); - fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stderr, " --lora FNAME apply LoRA adapter\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); @@ -820,7 +820,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } params.lora_adapter = argv[i]; - params.use_mmap = false; } else if (arg == "--lora-base") { diff --git a/llama-util.h b/llama-util.h index 042ebe43c..43b6f05ad 100644 --- a/llama-util.h +++ b/llama-util.h @@ -175,13 +175,13 @@ struct llama_mmap { llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { size = file->size; int fd = fileno(file->fp); - int flags = MAP_SHARED; + int flags = MAP_PRIVATE; // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } #ifdef __linux__ if (prefetch) { flags |= MAP_POPULATE; } #endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0); if (addr == MAP_FAILED) { throw std::runtime_error(format("mmap failed: %s", strerror(errno))); } @@ -223,7 +223,7 @@ struct llama_mmap { throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); } - addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0); error = GetLastError(); CloseHandle(hMapping); From 917831c63a4138814d23da1917bf2b5d5b9faa6c Mon Sep 17 00:00:00 2001 From: Chad Brewbaker Date: Tue, 11 Jul 2023 11:03:06 
-0500 Subject: [PATCH 20/24] readme : fix zig build instructions (#2171) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 63457b6ae..476cc438b 100644 --- a/README.md +++ b/README.md @@ -239,7 +239,7 @@ In order to build llama.cpp you have three different options. - Using `Zig`: ```bash - zig build -Drelease-fast + zig build -Doptimize=ReleaseFast ``` ### Metal Build From 3ec7e596b2ba3f43c22f441254ca2bcfa91102ba Mon Sep 17 00:00:00 2001 From: Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com> Date: Wed, 12 Jul 2023 01:12:35 +0900 Subject: [PATCH 21/24] docker : add '--server' option (#2174) --- .devops/tools.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.devops/tools.sh b/.devops/tools.sh index 860a7e891..efdd6663c 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -26,6 +26,8 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then ./quantize "$i" "${i/f16/q4_0}" q4_0 fi done +elif [[ $arg1 == '--server' || $arg1 == '-s' ]]; then + ./server $arg2 else echo "Unknown command: $arg1" echo "Available commands: " @@ -37,4 +39,6 @@ else echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" + echo " --server (-s): Run a model on the server" + echo " ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080" fi From c9c74b4e3f9dcfab8b0032749ff8a579ab4e4d8d Mon Sep 17 00:00:00 2001 From: Bach Le Date: Wed, 12 Jul 2023 00:18:43 +0800 Subject: [PATCH 22/24] llama : add classifier-free guidance (#2135) * Initial implementation * Remove debug print * Restore signature of llama_init_from_gpt_params * Free guidance context * Make freeing of guidance_ctx conditional * Make Classifier-Free Guidance a sampling function * Correct typo. CFG already means context-free grammar. * Record sampling time in llama_sample_classifier_free_guidance * Shift all values by the max value before applying logsoftmax * Fix styling based on review --- examples/common.cpp | 30 +++++++++++++- examples/common.h | 7 ++++ examples/main/main.cpp | 88 ++++++++++++++++++++++++++++++++++++++++-- llama.cpp | 56 +++++++++++++++++++++++++++ llama.h | 12 ++++++ 5 files changed, 188 insertions(+), 5 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index fad16887d..fd551c9cb 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -236,6 +236,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.mirostat_tau = std::stof(argv[i]); + } else if (arg == "--cfg-negative-prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cfg_negative_prompt = argv[i]; + } else if (arg == "--cfg-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cfg_scale = std::stof(argv[i]); + } else if (arg == "--cfg-smooth-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cfg_smooth_factor = std::stof(argv[i]); } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; @@ -469,6 +487,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n"); fprintf(stderr, " i.e. 
`--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); + fprintf(stderr, " --cfg-negative-prompt PROMPT \n"); + fprintf(stderr, " negative prompt to use for guidance. (default: empty)\n"); + fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); + fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor); fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); fprintf(stderr, " --no-penalize-nl do not penalize newline token\n"); @@ -535,7 +557,7 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s return res; } -std::tuple llama_init_from_gpt_params(const gpt_params & params) { +struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto lparams = llama_context_default_params(); lparams.n_ctx = params.n_ctx; @@ -551,6 +573,12 @@ std::tuple llama_init_from_gpt_par lparams.logits_all = params.perplexity; lparams.embedding = params.embedding; + return lparams; +} + +std::tuple llama_init_from_gpt_params(const gpt_params & params) { + auto lparams = llama_context_params_from_gpt_params(params); + llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams); if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); diff --git a/examples/common.h b/examples/common.h index 96f2228f8..6315df961 100644 --- a/examples/common.h +++ b/examples/common.h @@ -48,6 +48,12 @@ struct gpt_params { float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate + // Classifier-Free Guidance + // https://arxiv.org/abs/2306.17806 + std::string cfg_negative_prompt; // string to help guidance + float cfg_scale = 1.f; // How strong is guidance + float cfg_smooth_factor = 1.f; // Smooth factor between old and new logits + std::string model = "models/7B/ggml-model.bin"; // model path std::string model_alias = "unknown"; // model alias std::string prompt = ""; @@ -99,6 +105,7 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s // std::tuple llama_init_from_gpt_params(const gpt_params & params); +struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); // // Console utils diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 07d8fc6ac..2248c2458 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -109,10 +109,16 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; + llama_context * ctx_guidance = NULL; g_ctx = &ctx; // load the model and apply lora adapter, if any std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (params.cfg_scale > 1.f) { + struct llama_context_params lparams = llama_context_params_from_gpt_params(params); + ctx_guidance = llama_new_context_with_model(model, lparams); + } + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; @@ -183,15 +189,28 @@ int main(int argc, char ** argv) { // tokenize the prompt std::vector embd_inp; - if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { - // Add a space in front of the first character to match OG 
llama tokenizer behavior - params.prompt.insert(0, 1, ' '); + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { embd_inp = ::llama_tokenize(ctx, params.prompt, true); } else { embd_inp = session_tokens; } + // Tokenize negative prompt + std::vector guidance_inp; + int guidance_offset = 0; + int original_prompt_len = 0; + if (ctx_guidance) { + params.cfg_negative_prompt.insert(0, 1, ' '); + guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true); + + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, true); + original_prompt_len = original_inp.size(); + guidance_offset = (int)guidance_inp.size() - original_prompt_len; + } + const int n_ctx = llama_n_ctx(ctx); if ((int) embd_inp.size() > n_ctx - 4) { @@ -258,6 +277,16 @@ int main(int argc, char ** argv) { for (int i = 0; i < (int) embd_inp.size(); i++) { fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); } + + if (ctx_guidance) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + for (int i = 0; i < (int) guidance_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i])); + } + } + if (params.n_keep > 0) { fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { @@ -334,11 +363,13 @@ int main(int argc, char ** argv) { int n_remain = params.n_predict; int n_consumed = 0; int n_session_consumed = 0; + int n_past_guidance = 0; // the first thing we will do is to output the prompt, so set color accordingly console_set_color(con_st, CONSOLE_COLOR_PROMPT); std::vector embd; + std::vector embd_guidance; // do one empty run to warm up the model { @@ -367,11 +398,12 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() > n_ctx) { + if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { const int n_left = n_past - params.n_keep; // always keep the first token - BOS n_past = std::max(1, params.n_keep); + n_past_guidance = std::max(1, params.n_keep + guidance_offset); // insert n_left/2 tokens at the start of embd from last_n_tokens embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); @@ -412,6 +444,48 @@ int main(int argc, char ** argv) { // evaluate tokens in batches // embd is typically prepared beforehand to fit within a batch, but not always + + if (ctx_guidance) { + int input_size = 0; + llama_token* input_buf = NULL; + + if (n_past_guidance < (int) guidance_inp.size()) { + // Guidance context should have the same data with these modifications: + // + // * Replace the initial prompt + // * Shift everything by guidance_offset + embd_guidance = guidance_inp; + if (embd.begin() + original_prompt_len < embd.end()) { + embd_guidance.insert( + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end() + ); + } + + input_buf = embd_guidance.data(); + input_size = embd_guidance.size(); + //fprintf(stderr, "\n---------------------\n"); + //for 
(int i = 0; i < (int) embd_guidance.size(); i++) { + //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i])); + //} + //fprintf(stderr, "\n---------------------\n"); + } else { + input_buf = embd.data(); + input_size = embd.size(); + } + + for (int i = 0; i < input_size; i += params.n_batch) { + int n_eval = std::min(input_size - i, params.n_batch); + if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + + n_past_guidance += n_eval; + } + } + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { int n_eval = (int) embd.size() - i; if (n_eval > params.n_batch) { @@ -431,6 +505,7 @@ int main(int argc, char ** argv) { } embd.clear(); + embd_guidance.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { // out of user input, sample next token @@ -473,6 +548,10 @@ int main(int argc, char ** argv) { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + if (ctx_guidance) { + llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor); + } + // Apply penalties float nl_logit = logits[llama_token_nl()]; auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); @@ -668,6 +747,7 @@ int main(int argc, char ** argv) { } llama_print_timings(ctx); + if (ctx_guidance) { llama_free(ctx_guidance); } llama_free(ctx); llama_free_model(model); diff --git a/llama.cpp b/llama.cpp index 08ec21ab6..2d09d6ce7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2167,6 +2167,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l } } +static void llama_log_softmax(float * array, size_t size) { + float max_l = *std::max_element(array, array + size); + float sum = 0.f; + for (size_t i = 0; i < size; ++i) { + float p = expf(array[i] - max_l); + sum += p; + array[i] = p; + } + + for (size_t i = 0; i < size; ++i) { + array[i] = logf(array[i] / sum); + } +} + +void llama_sample_classifier_free_guidance( + struct llama_context * ctx, + llama_token_data_array * candidates, + struct llama_context * guidance_ctx, + float scale, + float smooth_factor) { + int64_t t_start_sample_us = t_start_sample_us = ggml_time_us(); + + assert(ctx); + auto n_vocab = llama_n_vocab(ctx); + assert(n_vocab == (int)candidates->size); + assert(!candidates->sorted); + + std::vector logits_base; + logits_base.reserve(candidates->size); + for (size_t i = 0; i < candidates->size; ++i) { + logits_base.push_back(candidates->data[i].logit); + } + llama_log_softmax(logits_base.data(), candidates->size); + + float* logits_guidance = llama_get_logits(guidance_ctx); + llama_log_softmax(logits_guidance, n_vocab); + + for (int i = 0; i < n_vocab; ++i) { + float logit_guidance = logits_guidance[i]; + float logit_base = logits_base[i]; + logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance; + } + + llama_log_softmax(logits_guidance, n_vocab); + + for (int i = 0; i < n_vocab; ++i) { + float logit_base = logits_base[i]; + float logit_guidance = logits_guidance[i]; + + candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base; + } + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { assert(ctx); diff --git a/llama.h b/llama.h index 
686463aa2..4596b1ed4 100644 --- a/llama.h +++ b/llama.h @@ -309,6 +309,18 @@ extern "C" { /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); + /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 + /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. + /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. + /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. + /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits. + LLAMA_API void llama_sample_classifier_free_guidance( + struct llama_context * ctx, + llama_token_data_array * candidates, + struct llama_context * guidance_ctx, + float scale, + float smooth_factor); + /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); From 5bf2a2771886ee86137e01dbc7492f78fb392066 Mon Sep 17 00:00:00 2001 From: Spencer Sutton Date: Tue, 11 Jul 2023 12:31:10 -0400 Subject: [PATCH 23/24] ggml : remove src0 and src1 from ggml_tensor and rename opt to src (#2178) * Add ggml changes * Update train-text-from-scratch for change * mpi : adapt to new ggml_tensor->src --------- Co-authored-by: Georgi Gerganov --- .../train-text-from-scratch.cpp | 14 +- ggml-cuda.cu | 28 +- ggml-metal.m | 4 +- ggml-mpi.c | 8 +- ggml.c | 728 +++++++++--------- ggml.h | 10 +- 6 files changed, 371 insertions(+), 421 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index b96fdcdc4..afbb4a777 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1354,17 +1354,9 @@ struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { } } - if (t->src0) { - expand(g, t->src0); - } - - if (t->src1) { - expand(g, t->src1); - } - - for (int i = 0; i < GGML_MAX_OPT; ++i) { - if (t->opt[i]) { - expand(g, t->opt[i]); + for (int i = 0; i < GGML_MAX_SRC; ++i) { + if (t->src[i]) { + expand(g, t->src[i]); } } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index fd36f179b..1673e7e4c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3200,36 +3200,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo } // recursively assign CUDA buffers until a compute tensor is found - if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) { - const ggml_op src0_op = tensor->src0->op; + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { + const ggml_op src0_op = tensor->src[0]->op; if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == 
GGML_OP_VIEW) { - ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace); + ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace); } } - if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) { - ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace); + if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { + ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace); } tensor->backend = GGML_BACKEND_GPU; struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); - const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) || + const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || tensor->op == GGML_OP_VIEW || force_inplace; const size_t size = ggml_nbytes(tensor); CUDA_CHECK(cudaSetDevice(g_main_device)); - if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) { - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra; + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t offset = 0; if (tensor->op == GGML_OP_VIEW) { - memcpy(&offset, tensor->opt[0]->data, sizeof(size_t)); + memcpy(&offset, tensor->src[2]->data, sizeof(size_t)); } extra->data_device[g_main_device] = src0_ddc + offset; } else if (tensor->op == GGML_OP_CPY) { - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra; + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; void * src1_ddv = src1_extra->data_device[g_main_device]; extra->data_device[g_main_device] = src1_ddv; } else if (scratch) { @@ -3300,8 +3300,8 @@ void ggml_cuda_free_scratch() { bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ ggml_cuda_func_t func; const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) - || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); switch (tensor->op) { case GGML_OP_ADD: @@ -3329,7 +3329,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) { + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { return false; } func = ggml_cuda_mul_mat; @@ -3383,6 +3383,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return true; } - func(tensor->src0, tensor->src1, tensor); + func(tensor->src[0], tensor->src[1], tensor); return true; } diff --git a/ggml-metal.m b/ggml-metal.m index 6473644c2..d7a16936c 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -393,8 +393,8 @@ void ggml_metal_graph_compute( for (int i = 
node_start; i < node_end; ++i) { metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); - struct ggml_tensor * src0 = gf->nodes[i]->src0; - struct ggml_tensor * src1 = gf->nodes[i]->src1; + struct ggml_tensor * src0 = gf->nodes[i]->src[0]; + struct ggml_tensor * src1 = gf->nodes[i]->src[1]; struct ggml_tensor * dst = gf->nodes[i]; const int64_t ne00 = src0 ? src0->ne[0] : 0; diff --git a/ggml-mpi.c b/ggml-mpi.c index 872e808de..ae176d707 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -175,11 +175,11 @@ void ggml_mpi_graph_compute_pre( // attach the input data to all nodes that need it // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below) for (int i = idx_l0; i < idx_l1; i++) { - if (gf->nodes[i]->src0 == gf->nodes[idx_l0]) { - gf->nodes[i]->src0 = inp0; + if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) { + gf->nodes[i]->src[0] = inp0; } - if (gf->nodes[i]->src1 == gf->nodes[idx_l0]) { - gf->nodes[i]->src1 = inp0; + if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) { + gf->nodes[i]->src[1] = inp0; } } diff --git a/ggml.c b/ggml.c index c10877a76..8dc30a372 100644 --- a/ggml.c +++ b/ggml.c @@ -4584,9 +4584,7 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.op =*/ GGML_OP_NONE, /*.is_param =*/ false, /*.grad =*/ NULL, - /*.src0 =*/ NULL, - /*.src1 =*/ NULL, - /*.opt =*/ { NULL }, + /*.src =*/ { NULL }, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -5012,8 +5010,8 @@ struct ggml_tensor * ggml_dup_impl( result->op = GGML_OP_DUP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5049,8 +5047,8 @@ struct ggml_tensor * ggml_add_impl( result->op = GGML_OP_ADD; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5089,8 +5087,8 @@ struct ggml_tensor * ggml_add1_impl( result->op = GGML_OP_ADD1; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5147,9 +5145,9 @@ struct ggml_tensor * ggml_acc_impl( result->op = GGML_OP_ACC; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -5195,8 +5193,8 @@ struct ggml_tensor * ggml_sub_impl( result->op = GGML_OP_SUB; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5242,8 +5240,8 @@ struct ggml_tensor * ggml_mul_impl( result->op = GGML_OP_MUL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5285,8 +5283,8 @@ struct ggml_tensor * ggml_div_impl( result->op = GGML_OP_DIV; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5321,8 +5319,8 @@ struct ggml_tensor * ggml_sqr_impl( result->op = GGML_OP_SQR; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5355,8 +5353,8 @@ struct ggml_tensor * ggml_sqrt_impl( result->op = GGML_OP_SQRT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5390,8 +5388,8 @@ struct ggml_tensor * ggml_log_impl( result->op = GGML_OP_LOG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5423,8 +5421,8 @@ struct ggml_tensor * ggml_sum( result->op = GGML_OP_SUM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5450,8 +5448,8 @@ struct ggml_tensor * ggml_sum_rows( result->op = GGML_OP_SUM_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5473,8 +5471,8 @@ struct ggml_tensor * ggml_mean( result->op = GGML_OP_MEAN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5497,8 +5495,8 @@ struct ggml_tensor * ggml_argmax( result->op = GGML_OP_ARGMAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5525,8 +5523,8 @@ struct ggml_tensor * ggml_repeat( result->op = GGML_OP_REPEAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5553,8 +5551,8 @@ struct ggml_tensor * ggml_repeat_back( result->op = GGML_OP_REPEAT_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5575,8 +5573,8 @@ struct ggml_tensor * ggml_abs_impl( result->op = GGML_OP_ABS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5610,8 +5608,8 @@ struct ggml_tensor * ggml_sgn_impl( result->op = GGML_OP_SGN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5644,8 +5642,8 @@ struct ggml_tensor * ggml_neg_impl( result->op = GGML_OP_NEG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5678,8 +5676,8 @@ struct ggml_tensor * ggml_step_impl( result->op = GGML_OP_STEP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5712,8 +5710,8 @@ struct ggml_tensor * ggml_tanh_impl( result->op = GGML_OP_TANH; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5746,8 +5744,8 @@ struct ggml_tensor * ggml_elu_impl( result->op = GGML_OP_ELU; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5780,8 +5778,8 @@ struct ggml_tensor * ggml_relu_impl( result->op = GGML_OP_RELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5814,8 +5812,8 @@ struct ggml_tensor * ggml_gelu_impl( result->op = GGML_OP_GELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5848,8 +5846,8 @@ struct ggml_tensor * ggml_gelu_quick_impl( result->op = GGML_OP_GELU_QUICK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5882,8 +5880,8 @@ struct ggml_tensor * ggml_silu_impl( result->op = GGML_OP_SILU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5917,8 +5915,8 @@ struct ggml_tensor * ggml_silu_back( result->op = GGML_OP_SILU_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5940,8 +5938,8 @@ struct ggml_tensor * ggml_norm_impl( result->op = GGML_OP_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store epsilon here? + result->src[0] = a; + result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -5972,8 +5970,8 @@ struct ggml_tensor * ggml_rms_norm_impl( result->op = GGML_OP_RMS_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store epsilon here? + result->src[0] = a; + result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -6005,8 +6003,8 @@ struct ggml_tensor * ggml_rms_norm_back( result->op = GGML_OP_RMS_NORM_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6032,8 +6030,8 @@ struct ggml_tensor * ggml_mul_mat( result->op = GGML_OP_MUL_MAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6058,8 +6056,8 @@ struct ggml_tensor * ggml_out_prod( result->op = GGML_OP_OUT_PROD; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6084,8 +6082,8 @@ struct ggml_tensor * ggml_scale_impl( result->op = GGML_OP_SCALE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6140,9 +6138,9 @@ struct ggml_tensor * ggml_set_impl( result->op = GGML_OP_SET; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -6229,8 +6227,8 @@ struct ggml_tensor * ggml_cpy_impl( result->op = GGML_OP_CPY; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6266,8 +6264,8 @@ struct ggml_tensor * ggml_cont_impl( result->op = GGML_OP_CONT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6310,8 +6308,8 @@ struct ggml_tensor * ggml_reshape( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6335,8 +6333,8 @@ struct ggml_tensor * ggml_reshape_1d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6361,8 +6359,8 @@ struct ggml_tensor * ggml_reshape_2d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6388,8 +6386,8 @@ struct ggml_tensor * ggml_reshape_3d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6417,8 +6415,8 @@ struct ggml_tensor * ggml_reshape_4d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6450,9 +6448,9 @@ struct ggml_tensor * ggml_view_1d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6492,9 +6490,9 @@ struct ggml_tensor * ggml_view_2d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6536,9 +6534,9 @@ struct ggml_tensor * ggml_view_3d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6582,9 +6580,9 @@ struct ggml_tensor * ggml_view_4d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6644,8 +6642,8 @@ struct ggml_tensor * ggml_permute( result->op = GGML_OP_PERMUTE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; if (is_node) { ggml_scratch_save(ctx); @@ -6659,7 +6657,7 @@ struct ggml_tensor * ggml_permute( ggml_scratch_load(ctx); - result->opt[0] = b; + result->src[2] = b; } return result; @@ -6687,8 +6685,8 @@ struct ggml_tensor * ggml_transpose( result->op = GGML_OP_TRANSPOSE; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6713,8 +6711,8 @@ struct ggml_tensor * ggml_get_rows( result->op = GGML_OP_GET_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6741,9 +6739,9 @@ struct ggml_tensor * ggml_get_rows_back( result->op = GGML_OP_GET_ROWS_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -6765,8 +6763,8 @@ struct ggml_tensor * ggml_diag( result->op = GGML_OP_DIAG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6798,8 +6796,8 @@ struct ggml_tensor * ggml_diag_mask_inf_impl( result->op = GGML_OP_DIAG_MASK_INF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6846,8 +6844,8 @@ struct ggml_tensor * ggml_diag_mask_zero_impl( result->op = GGML_OP_DIAG_MASK_ZERO; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6882,8 +6880,8 @@ struct ggml_tensor * ggml_soft_max_impl( result->op = GGML_OP_SOFT_MAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6918,8 +6916,8 @@ struct ggml_tensor * ggml_soft_max_back_impl( result->op = GGML_OP_SOFT_MAX_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6970,8 +6968,8 @@ struct ggml_tensor * ggml_rope_impl( result->op = GGML_OP_ROPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -7028,8 +7026,8 @@ struct ggml_tensor * ggml_rope_back( result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -7067,8 +7065,8 @@ struct ggml_tensor * ggml_alibi( result->op = GGML_OP_ALIBI; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -7101,8 +7099,8 @@ struct ggml_tensor * ggml_clamp( result->op = GGML_OP_CLAMP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -7144,9 +7142,9 @@ GGML_API struct ggml_tensor * ggml_conv_1d( result->op = GGML_OP_CONV_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -7192,9 +7190,9 @@ struct ggml_tensor* ggml_conv_2d( result->op = GGML_OP_CONV_2D; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; @@ -7233,10 +7231,10 @@ struct ggml_tensor * ggml_flash_attn( result->op = GGML_OP_FLASH_ATTN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = q; - result->src1 = k; - result->opt[0] = v; - result->opt[1] = ggml_new_i32(ctx, masked ? 1 : 0); + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0); return result; } @@ -7264,11 +7262,11 @@ struct ggml_tensor * ggml_flash_ff( result->op = GGML_OP_FLASH_FF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b0; - result->opt[0] = b1; - result->opt[1] = c0; - result->opt[2] = c1; + result->src[0] = a; + result->src[1] = b0; + result->src[2] = b1; + result->src[3] = c0; + result->src[4] = c1; return result; } @@ -7328,11 +7326,11 @@ struct ggml_tensor * ggml_flash_attn_back( result->op = GGML_OP_FLASH_ATTN_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = q; - result->src1 = k; - result->opt[0] = v; - result->opt[1] = d; - result->opt[2] = ggml_new_i32(ctx, masked ? 1 : 0); + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = d; + result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0); return result; } @@ -7377,9 +7375,9 @@ struct ggml_tensor * ggml_win_part( result->op = GGML_OP_WIN_PART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = b; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = b; return result; } @@ -7414,9 +7412,9 @@ struct ggml_tensor * ggml_win_unpart( result->op = GGML_OP_WIN_UNPART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = b; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = b; return result; } @@ -7445,8 +7443,8 @@ struct ggml_tensor * ggml_map_unary_impl_f32( result->op = GGML_OP_MAP_UNARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[2] = addr_tensor; return result; } @@ -7492,9 +7490,9 @@ struct ggml_tensor * ggml_map_binary_impl_f32( result->op = GGML_OP_MAP_BINARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[1] = b; + result->src[2] = addr_tensor; return result; } @@ -7539,8 +7537,8 @@ struct ggml_tensor * ggml_map_custom1_impl_f32( result->op = GGML_OP_MAP_CUSTOM1; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[2] = addr_tensor; return result; } @@ -7584,9 +7582,9 @@ struct ggml_tensor * ggml_map_custom2_impl_f32( result->op = GGML_OP_MAP_CUSTOM2; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[1] = b; + result->src[2] = addr_tensor; return result; } @@ -7633,10 +7631,10 @@ struct ggml_tensor * ggml_map_custom3_impl_f32( result->op = GGML_OP_MAP_CUSTOM3; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; - result->opt[1] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = addr_tensor; + result->src[3] = c; return result; } @@ -7676,8 +7674,8 @@ struct ggml_tensor * ggml_cross_entropy_loss( result->op = GGML_OP_CROSS_ENTROPY_LOSS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -7696,9 +7694,9 @@ struct ggml_tensor * ggml_cross_entropy_loss_back( result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; result->grad = NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -14567,287 +14565,287 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); #endif // GGML_USE_CUBLAS switch (tensor->op) { case GGML_OP_DUP: { - ggml_compute_forward_dup(params, tensor->src0, tensor); + ggml_compute_forward_dup(params, tensor->src[0], tensor); } break; case GGML_OP_ADD: { - ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ADD1: { - ggml_compute_forward_add1(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ACC: { - ggml_compute_forward_acc(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_SUB: { - ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MUL: { - ggml_compute_forward_mul(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_DIV: { - ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SQR: { - ggml_compute_forward_sqr(params, tensor->src0, tensor); + ggml_compute_forward_sqr(params, tensor->src[0], tensor); } break; case GGML_OP_SQRT: { - ggml_compute_forward_sqrt(params, tensor->src0, tensor); + ggml_compute_forward_sqrt(params, tensor->src[0], tensor); } break; case GGML_OP_LOG: { - ggml_compute_forward_log(params, tensor->src0, tensor); + ggml_compute_forward_log(params, tensor->src[0], tensor); } break; case GGML_OP_SUM: { - ggml_compute_forward_sum(params, tensor->src0, tensor); + ggml_compute_forward_sum(params, tensor->src[0], tensor); } break; case GGML_OP_SUM_ROWS: { - ggml_compute_forward_sum_rows(params, tensor->src0, tensor); + ggml_compute_forward_sum_rows(params, tensor->src[0], tensor); } break; case GGML_OP_MEAN: { - ggml_compute_forward_mean(params, tensor->src0, tensor); + ggml_compute_forward_mean(params, tensor->src[0], tensor); } break; case GGML_OP_ARGMAX: { - ggml_compute_forward_argmax(params, tensor->src0, tensor); + 
ggml_compute_forward_argmax(params, tensor->src[0], tensor); } break; case GGML_OP_REPEAT: { - ggml_compute_forward_repeat(params, tensor->src0, tensor); + ggml_compute_forward_repeat(params, tensor->src[0], tensor); } break; case GGML_OP_REPEAT_BACK: { - ggml_compute_forward_repeat_back(params, tensor->src0, tensor); + ggml_compute_forward_repeat_back(params, tensor->src[0], tensor); } break; case GGML_OP_ABS: { - ggml_compute_forward_abs(params, tensor->src0, tensor); + ggml_compute_forward_abs(params, tensor->src[0], tensor); } break; case GGML_OP_SGN: { - ggml_compute_forward_sgn(params, tensor->src0, tensor); + ggml_compute_forward_sgn(params, tensor->src[0], tensor); } break; case GGML_OP_NEG: { - ggml_compute_forward_neg(params, tensor->src0, tensor); + ggml_compute_forward_neg(params, tensor->src[0], tensor); } break; case GGML_OP_STEP: { - ggml_compute_forward_step(params, tensor->src0, tensor); + ggml_compute_forward_step(params, tensor->src[0], tensor); } break; case GGML_OP_TANH: { - ggml_compute_forward_tanh(params, tensor->src0, tensor); + ggml_compute_forward_tanh(params, tensor->src[0], tensor); } break; case GGML_OP_ELU: { - ggml_compute_forward_elu(params, tensor->src0, tensor); + ggml_compute_forward_elu(params, tensor->src[0], tensor); } break; case GGML_OP_RELU: { - ggml_compute_forward_relu(params, tensor->src0, tensor); + ggml_compute_forward_relu(params, tensor->src[0], tensor); } break; case GGML_OP_GELU: { - ggml_compute_forward_gelu(params, tensor->src0, tensor); + ggml_compute_forward_gelu(params, tensor->src[0], tensor); } break; case GGML_OP_GELU_QUICK: { - ggml_compute_forward_gelu_quick(params, tensor->src0, tensor); + ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor); } break; case GGML_OP_SILU: { - ggml_compute_forward_silu(params, tensor->src0, tensor); + ggml_compute_forward_silu(params, tensor->src[0], tensor); } break; case GGML_OP_SILU_BACK: { - ggml_compute_forward_silu_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_NORM: { - ggml_compute_forward_norm(params, tensor->src0, tensor); + ggml_compute_forward_norm(params, tensor->src[0], tensor); } break; case GGML_OP_RMS_NORM: { - ggml_compute_forward_rms_norm(params, tensor->src0, tensor); + ggml_compute_forward_rms_norm(params, tensor->src[0], tensor); } break; case GGML_OP_RMS_NORM_BACK: { - ggml_compute_forward_rms_norm_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_OUT_PROD: { - ggml_compute_forward_out_prod(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SCALE: { - ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SET: { - ggml_compute_forward_set(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_CPY: { - ggml_compute_forward_cpy(params, tensor->src0, tensor); + ggml_compute_forward_cpy(params, tensor->src[0], tensor); } break; case 
GGML_OP_CONT: { - ggml_compute_forward_cont(params, tensor->src0, tensor); + ggml_compute_forward_cont(params, tensor->src[0], tensor); } break; case GGML_OP_RESHAPE: { - ggml_compute_forward_reshape(params, tensor->src0, tensor); + ggml_compute_forward_reshape(params, tensor->src[0], tensor); } break; case GGML_OP_VIEW: { - ggml_compute_forward_view(params, tensor->src0); + ggml_compute_forward_view(params, tensor->src[0]); } break; case GGML_OP_PERMUTE: { - ggml_compute_forward_permute(params, tensor->src0); + ggml_compute_forward_permute(params, tensor->src[0]); } break; case GGML_OP_TRANSPOSE: { - ggml_compute_forward_transpose(params, tensor->src0); + ggml_compute_forward_transpose(params, tensor->src[0]); } break; case GGML_OP_GET_ROWS: { - ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_GET_ROWS_BACK: { - ggml_compute_forward_get_rows_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_DIAG: { - ggml_compute_forward_diag(params, tensor->src0, tensor); + ggml_compute_forward_diag(params, tensor->src[0], tensor); } break; case GGML_OP_DIAG_MASK_INF: { - ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_DIAG_MASK_ZERO: { - ggml_compute_forward_diag_mask_zero(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SOFT_MAX: { - ggml_compute_forward_soft_max(params, tensor->src0, tensor); + ggml_compute_forward_soft_max(params, tensor->src[0], tensor); } break; case GGML_OP_SOFT_MAX_BACK: { - ggml_compute_forward_soft_max_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ROPE: { - ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ROPE_BACK: { - ggml_compute_forward_rope_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ALIBI: { - ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CLAMP: { - ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CONV_1D: { - ggml_compute_forward_conv_1d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_CONV_2D: { - ggml_compute_forward_conv_2d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_FLASH_ATTN: { - const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + const int32_t t = ggml_get_i32_1d(tensor->src[3], 0); GGML_ASSERT(t == 0 || t == 1); const bool masked = t != 0; - ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, 
tensor->opt[0], masked, tensor); + ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor); } break; case GGML_OP_FLASH_FF: { - ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); + ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); } break; case GGML_OP_FLASH_ATTN_BACK: { - int32_t t = ggml_get_i32_1d(tensor->opt[2], 0); + int32_t t = ggml_get_i32_1d(tensor->src[4], 0); GGML_ASSERT(t == 0 || t == 1); bool masked = t != 0; - ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor); + ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor); } break; case GGML_OP_WIN_PART: { - ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor); + ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor); } break; case GGML_OP_WIN_UNPART: { - ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor); + ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor); } break; case GGML_OP_MAP_UNARY: { - const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun); + const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); } break; case GGML_OP_MAP_BINARY: { - const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); + const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); } break; case GGML_OP_MAP_CUSTOM1: { - const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun); + const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun); } break; case GGML_OP_MAP_CUSTOM2: { - const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun); + const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun); } break; case GGML_OP_MAP_CUSTOM3: { - const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun); + const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun); } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - ggml_compute_forward_cross_entropy_loss_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + 
ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_NONE: @@ -14864,8 +14862,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm //////////////////////////////////////////////////////////////////////////////// static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { - struct ggml_tensor * src0 = tensor->src0; - struct ggml_tensor * src1 = tensor->src1; + struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; switch (tensor->op) { case GGML_OP_DUP: @@ -14901,12 +14899,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5); - GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; - const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; + GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5); + GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32); + const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0]; + const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1]; + const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2]; + const size_t offset = (( int32_t * ) tensor->src[2]->data)[3]; struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx, tensor->grad, @@ -15214,12 +15212,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_SET: { - GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5); - GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; - const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; + GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5); + GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32); + const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0]; + const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1]; + const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2]; + const size_t offset = (( int32_t * ) tensor->src[2]->data)[3]; struct ggml_tensor * tensor_grad_view = NULL; @@ -15296,8 +15294,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { size_t offset; - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->opt[0])); - memcpy(&offset, tensor->opt[0]->data, sizeof(offset)); + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2])); + memcpy(&offset, tensor->src[2]->data, sizeof(offset)); size_t nb1 = tensor->nb[1]; size_t nb2 = tensor->nb[2]; @@ -15324,7 +15322,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - int32_t * axes = (int32_t *) tensor->opt[0]->data; + int32_t * axes = (int32_t *) tensor->src[2]->data; int axis0 = axes[0] & 0x3; int axis1 = axes[1] & 0x3; int axis2 = axes[2] & 0x3; @@ -15487,15 +15485,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_FLASH_ATTN: { struct ggml_tensor * flash_grad = NULL; - if (src0->grad || src1->grad || tensor->opt[0]->grad) { - int32_t t = 
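
The hunks above all follow one mechanical rule: the separate `src0`, `src1`, and `opt[]` operand fields of `ggml_tensor` are folded into a single `src[]` array, and every former `opt[i]` argument moves up by two slots. A minimal sketch of the mapping; the `op_arg` helper is illustrative only and not part of the patch:

    #include "ggml.h"   // post-patch struct ggml_tensor, with src[GGML_MAX_SRC]

    // former field         new slot
    // tensor->src0    ->   tensor->src[0]
    // tensor->src1    ->   tensor->src[1]
    // tensor->opt[i]  ->   tensor->src[2 + i]   // e.g. opt[0] -> src[2], opt[1] -> src[3]

    // Hypothetical accessor for the old opt[] view of a node's operands.
    static struct ggml_tensor * op_arg(struct ggml_tensor * t, int old_opt_idx) {
        return t->src[2 + old_opt_idx];
    }
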
ggml_get_i32_1d(tensor->opt[1], 0); + if (src0->grad || src1->grad || tensor->src[2]->grad) { + int32_t t = ggml_get_i32_1d(tensor->src[3], 0); GGML_ASSERT(t == 0 || t == 1); bool masked = t != 0; flash_grad = ggml_flash_attn_back(ctx, src0, src1, - tensor->opt[0], + tensor->src[2], tensor->grad, masked); } @@ -15592,7 +15590,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor inplace); } - struct ggml_tensor * opt0 = tensor->opt[0]; + struct ggml_tensor * opt0 = tensor->src[2]; if (opt0->grad) { struct ggml_tensor * grad_v = NULL; @@ -15708,17 +15706,9 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } } - if (node->src0) { - ggml_visit_parents(cgraph, node->src0); - } - - if (node->src1) { - ggml_visit_parents(cgraph, node->src1); - } - - for (int i = 0; i < GGML_MAX_OPT; ++i) { - if (node->opt[i]) { - ggml_visit_parents(cgraph, node->opt[i]); + for (int i = 0; i < GGML_MAX_SRC; ++i) { + if (node->src[i]) { + ggml_visit_parents(cgraph, node->src[i]); } } @@ -16110,8 +16100,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks; + if (ggml_is_quantized(node->src[0]->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -16122,8 +16112,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks; + if (ggml_is_quantized(node->src[0]->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -16166,39 +16156,39 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { n_tasks = n_threads; // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); + //const int nr0 = ggml_nrows(node->src[0]); + //const int nr1 = ggml_nrows(node->src[1]); //n_tasks = MIN(n_threads, MAX(1, nr0/128)); //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); size_t cur = 0; - const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; + const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; #if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { + if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning } else #elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { + if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); + cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); } else #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning - if (node->src0->type != GGML_TYPE_F32) { + if (node->src[0]->type != GGML_TYPE_F32) { // here we need memory 
just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]); } } else #endif - if (node->src1->type != vec_dot_type) { - cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; + if (node->src[1]->type != vec_dot_type) { + cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type]; } else { cur = 0; } @@ -16242,24 +16232,24 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { { n_tasks = n_threads; - GGML_ASSERT(node->src0->ne[3] == 1); - GGML_ASSERT(node->src1->ne[2] == 1); - GGML_ASSERT(node->src1->ne[3] == 1); + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); size_t cur = 0; - const int nk = node->src0->ne[0]; + const int nk = node->src[0]->ne[0]; - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] ); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] ); } else { GGML_ASSERT(false); @@ -16271,16 +16261,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { { n_tasks = n_threads; - GGML_ASSERT(node->src1->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); - const int64_t ne00 = node->src0->ne[0]; // W - const int64_t ne01 = node->src0->ne[1]; // H - const int64_t ne02 = node->src0->ne[2]; // C - const int64_t ne03 = node->src0->ne[3]; // N + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // C + const int64_t ne03 = node->src[0]->ne[3]; // N - const int64_t ne10 = node->src1->ne[0]; // W - const int64_t ne11 = node->src1->ne[1]; // H - const int64_t ne12 = node->src1->ne[2]; // C + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // C const int64_t nk = ne00*ne01; @@ -16290,11 +16280,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)* (ne10*ne11*ne12); } else { GGML_ASSERT(false); @@ -16308,14 +16298,14 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t ne11 = 
ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); - if (node->src1->type == GGML_TYPE_F32) { + if (node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } - if (node->src1->type == GGML_TYPE_F16) { + if (node->src[1]->type == GGML_TYPE_F16) { cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } @@ -16328,14 +16318,14 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); @@ -16346,15 +16336,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; - const int64_t D = node->src0->ne[0]; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t D = node->src[0]->ne[0]; + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src1->type == GGML_TYPE_F32) { + if (node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } - if (node->src1->type == GGML_TYPE_F16) { + if (node->src[1]->type == GGML_TYPE_F16) { cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } @@ -16375,7 +16365,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { { n_tasks = n_threads; - size_t cur = ggml_type_size(node->type)*(n_tasks + node->src0->ne[0]*n_tasks); + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); work_size = MAX(work_size, cur); } break; @@ -16383,7 +16373,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { { n_tasks = n_threads; - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks; + size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks; work_size = MAX(work_size, cur); } break; @@ -16593,8 +16583,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { ggml_graph_export_leaf(cgraph->leafs[i], fout); GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); - GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); - GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL); + GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL); } // header @@ -16605,17 +16595,9 @@ void ggml_graph_export(const struct ggml_cgraph * 
cgraph, const char * fname) { for (int i = 0; i < cgraph->n_nodes; ++i) { ggml_graph_export_node(cgraph->nodes[i], "DST", fout); - if (cgraph->nodes[i]->src0) { - ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); - } - - if (cgraph->nodes[i]->src1) { - ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); - } - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - if (cgraph->nodes[i]->opt[j]) { - ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + for (int j = 0; j < GGML_MAX_SRC; ++j) { + if (cgraph->nodes[i]->src[j]) { + ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout); } } @@ -16706,16 +16688,13 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { // output the op arguments { - struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; - args[0] = tensor->src0; - args[1] = tensor->src1; - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - args[2 + j] = tensor->opt[j]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + args[j] = tensor->src[j]; } - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + for (int j = 0; j < GGML_MAX_SRC; ++j) { if (args[j]) { int32_t idx = -1; @@ -16933,12 +16912,12 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** const char * ptr_name = ptr; ptr += GGML_MAX_NAME; - const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t); + const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t); - struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; // parse args - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + for (int j = 0; j < GGML_MAX_SRC; ++j) { const int32_t arg_idx = ptr_arg_idx[j]; if (arg_idx == -1) { @@ -16995,11 +16974,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** tensor->nb[j] = nb[j]; } - tensor->src0 = args[0]; - tensor->src1 = args[1]; - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - tensor->opt[j] = args[2 + j]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + tensor->src[j] = args[j]; } result.nodes[i] = tensor; @@ -17198,19 +17174,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; - if (node->src0) { - ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x"); - } - - if (node->src1) { - ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y"); - } - - for (int j = 0; j < GGML_MAX_OPT; j++) { - if (node->opt[j]) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { char label[16]; - snprintf(label, sizeof(label), "opt %d", j); - ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label); + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label); } } } @@ -17218,19 +17186,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph for (int i = 0; i < gb->n_leafs; i++) { struct ggml_tensor * node = gb->leafs[i]; - if (node->src0) { - ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x"); - } - - if (node->src1) { - ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y"); - } - - for (int j = 0; j < GGML_MAX_OPT; j++) { - if (node->opt[j]) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { char label[16]; - snprintf(label, sizeof(label), "opt %d", j); - ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label); + 
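
One consequence worth noting for the export/import code above: the per-node argument index table keeps its size. The old writer reserved `2 + GGML_MAX_OPT` int32 slots per node and the new one reserves `GGML_MAX_SRC`; with the constants used here (old `GGML_MAX_OPT` = 4, new `GGML_MAX_SRC` = 6, see the ggml.h hunk below) both come to six slots. A compile-time check one could keep next to the reader, purely illustrative:

    #include "ggml.h"

    // Old serialized layout: 2 (src0, src1) + GGML_MAX_OPT (4) int32 arg indices per node.
    // New layout: GGML_MAX_SRC int32 arg indices per node. Both are six slots.
    _Static_assert(GGML_MAX_SRC == 2 + 4, "src slots cover src0 + src1 + the old opt[] slots");
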
snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label); } } } diff --git a/ggml.h b/ggml.h index ab84bef68..d7c9e0f0e 100644 --- a/ggml.h +++ b/ggml.h @@ -132,10 +132,10 @@ // { // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); // -// // a[1, 2] = 1.0f; +// // a[2, 1] = 1.0f; // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; // -// // a[2, 0] = 2.0f; +// // a[0, 2] = 2.0f; // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; // // ... @@ -197,7 +197,7 @@ #define GGML_MAX_NODES 4096 #define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 -#define GGML_MAX_OPT 4 +#define GGML_MAX_SRC 6 #define GGML_MAX_NAME 48 #define GGML_DEFAULT_N_THREADS 4 @@ -414,9 +414,7 @@ extern "C" { bool is_param; struct ggml_tensor * grad; - struct ggml_tensor * src0; - struct ggml_tensor * src1; - struct ggml_tensor * opt[GGML_MAX_OPT]; + struct ggml_tensor * src[GGML_MAX_SRC]; // performance int perf_runs; From 20d7740a9b45f6e5b247fa3738fdda35e18c2e8a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 11 Jul 2023 22:53:34 +0300 Subject: [PATCH 24/24] ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183) --- ggml-cuda.cu | 115 ++++++++++++++++++++++++++++++++++----------- ggml.c | 115 ++++++++++++++++++++++++++++----------------- ggml.h | 11 ++++- tests/test-grad0.c | 2 + tests/test-opt.c | 2 + 5 files changed, 173 insertions(+), 72 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 1673e7e4c..2fb30c6e6 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -239,13 +239,13 @@ struct ggml_tensor_extra_gpu { cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs }; -static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) { +static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; - if (i >= k) { + if (i >= kx) { return; } - dst[i] = x[i] + y[i]; + dst[i] = x[i] + y[i%ky]; } static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) { @@ -275,16 +275,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) { dst[i] = x[i] / (1.0f + expf(-x[i])); } +static __global__ void norm_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + const float eps = 1e-5f; + + float mean = 0.0f; + float var = 0.0f; + + for (int col = tid; col < ncols; col += WARP_SIZE) { + const float xi = x[row*ncols + col]; + mean += xi; + var += xi * xi; + } + + // sum up partial sums +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + mean += __shfl_xor_sync(0xffffffff, mean, mask, 32); + var += __shfl_xor_sync(0xffffffff, var, mask, 32); + } + + mean /= ncols; + var = var / ncols - mean * mean; + const float inv_var = rsqrtf(var + eps); + + for (int col = tid; col < ncols; col += WARP_SIZE) { + dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var; + } +} + static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) { const int row = blockIdx.x*blockDim.y + threadIdx.y; const int tid = threadIdx.x; - const float eps = 1e-6; + const float eps = 1e-6f; float tmp = 0.0f; // partial sum for thread in warp - for (int i = 0; i < ncols; i += WARP_SIZE) { - const int col = i + tid; + for (int col = tid; col < ncols; col += WARP_SIZE) { const float xi = x[row*ncols + col]; 
tmp += xi * xi; } @@ -296,10 +326,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol } const float mean = tmp / ncols; - const float scale = 1.0f / sqrtf(mean + eps); + const float scale = rsqrtf(mean + eps); - for (int i = 0; i < ncols; i += WARP_SIZE) { - const int col = i + tid; + for (int col = tid; col < ncols; col += WARP_SIZE) { dst[row*ncols + col] = scale * x[row*ncols + col]; } } @@ -1689,9 +1718,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale dst[i] = scale * x[i]; } -static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; - add_f32<<>>(x, y, dst, k); +static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { + const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f32<<>>(x, y, dst, kx, ky); } static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) { @@ -1709,6 +1738,12 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_ silu_f32<<>>(x, dst, k); } +static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + const dim3 block_dims(WARP_SIZE, 1, 1); + norm_f32<<>>(x, dst, ncols); +} + static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % WARP_SIZE == 0); const dim3 block_dims(WARP_SIZE, 1, 1); @@ -2239,14 +2274,16 @@ inline void ggml_cuda_op_add( GGML_ASSERT(src1_ddf_i != nullptr); GGML_ASSERT(dst_ddf_i != nullptr); - const int64_t ne0 = src0->ne[0]; + const int64_t ne00 = src0->ne[0]; const int64_t i01_diff = i01_high - i01_low; + const int64_t ne10 = src1->ne[0]; + // compute if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main); + add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10, cudaStream_main); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main); + add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main); } else { GGML_ASSERT(false); } @@ -2268,20 +2305,11 @@ inline void ggml_cuda_op_mul( GGML_ASSERT(dst_ddf_i != nullptr); const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - for (int64_t i01 = i01_low; i01 < i01_high; i01++) { - const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0 - - float * src0_ddf_i01 = src0_ddf_i + i01*ne00; - float * src1_ddf_i01 = src1_ddf_i + i11*ne10; - float * dst_ddf_i01 = dst_ddf_i + i01*ne00; - - // compute - mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main); - } + mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10, cudaStream_main); (void) dst; (void) src0_ddq_i; @@ -2310,6 +2338,28 @@ inline void ggml_cuda_op_silu( (void) i1; } +inline void ggml_cuda_op_norm( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t 
i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + inline void ggml_cuda_op_rms_norm( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, @@ -2930,6 +2980,11 @@ void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true); } +void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true); +} + void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); @@ -3160,7 +3215,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { } - cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice); + CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice)); extra->data_device[id] = buf; @@ -3322,6 +3377,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ } func = ggml_cuda_silu; break; + case GGML_OP_NORM: + if (!any_on_device) { + return false; + } + func = ggml_cuda_norm; + break; case GGML_OP_RMS_NORM: if (!any_on_device) { return false; diff --git a/ggml.c b/ggml.c index 8dc30a372..793ff7095 100644 --- a/ggml.c +++ b/ggml.c @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef GGML_USE_METAL #include @@ -49,23 +50,23 @@ typedef volatile LONG atomic_int; typedef atomic_int atomic_bool; -static void atomic_store(atomic_int* ptr, LONG val) { +static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); } -static LONG atomic_load(atomic_int* ptr) { +static LONG atomic_load(atomic_int * ptr) { return InterlockedCompareExchange(ptr, 0, 0); } -static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) { +static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { return InterlockedExchangeAdd(ptr, inc); } -static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { +static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } typedef HANDLE pthread_t; typedef DWORD thread_ret_t; -static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { +static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { (void) unused; HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); if (handle == NULL) @@ -77,7 +78,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void return 0; } -static int pthread_join(pthread_t thread, void* unused) { +static int pthread_join(pthread_t thread, void * unused) { (void) unused; return (int) WaitForSingleObject(thread, INFINITE); } @@ -90,7 +91,7 @@ static int sched_yield (void) { #include #include -typedef void* thread_ret_t; +typedef void * thread_ret_t; #include #include @@ -4723,7 +4724,7 @@ struct 
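
The CUDA additions above give `GGML_OP_NORM` its own kernel: each row is handled by one warp that accumulates sum(x) and sum(x*x), combines the partial sums with an XOR butterfly (`__shfl_xor_sync` over masks 16 down to 1), and then writes `(x - mean) * rsqrtf(var + eps)`. A scalar C reference of the same math, for comparison only (the function name is illustrative, not part of ggml):

    #include <math.h>

    // Single-threaded reference for what norm_f32 computes on one row.
    static void norm_row_ref(const float * x, float * dst, int ncols) {
        const float eps = 1e-5f;               // same eps as the kernel
        float mean = 0.0f, sumsq = 0.0f;
        for (int i = 0; i < ncols; i++) {      // one pass: sum and sum of squares
            mean  += x[i];
            sumsq += x[i]*x[i];
        }
        mean /= ncols;
        const float var = sumsq/ncols - mean*mean;   // E[x^2] - E[x]^2
        const float inv = 1.0f/sqrtf(var + eps);
        for (int i = 0; i < ncols; i++) {
            dst[i] = (x[i] - mean)*inv;
        }
    }
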
ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); } } break; case GGML_TYPE_F32: @@ -4775,7 +4776,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); } } break; case GGML_TYPE_F32: @@ -5035,11 +5036,15 @@ struct ggml_tensor * ggml_add_impl( struct ggml_tensor * a, struct ggml_tensor * b, bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); bool is_node = false; - if (a->grad || b->grad) { + if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); is_node = true; } @@ -8297,7 +8302,7 @@ static void ggml_compute_forward_add_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -8322,23 +8327,23 @@ static void ggml_compute_forward_add_f32( if (nb10 == sizeof(float)) { for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); #ifdef GGML_USE_ACCELERATE - vDSP_vadd( - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, - ne0); + vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); #else - ggml_vec_add_f32(ne0, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); #endif // } // } @@ -8346,15 +8351,20 @@ static void ggml_compute_forward_add_f32( } else { // src1 is not contiguous for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); 
+ + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i0 = 0; i0 < ne0; i0++) { - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; } @@ -11717,7 +11727,7 @@ static void ggml_compute_forward_alibi_f32( const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past - //const int ne2 = src0->ne[2]; // n_head -> this is k + const int ne2 = src0->ne[2]; // n_head -> this is k //const int ne3 = src0->ne[3]; // 1 -> bsz const int n = ggml_nrows(src0); @@ -11728,8 +11738,9 @@ static void ggml_compute_forward_alibi_f32( const int nb2 = src0->nb[2]; //const int nb3 = src0->nb[3]; - assert(nb0 == sizeof(float)); - assert(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(ne1 + n_past == ne0); + GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); @@ -11753,7 +11764,7 @@ static void ggml_compute_forward_alibi_f32( m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); } - pdst[0] = (i-ne0+1) * m_k + src[0]; + pdst[0] = i * m_k + src[0]; } } @@ -11782,7 +11793,7 @@ static void ggml_compute_forward_alibi_f16( const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past - //const int ne2 = src0->ne[2]; // n_head -> this is k + const int ne2 = src0->ne[2]; // n_head -> this is k //const int ne3 = src0->ne[3]; // 1 -> bsz const int n = ggml_nrows(src0); @@ -11793,8 +11804,9 @@ static void ggml_compute_forward_alibi_f16( const int nb2 = src0->nb[2]; //const int nb3 = src0->nb[3]; - assert(nb0 == sizeof(ggml_fp16_t)); - assert(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); @@ -11819,7 +11831,7 @@ static void ggml_compute_forward_alibi_f16( } // we return F32 - pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]); + pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]); } } } @@ -15944,6 +15956,9 @@ struct ggml_compute_state_shared { // synchronization primitives atomic_int n_active; // num active threads atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + void * abort_callback_data; }; struct ggml_compute_state { @@ -15975,6 +15990,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { int node_n = -1; while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->shared->node_n += 1; + return (thread_ret_t) GGML_EXIT_ABORTED; + } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again @@ -16028,6 +16047,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } else { break; } + + if 
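
In the `ggml_add_impl` / `ggml_compute_forward_add_f32` hunks above (and in the matching CUDA `add_f32` and `ggml_cuda_op_mul` changes), `src1` no longer has to match `src0`'s shape; it only has to be row-repeatable (`ggml_can_repeat_rows`). The broadcast is pure index arithmetic: the output row coordinates are wrapped into `src1`'s extents with a modulo. A small self-contained sketch of that mapping, using the `ne..` naming from ggml.c (the helper itself is illustrative):

    #include <stdint.h>

    // Map output row ir (over src0/dst, extents ne01 x ne02 x ne03) to the row of a
    // possibly smaller src1 (extents ne11 x ne12 x ne13) that gets repeated across it.
    static void broadcast_row(int64_t ir,
                              int64_t ne01, int64_t ne02,
                              int64_t ne11, int64_t ne12, int64_t ne13,
                              int64_t * i11, int64_t * i12, int64_t * i13) {
        const int64_t i03 = ir/(ne02*ne01);
        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
        const int64_t i01 =  ir - i03*ne02*ne01 - i02*ne01;

        *i13 = i03 % ne13;   // src1 repeats along dim 3
        *i12 = i02 % ne12;   // ... dim 2
        *i11 = i01 % ne11;   // ... dim 1 (rows)
    }
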
(cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + break; + } } atomic_store(&state->shared->n_active, n_threads); @@ -16061,7 +16084,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } } - return 0; + return GGML_EXIT_SUCCESS; } struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { @@ -16401,7 +16424,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { return cplan; } -void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { +int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { { GGML_ASSERT(cplan); GGML_ASSERT(cplan->n_threads > 0); @@ -16427,6 +16450,8 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) /*.n_threads =*/ n_threads, /*.n_active =*/ n_threads, /*.node_n =*/ -1, + /*.abort_callback =*/ NULL, + /*.abort_callback_data =*/ NULL, }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); @@ -16450,12 +16475,12 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) const int64_t perf_start_time_us = ggml_perf_time_us(); // this is a work thread too - ggml_graph_compute_thread(&workers[0]); + int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); // don't leave affinity set on the main thread clear_numa_thread_affinity(); - // join thread pool + // join or kill thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; j++) { const int rc = ggml_thread_join(workers[j].thrd, NULL); @@ -16479,6 +16504,8 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) (double) perf_time_us_cur / 1000.0, (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs); } + + return compute_status; } void ggml_graph_reset(struct ggml_cgraph * cgraph) { diff --git a/ggml.h b/ggml.h index d7c9e0f0e..8fe05d3a5 100644 --- a/ggml.h +++ b/ggml.h @@ -201,8 +201,13 @@ #define GGML_MAX_NAME 48 #define GGML_DEFAULT_N_THREADS 4 + +#define GGML_EXIT_SUCCESS 0 +#define GGML_EXIT_ABORTED 1 + #define GGML_UNUSED(x) (void)(x) + #define GGML_ASSERT(x) \ do { \ if (!(x)) { \ @@ -442,6 +447,10 @@ extern "C" { // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes int n_tasks[GGML_MAX_NODES]; + + // abort ggml_graph_compute when true + bool (*abort_callback)(void * data); + void * abort_callback_data; }; // computation graph @@ -1303,7 +1312,7 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // same as ggml_graph_compute() but the work data is allocated as a part of the context diff --git a/tests/test-grad0.c b/tests/test-grad0.c index da4001ce5..01467bc18 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -10,7 +10,9 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wdouble-promotion" +#endif #define MAX_NARGS 3 diff --git a/tests/test-opt.c b/tests/test-opt.c index e928a7df7..5531814c4 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -7,7 +7,9 @@ #define MAX_NARGS 2 +#if 
defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
 
 //
 // logging
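
The last patch also turns `ggml_graph_compute` into an `int`-returning call (`GGML_EXIT_SUCCESS` / `GGML_EXIT_ABORTED`) and adds an `abort_callback` / `abort_callback_data` pair to `struct ggml_cplan` that is polled between graph nodes. A minimal usage sketch under those assumptions; `should_stop` and `my_abort_cb` are illustrative names, not part of ggml:

    #include <stdbool.h>
    #include "ggml.h"

    static volatile bool should_stop = false;   // e.g. set from another thread or a signal handler

    static bool my_abort_cb(void * data) {
        (void) data;
        return should_stop;                     // returning true stops the graph between nodes
    }

    // given a built graph `gf`:
    // struct ggml_cplan plan = ggml_graph_plan(gf, GGML_DEFAULT_N_THREADS);
    // plan.abort_callback      = my_abort_cb;
    // plan.abort_callback_data = NULL;
    // // allocate plan.work_data if plan.work_size > 0, then:
    // if (ggml_graph_compute(gf, &plan) == GGML_EXIT_ABORTED) {
    //     // computation was stopped early by the callback
    // }
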