Merge branch 'master' into support-mamba-ssm

commit 5544f5211b
Author: Francis Couture-Harpin
Date:   2024-03-05 12:12:01 -05:00
36 changed files with 46612 additions and 47293 deletions

@ -1,5 +1,6 @@
{ {
lib, lib,
glibc,
config, config,
stdenv, stdenv,
mkShell, mkShell,
@ -30,6 +31,11 @@
useRocm ? config.rocmSupport, useRocm ? config.rocmSupport,
useVulkan ? false, useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
enableStatic ? effectiveStdenv.hostPlatform.isStatic
}@inputs: }@inputs:
let let
@ -41,10 +47,7 @@ let
versionOlder versionOlder
; ;
# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
stdenv = throw "Use effectiveStdenv instead"; stdenv = throw "Use effectiveStdenv instead";
effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
suffices = suffices =
lib.optionals useBlas [ "BLAS" ] lib.optionals useBlas [ "BLAS" ]
@ -167,6 +170,9 @@ effectiveStdenv.mkDerivation (
# TODO: Replace with autoAddDriverRunpath # TODO: Replace with autoAddDriverRunpath
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
cudaPackages.autoAddOpenGLRunpathHook cudaPackages.autoAddOpenGLRunpathHook
]
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
glibc.static
]; ];
buildInputs = buildInputs =
@ -181,7 +187,7 @@ effectiveStdenv.mkDerivation (
[ [
(cmakeBool "LLAMA_NATIVE" false) (cmakeBool "LLAMA_NATIVE" false)
(cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" true) (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas) (cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL) (cmakeBool "LLAMA_CLBLAST" useOpenCL)
@ -190,6 +196,7 @@ effectiveStdenv.mkDerivation (
(cmakeBool "LLAMA_METAL" useMetalKit) (cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi) (cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan) (cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "LLAMA_STATIC" enableStatic)
] ]
++ optionals useCuda [ ++ optionals useCuda [
( (

@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Recent API changes ### Recent API changes
- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
### Hot topics ### Hot topics

@ -19,7 +19,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
endif() endif()
endif() endif()
set(GIT_INDEX "${GIT_DIR}/index") if(EXISTS "${GIT_DIR}/index")
set(GIT_INDEX "${GIT_DIR}/index")
else()
message(WARNING "Git index not found in git repository.")
set(GIT_INDEX "")
endif()
else() else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
set(GIT_INDEX "") set(GIT_INDEX "")

@ -513,12 +513,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.n_sequences = std::stoi(argv[i]); params.n_sequences = std::stoi(argv[i]);
} else if (arg == "--p-accept" || arg == "-pa") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.p_accept = std::stof(argv[i]);
} else if (arg == "--p-split" || arg == "-ps") { } else if (arg == "--p-split" || arg == "-ps") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -1044,7 +1038,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split); printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
@ -1300,7 +1293,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
cparams.seed = params.seed; cparams.seed = params.seed;
cparams.logits_all = params.logits_all; cparams.logits_all = params.logits_all;
cparams.embedding = params.embedding; cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type; cparams.rope_scaling_type = params.rope_scaling_type;
cparams.rope_freq_base = params.rope_freq_base; cparams.rope_freq_base = params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale; cparams.rope_freq_scale = params.rope_freq_scale;
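The last hunk above follows the embeddings API update from PR #5796: llama_context_params_from_gpt_params now writes cparams.embeddings instead of cparams.embedding. A minimal sketch of enabling embedding output on a context built without gpt_params (assuming a llama_model * loaded elsewhere; llama_context_default_params() and llama_new_context_with_model() are the existing llama.h helpers):

    // sketch: request embedding output through the renamed field
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx      = 512;   // enough context for short embedding prompts
    cparams.embeddings = true;  // field renamed from `embedding` as part of the embeddings API update
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);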

@ -43,7 +43,7 @@ extern char const *LLAMA_BUILD_TARGET;
int32_t get_num_physical_cores(); int32_t get_num_physical_cores();
struct gpt_params { struct gpt_params {
uint32_t seed = -1; // RNG seed uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = get_num_physical_cores(); int32_t n_threads = get_num_physical_cores();
int32_t n_threads_draft = -1; int32_t n_threads_draft = -1;
@ -53,11 +53,10 @@ struct gpt_params {
int32_t n_ctx = 512; // context size int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 8; // number of tokens to draft during speculative decoding int32_t n_draft = 5; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode int32_t n_sequences = 1; // number of sequences to decode
float p_accept = 0.5f; // speculative decoding accept probability
float p_split = 0.1f; // speculative decoding split probability float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

@ -295,6 +295,77 @@ static llama_token llama_sampling_sample_impl(
return id; return id;
} }
static llama_token_data_array llama_sample_probability_distribution_impl(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
const llama_sampling_params & params = ctx_sampling->params;
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;
const bool penalize_nl = params.penalize_nl;
auto & prev = ctx_sampling->prev;
auto & cur = ctx_sampling->cur;
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
// Declare original_logits at the beginning of the function scope
std::vector<float> original_logits;
// apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
if (ctx_cfg) {
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}
cur.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
// apply penalties
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
if (penalty_tokens_used_size) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
llama_sample_repetition_penalties(ctx_main, &cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
}
// apply grammar checks
if (ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
}
llama_sample_softmax(ctx_main, &cur_p);
return cur_p;
}
llama_token llama_sampling_sample( llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling, struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main, struct llama_context * ctx_main,
@ -304,6 +375,14 @@ llama_token llama_sampling_sample(
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
} }
llama_token_data_array llama_sampling_probability_distribution(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx);
}
void llama_sampling_accept( void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling, struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main, struct llama_context * ctx_main,

@ -131,6 +131,13 @@ llama_token llama_sampling_sample(
struct llama_context * ctx_cfg, struct llama_context * ctx_cfg,
int idx = 0); int idx = 0);
// returns the probability that token of given id will be sampled
llama_token_data_array llama_sampling_probability_distribution(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = 0);
void llama_sampling_accept( void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling, struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main, struct llama_context * ctx_main,
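The llama_sampling_probability_distribution() declaration added above returns the softmax-normalized candidate array that the stochastic verification in examples/speculative consumes. A rough usage sketch (ctx_sampling and ctx_main assumed to be initialized as usual; `token` is a hypothetical token id of interest):

    // sketch: probability the target model assigns to a specific token at batch index 0
    llama_token_data_array dist = llama_sampling_probability_distribution(ctx_sampling, ctx_main, /*ctx_cfg=*/NULL, /*idx=*/0);
    float p = 0.0f;
    for (size_t i = 0; i < dist.size; ++i) {
        if (dist.data[i].id == token) {
            p = dist.data[i].p;
            break;
        }
    }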

@ -36,8 +36,10 @@ class SentencePieceTokenTypes(IntEnum):
UNUSED = 5 UNUSED = 5
BYTE = 6 BYTE = 6
AnyModel = TypeVar("AnyModel", bound="type[Model]") AnyModel = TypeVar("AnyModel", bound="type[Model]")
class Model(ABC): class Model(ABC):
_model_classes: dict[str, type[Model]] = {} _model_classes: dict[str, type[Model]] = {}
@ -187,6 +189,7 @@ class Model(ABC):
@classmethod @classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
assert names assert names
def func(modelcls: type[Model]): def func(modelcls: type[Model]):
for name in names: for name in names:
cls._model_classes[name] = modelcls cls._model_classes[name] = modelcls

@ -19,11 +19,11 @@ static std::vector<std::string> split_lines(const std::string & s) {
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) { static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
for (size_t i = 0; i < tokens.size(); i++) { for (size_t i = 0; i < tokens.size(); i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, false); llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
} }
} }
static void normalize(float * vec, float * out, int n) { static void normalize(const float * vec, float * out, int n) {
float norm = 0; float norm = 0;
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
norm += vec[i] * vec[i]; norm += vec[i] * vec[i];
@ -45,10 +45,23 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
} }
// normalize on copy // normalize on copy
for (int k = 0; k < n_seq; k++) { for (int i = 0; i < batch.n_tokens; i++) {
float * emb = llama_get_embeddings_ith(ctx, k); if (!batch.logits[i]) {
float * out = output + k * n_embd; continue;
normalize(emb, out, n_embd); }
// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
continue;
}
}
float * out = output + batch.seq_id[i][0] * n_embd;
normalize(embd, out, n_embd);
} }
} }
@ -132,7 +145,7 @@ int main(int argc, char ** argv) {
// initialize batch // initialize batch
const int n_prompts = prompts.size(); const int n_prompts = prompts.size();
struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts); struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
// allocate output // allocate output
const int n_embd = llama_n_embd(model); const int n_embd = llama_n_embd(model);
@ -145,6 +158,7 @@ int main(int argc, char ** argv) {
for (int k = 0; k < n_prompts; k++) { for (int k = 0; k < n_prompts; k++) {
// clamp to n_batch tokens // clamp to n_batch tokens
auto & inp = inputs[k]; auto & inp = inputs[k];
const uint64_t n_toks = inp.size(); const uint64_t n_toks = inp.size();
// encode if at capacity // encode if at capacity
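The reworked batch_decode() above prefers pooled per-sequence embeddings and only falls back to per-token embeddings when sequence pooling is unavailable, then L2-normalizes into the output buffer. The same fallback-and-normalize pattern in isolation (ctx, output and n_embd as set up in the example; `i_last` is a hypothetical batch index of the sequence's final token):

    // sketch: fetch and normalize the embedding for sequence id `s`
    const float * embd = llama_get_embeddings_seq(ctx, s);   // NULL when pooling_type is NONE
    if (embd == NULL) {
        embd = llama_get_embeddings_ith(ctx, i_last);
    }
    if (embd != NULL) {
        normalize(embd, output + s * n_embd, n_embd);        // normalize() as defined above
    }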

@ -511,6 +511,14 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd; std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance; std::vector<llama_token> embd_guidance;
// tokenized antiprompts
std::vector<std::vector<llama_token>> antiprompt_ids;
antiprompt_ids.reserve(params.antiprompt.size());
for (const std::string & antiprompt : params.antiprompt) {
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
}
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
while ((n_remain != 0 && !is_antiprompt) || params.interactive) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
@ -769,6 +777,18 @@ int main(int argc, char ** argv) {
} }
} }
// check for reverse prompt using special tokens
llama_token last_token = llama_sampling_last(ctx_sampling);
for (std::vector<llama_token> ids : antiprompt_ids) {
if (ids.size() == 1 && last_token == ids[0]) {
if (params.interactive) {
is_interacting = true;
}
is_antiprompt = true;
break;
}
}
if (is_antiprompt) { if (is_antiprompt) {
LOG("found antiprompt: %s\n", last_output.c_str()); LOG("found antiprompt: %s\n", last_output.c_str());
} }

examples/server-embd.py (new file, 34 lines)

@ -0,0 +1,34 @@
import asyncio
import requests
import numpy as np
n = 8
result = []
async def requests_post_async(*args, **kwargs):
return await asyncio.to_thread(requests.post, *args, **kwargs)
async def main():
model_url = "http://127.0.0.1:6900"
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
url= f"{model_url}/embedding",
json= {"content": str(i)*1024}
) for i in range(n)])
for response in responses:
embedding = response.json()["embedding"]
print(embedding[-8:])
result.append(embedding)
asyncio.run(main())
# compute cosine similarity
for i in range(n-1):
for j in range(i+1, n):
embedding1 = np.array(result[i])
embedding2 = np.array(result[j])
similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
print(f"Similarity between {i} and {j}: {similarity:.2f}")

@ -417,7 +417,7 @@ struct llama_server_context
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size()); int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
if (res < 0) { if (res < 0) {
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template sparams.chat_template = "chatml";
} }
} }
@ -1214,7 +1214,7 @@ struct llama_server_context
queue_results.send(res); queue_results.send(res);
} }
void send_embedding(server_slot &slot) void send_embedding(server_slot & slot, const llama_batch & batch)
{ {
task_result res; task_result res;
res.id = slot.task_id; res.id = slot.task_id;
@ -1223,6 +1223,7 @@ struct llama_server_context
res.stop = true; res.stop = true;
const int n_embd = llama_n_embd(model); const int n_embd = llama_n_embd(model);
if (!params.embedding) if (!params.embedding)
{ {
LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}}); LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
@ -1233,12 +1234,29 @@ struct llama_server_context
} }
else else
{ {
const float *data = llama_get_embeddings(ctx); for (int i = 0; i < batch.n_tokens; ++i) {
std::vector<float> embedding(data, data + n_embd); if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
res.result_json = json continue;
{ }
{"embedding", embedding},
}; const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
LOG_ERROR("failed to get embeddings for token", {{"token", batch.token[i]}, {"seq_id", batch.seq_id[i][0]}});
res.result_json = json
{
{"embedding", std::vector<float>(n_embd, 0.0f)},
};
continue;
}
}
res.result_json = json
{
{"embedding", std::vector<float>(embd, embd + n_embd)},
};
}
} }
queue_results.send(res); queue_results.send(res);
} }
@ -1900,7 +1918,7 @@ struct llama_server_context
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
{ {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
for (auto & slot : slots) for (auto & slot : slots)
{ {
@ -1973,7 +1991,7 @@ struct llama_server_context
// prompt evaluated for embedding // prompt evaluated for embedding
if (slot.embedding) if (slot.embedding)
{ {
send_embedding(slot); send_embedding(slot, batch_view);
slot.release(); slot.release();
slot.i_batch = -1; slot.i_batch = -1;
continue; continue;
@ -2055,6 +2073,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
printf(" --pooling {none,mean,cls}\n");
printf(" pooling type for embeddings, use model default if unspecified\n");
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
@ -2295,6 +2315,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
params.yarn_beta_slow = std::stof(argv[i]); params.yarn_beta_slow = std::stof(argv[i]);
} }
else if (arg == "--pooling")
{
if (++i >= argc) {
invalid_param = true;
break;
}
std::string value(argv[i]);
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
else { invalid_param = true; break; }
}
else if (arg == "--threads" || arg == "-t") else if (arg == "--threads" || arg == "-t")
{ {
if (++i >= argc) if (++i >= argc)
@ -2349,7 +2381,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break; break;
} }
params.n_batch = std::stoi(argv[i]); params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
} }
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
{ {

@ -6,3 +6,4 @@ More info:
- https://github.com/ggerganov/llama.cpp/pull/2926 - https://github.com/ggerganov/llama.cpp/pull/2926
- https://github.com/ggerganov/llama.cpp/pull/3624 - https://github.com/ggerganov/llama.cpp/pull/3624
- https://github.com/ggerganov/llama.cpp/pull/5625

@ -5,6 +5,7 @@
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <vector> #include <vector>
#include <set>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@ -18,6 +19,7 @@ struct seq_draft {
std::vector<int> i_batch_tgt; std::vector<int> i_batch_tgt;
std::vector<llama_token> tokens; std::vector<llama_token> tokens;
std::vector<std::vector<llama_token_data>> dists;
struct llama_sampling_context * ctx_sampling; struct llama_sampling_context * ctx_sampling;
}; };
@ -37,12 +39,15 @@ int main(int argc, char ** argv) {
// max number of parallel drafting sequences (i.e. tree branches) // max number of parallel drafting sequences (i.e. tree branches)
const int n_seq_dft = params.n_parallel; const int n_seq_dft = params.n_parallel;
// probability threshold for accepting a token from the draft model
const float p_accept = params.p_accept;
// probability threshold for splitting a draft branch (only for n_seq_dft > 1) // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
const float p_split = params.p_split; const float p_split = params.p_split;
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
std::default_random_engine rng(params.seed);
std::uniform_real_distribution<> u_dist;
#ifndef LOG_DISABLE_LOGS #ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("speculative", "log")); log_set_target(log_filename_generator("speculative", "log"));
LOG_TEE("Log start\n"); LOG_TEE("Log start\n");
@ -166,7 +171,9 @@ int main(int argc, char ** argv) {
std::vector<seq_draft> drafts(n_seq_dft); std::vector<seq_draft> drafts(n_seq_dft);
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model if (params.sparams.temp == 0) {
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
}
for (int s = 0; s < n_seq_dft; ++s) { for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].ctx_sampling = llama_sampling_init(params.sparams); drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
@ -182,12 +189,15 @@ int main(int argc, char ** argv) {
drafts[0].i_batch_tgt[0] = 0; drafts[0].i_batch_tgt[0] = 0;
while (true) { while (true) {
std::set<int> active_seqs = {};
// print current draft sequences // print current draft sequences
for (int s = 0; s < n_seq_dft; ++s) { for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) { if (!drafts[s].active) {
continue; continue;
} }
active_seqs.insert(s);
const auto & tokens = drafts[s].tokens; const auto & tokens = drafts[s].tokens;
LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str()); LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
@ -196,48 +206,156 @@ int main(int argc, char ** argv) {
int i_dft = 0; int i_dft = 0;
int s_keep = 0; int s_keep = 0;
llama_token token_id;
std::string token_str;
// loop until we fail to accept a drafted token or we run out of drafted tokens
while (true) { while (true) {
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
// sample from the target model
llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
if (!params.use_color) {
printf("%s", token_str.c_str());
}
if (id == llama_token_eos(model_tgt)) {
has_eos = true;
}
++n_predict;
// check if the target token matches any of the drafts // check if the target token matches any of the drafts
// for stochastic sampling, attempt to match the token with the drafted tokens
{ {
bool matches = false; bool accept = false;
if (params.sparams.temp > 0) {
// stochastic verification
for (int s = 0; s < n_seq_dft; ++s) { llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
if (!drafts[s].active) { float p_tgt = 0, p_dft = 0;
continue;
// GGML_ASSERT(dist_tgt.size() == dist_dft.size());
while (active_seqs.size() > 0) {
// randomly select a sequence to verify from active sequences
std::uniform_int_distribution<unsigned int> u_int_dist(0, active_seqs.size() - 1);
int s = *std::next(active_seqs.begin(), u_int_dist(rng));
if (i_dft >= (int) drafts[s].tokens.size()) {
drafts[s].active = false;
active_seqs.erase(s);
continue;
}
if (accept) {
// if we already accepted a token, we can skip the rest
if (drafts[s].tokens[i_dft] != drafts[s_keep].tokens[i_dft]) {
drafts[s].active = false;
active_seqs.erase(s);
}
continue;
}
LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
float r = u_dist(rng);
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
// acquire the token probabilities assigned by the draft and target models
for (size_t i = 0; i < dist_tgt.size; i++) {
if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
p_tgt = dist_tgt.data[i].p;
}
if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
p_dft = dist_dft.data[i].p;
}
if (p_tgt && p_dft) {
break;
}
}
LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
if (r <= p_tgt / p_dft) {
s_keep = s;
accept = true;
token_id = drafts[s].tokens[i_dft];
token_str = llama_token_to_piece(ctx_tgt, token_id);
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
break;
} else {
LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
drafts[s].active = false;
// calculate residual probability
GGML_ASSERT(dist_tgt.sorted);
GGML_ASSERT(dist_dft.sorted);
float sum_probs = 0.0f;
// sort dist by id
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
return a.id < b.id;
});
std::sort(dist_dft.data, dist_dft.data + dist_dft.size, [](const llama_token_data &a, const llama_token_data &b) {
return a.id < b.id;
});
for (size_t i = 0; i < dist_tgt.size; i++) {
dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
sum_probs += dist_tgt.data[i].p;
}
for (size_t i = 0; i < dist_tgt.size; i++) {
dist_tgt.data[i].p /= sum_probs;
}
// sort dist_tgt by p desc
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
return a.p > b.p;
});
}
active_seqs.erase(s);
for(int i = 0; i < n_seq_dft; i++) {
if (i == s) {
continue;
}
if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
// synchronize active status for sequences with the same drafted token
drafts[i].active = drafts[i].active && accept;
if (!drafts[i].active) {
active_seqs.erase(s);
}
}
}
} }
if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) { if (!accept) {
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); // all drafted tokens were rejected
// sample from the target model
LOG("all drafted tokens were rejected, sampling from residual distribution\n");
token_id = llama_sample_token(ctx_tgt, &dist_tgt);
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
token_str = llama_token_to_piece(ctx_tgt, token_id);
}
s_keep = s; } else {
matches = true; // greedy verification
} else {
drafts[s].active = false; // sample from the target model
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
token_str = llama_token_to_piece(ctx_tgt, token_id);
for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) {
continue;
}
if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
s_keep = s;
accept = true;
} else {
drafts[s].active = false;
}
} }
} }
if (matches) { if (token_id == llama_token_eos(model_tgt)) {
has_eos = true;
}
++n_predict;
if (accept) {
++n_accept; ++n_accept;
++n_past_tgt; ++n_past_tgt;
++n_past_dft; ++n_past_dft;
@ -245,17 +363,21 @@ int main(int argc, char ** argv) {
if (params.use_color) { if (params.use_color) {
// Color token according to its origin sequence // Color token according to its origin sequence
printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str()); printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
fflush(stdout); } else {
printf("%s", token_str.c_str());
} }
fflush(stdout);
continue; continue;
} else {
printf("%s", token_str.c_str());
fflush(stdout);
break;
} }
} }
if (params.use_color) { }
printf("%s", token_str.c_str());
}
fflush(stdout);
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); {
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
// TODO: simplify // TODO: simplify
{ {
@ -275,21 +397,21 @@ int main(int argc, char ** argv) {
drafts[s].active = false; drafts[s].active = false;
drafts[s].tokens.clear(); drafts[s].tokens.clear();
drafts[s].i_batch_tgt.clear(); drafts[s].i_batch_tgt.clear();
drafts[s].dists.clear();
} }
// note: will be erased after the speculation phase // note: will be erased after the speculation phase
drafts[0].tokens.push_back(id); drafts[0].tokens.push_back(token_id);
drafts[0].dists.push_back(std::vector<llama_token_data>());
drafts[0].i_batch_tgt.push_back(0); drafts[0].i_batch_tgt.push_back(0);
llama_batch_clear(batch_dft); llama_batch_clear(batch_dft);
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode (ctx_dft, batch_dft); llama_decode(ctx_dft, batch_dft);
++n_past_dft; ++n_past_dft;
break;
} }
if (n_predict > params.n_predict || has_eos) { if (n_predict > params.n_predict || has_eos) {
@ -334,12 +456,6 @@ int main(int argc, char ** argv) {
k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
} }
if (cur_p[0].p < p_accept) {
LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
drafts[s].drafting = false;
continue;
}
std::vector<int> sa(1, s); std::vector<int> sa(1, s);
// attempt to split the branch if the probability is high enough // attempt to split the branch if the probability is high enough
@ -367,6 +483,7 @@ int main(int argc, char ** argv) {
drafts[n_seq_cur].skip = true; drafts[n_seq_cur].skip = true;
drafts[n_seq_cur].tokens = drafts[s].tokens; drafts[n_seq_cur].tokens = drafts[s].tokens;
drafts[n_seq_cur].dists = drafts[s].dists;
drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft; drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
@ -389,6 +506,8 @@ int main(int argc, char ** argv) {
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
drafts[s].tokens.push_back(id); drafts[s].tokens.push_back(id);
// save cur_p.data into drafts[s].dists
drafts[s].dists.push_back(cur_p);
// add unique drafted tokens to the target batch // add unique drafted tokens to the target batch
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
@ -440,6 +559,7 @@ int main(int argc, char ** argv) {
} }
drafts[s].tokens.erase(drafts[s].tokens.begin()); drafts[s].tokens.erase(drafts[s].tokens.begin());
drafts[s].dists.erase(drafts[s].dists.begin());
} }
} }
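The verification loop added above implements the usual speculative-sampling acceptance rule: a drafted token x is kept with probability min(1, p_tgt(x) / p_dft(x)); when it is rejected, the target distribution is replaced by the normalized residual max(0, p_tgt - p_dft) before a replacement token is sampled. A stripped-down sketch of just that rule, independent of the sequence bookkeeping in the example:

    // sketch: speculative-sampling acceptance test for one drafted token
    // p_tgt, p_dft: probabilities of the drafted token under the target and draft models
    // u:            uniform random number in [0, 1)
    static bool accept_drafted(float p_tgt, float p_dft, float u) {
        return u <= p_tgt / p_dft;   // accepts with probability min(1, p_tgt / p_dft)
    }
    // on rejection, resample from the residual distribution:
    //   p_res(x) = max(0, p_tgt(x) - p_dft(x)) / sum_y max(0, p_tgt(y) - p_dft(y))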

@ -91,13 +91,14 @@ extern "C" {
// (optional) complete all pending operations // (optional) complete all pending operations
void (*GGML_CALL synchronize)(ggml_backend_t backend); void (*GGML_CALL synchronize)(ggml_backend_t backend);
// compute graph with a plan // create a plan for ggml_cgraph and free it
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph with a plan
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph without a plan (async) // compute graph without a plan (async)
bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// check if the backend supports an operation // check if the backend supports an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

@ -262,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
backend->iface.graph_plan_free(backend, plan); backend->iface.graph_plan_free(backend, plan);
} }
void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
backend->iface.graph_plan_compute(backend, plan); return backend->iface.graph_plan_compute(backend, plan);
} }
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return backend->iface.graph_compute(backend, cgraph); return backend->iface.graph_compute(backend, cgraph);
} }
@ -732,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }
GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }
GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@ -755,8 +755,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data; cplan.abort_callback_data = cpu_ctx->abort_callback_data;
ggml_graph_compute(cgraph, &cplan); return ggml_graph_compute(cgraph, &cplan);
return true;
} }
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@ -1437,7 +1436,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
return true; return true;
} }
static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
uint64_t copy_us[GGML_MAX_BACKENDS] = {0}; uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
uint64_t compute_us[GGML_MAX_BACKENDS] = {0}; uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
@ -1472,8 +1471,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
uint64_t compute_start_us = ggml_time_us(); uint64_t compute_start_us = ggml_time_us();
if (!sched->callback_eval) { if (!sched->callback_eval) {
if (!ggml_backend_graph_compute(split_backend, &split->graph)) { enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
return false; if (ec != GGML_STATUS_SUCCESS) {
return ec;
} }
//ggml_backend_synchronize(split_backend); // necessary to measure compute time //ggml_backend_synchronize(split_backend); // necessary to measure compute time
} else { } else {
@ -1494,8 +1494,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
if (!ggml_backend_graph_compute(split_backend, &gv)) { enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
return false; if (ec != GGML_STATUS_SUCCESS) {
return ec;
} }
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
@ -1519,7 +1520,7 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
} }
#endif #endif
return true; return GGML_STATUS_SUCCESS;
} }
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) { ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@ -1581,7 +1582,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return true; return true;
} }
bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
if (!sched->is_reset) { if (!sched->is_reset) {
@ -1590,14 +1591,10 @@ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg
ggml_backend_sched_split_graph(sched, graph); ggml_backend_sched_split_graph(sched, graph);
if (!ggml_backend_sched_alloc_splits(sched)) { if (!ggml_backend_sched_alloc_splits(sched)) {
return false; return GGML_STATUS_ALLOC_FAILED;
} }
if (!ggml_backend_sched_compute_splits(sched)) { return ggml_backend_sched_compute_splits(sched);
return false;
}
return true;
} }
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
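Since graph_compute now reports enum ggml_status instead of bool, callers should compare against GGML_STATUS_SUCCESS rather than testing truthiness. A minimal caller-side sketch (sched and graph assumed to be already constructed):

    // sketch: checking the new status return of the scheduler compute call
    enum ggml_status status = ggml_backend_sched_graph_compute(sched, graph);
    if (status != GGML_STATUS_SUCCESS) {
        // e.g. GGML_STATUS_ALLOC_FAILED when split allocation fails (see the hunk above)
        fprintf(stderr, "graph compute failed with status %d\n", (int) status);
    }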

@ -66,12 +66,13 @@ extern "C" {
GGML_API void ggml_backend_synchronize(ggml_backend_t backend); GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends // tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
@ -157,26 +158,26 @@ extern "C" {
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler // Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph // Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
// Get the number of splits of the last graph // Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler // Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
// Reset all assignments and allocators - must be called before changing the node backends // Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute // Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
// //
// Utils // Utils

@ -616,6 +616,8 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q
#define CUDA_UPSCALE_BLOCK_SIZE 256 #define CUDA_UPSCALE_BLOCK_SIZE 256
#define CUDA_CONCAT_BLOCK_SIZE 256 #define CUDA_CONCAT_BLOCK_SIZE 256
#define CUDA_PAD_BLOCK_SIZE 256 #define CUDA_PAD_BLOCK_SIZE 256
#define CUDA_ARANGE_BLOCK_SIZE 256
#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
#define CUDA_ACC_BLOCK_SIZE 256 #define CUDA_ACC_BLOCK_SIZE 256
#define CUDA_IM2COL_BLOCK_SIZE 256 #define CUDA_IM2COL_BLOCK_SIZE 256
#define CUDA_POOL2D_BLOCK_SIZE 256 #define CUDA_POOL2D_BLOCK_SIZE 256
@ -990,17 +992,21 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst,
nidx + nidx +
blockIdx.y * ne0 + blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y; blockIdx.z * ne0 * gridDim.y;
dst[offset_dst] = x[offset_src]; dst[offset_dst] = x[offset_src];
} else { } else {
int offset_src = int offset_src =
nidx + nidx +
blockIdx.y * ne0 + blockIdx.y * ne0 +
(blockIdx.z - ne02) * ne0 * gridDim.y; (blockIdx.z - ne02) * ne0 * gridDim.y;
dst[offset_dst] = y[offset_src]; dst[offset_dst] = y[offset_src];
} }
} }
static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) { static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
// blockIdx.z: idx of ne02*ne03
// blockIdx.y: idx of ne01*scale_factor aka ne1
// blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
// ne00xne01: ne00 * ne01
int ne0 = ne00 * scale_factor; int ne0 = ne00 * scale_factor;
int nidx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) { if (nidx >= ne0) {
@ -1012,7 +1018,7 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
int offset_src = int offset_src =
i00 + i00 +
i01 * ne00 + i01 * ne00 +
blockIdx.z * nb02; blockIdx.z * ne00xne01;
int offset_dst = int offset_dst =
nidx + nidx +
blockIdx.y * ne0 + blockIdx.y * ne0 +
@ -1020,7 +1026,10 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
dst[offset_dst] = x[offset_src]; dst[offset_dst] = x[offset_src];
} }
static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) { static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
// blockIdx.z: idx of ne2*ne3, aka ne02*ne03
// blockIdx.y: idx of ne1
// blockIDx.x: idx of ne0 / BLOCK_SIZE
int nidx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) { if (nidx >= ne0) {
return; return;
@ -1031,19 +1040,53 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
nidx + nidx +
blockIdx.y * ne0 + blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y; blockIdx.z * ne0 * gridDim.y;
if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) { if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
int offset_src = int offset_src =
nidx + nidx +
blockIdx.y * ne00 + blockIdx.y * ne00 +
blockIdx.z * ne00 * ne01; blockIdx.z * ne00 * ne01;
dst[offset_dst] = x[offset_src]; dst[offset_dst] = x[offset_src];
} else { } else {
dst[offset_dst] = 0.0f; dst[offset_dst] = 0.0f;
} }
} }
static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
// blockIDx.x: idx of ne0 / BLOCK_SIZE
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
dst[nidx] = start + step * nidx;
}
static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
// blockIDx.y: idx of timesteps->ne[0]
// blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
int i = blockIdx.y;
int j = threadIdx.x + blockIdx.x * blockDim.x;
float * embed_data = (float *)((char *)dst + i*nb1);
if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
embed_data[dim] = 0.f;
}
int half = dim / 2;
if (j >= half) {
return;
}
float timestep = timesteps[i];
float freq = (float)expf(-logf(max_period) * j / half);
float arg = timestep * freq;
embed_data[j] = cosf(arg);
embed_data[j + half] = sinf(arg);
}
template <int block_size> template <int block_size>
static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) { static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
// blockIdx.x: num_groups idx
// threadIdx.x: block_size idx
int start = blockIdx.x * group_size; int start = blockIdx.x * group_size;
int end = start + group_size; int end = start + group_size;
@ -6448,7 +6491,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13) { const int nb12, const int nb13) {
const int i = blockDim.x*blockIdx.x + threadIdx.x; const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) { if (i >= ne) {
return; return;
@ -6456,17 +6499,17 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
// determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
// then combine those indices with the corresponding byte offsets to get the total offsets // then combine those indices with the corresponding byte offsets to get the total offsets
const int i03 = i/(ne00 * ne01 * ne02); const int64_t i03 = i/(ne00 * ne01 * ne02);
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
const int i13 = i/(ne10 * ne11 * ne12); const int64_t i13 = i/(ne10 * ne11 * ne12);
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
cpy_1(cx + x_offset, cdst + dst_offset); cpy_1(cx + x_offset, cdst + dst_offset);
} }
@ -6956,23 +6999,23 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
template <typename T> template <typename T>
static __global__ void im2col_kernel( static __global__ void im2col_kernel(
const float * x, T * dst, int batch_offset, const float * x, T * dst, int64_t batch_offset,
int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW, int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
int s0, int s1, int p0, int p1, int d0, int d1) { int s0, int s1, int p0, int p1, int d0, int d1) {
const int i = threadIdx.x + blockIdx.x * blockDim.x; const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= pelements) { if (i >= pelements) {
return; return;
} }
const int ksize = OW * (KH > 1 ? KW : 1); const int64_t ksize = OW * (KH > 1 ? KW : 1);
const int kx = i / ksize; const int64_t kx = i / ksize;
const int kd = kx * ksize; const int64_t kd = kx * ksize;
const int ky = (i - kd) / OW; const int64_t ky = (i - kd) / OW;
const int ix = i % OW; const int64_t ix = i % OW;
const int oh = blockIdx.y; const int64_t oh = blockIdx.y;
const int batch = blockIdx.z / IC; const int64_t batch = blockIdx.z / IC;
const int ic = blockIdx.z % IC; const int64_t ic = blockIdx.z % IC;
const int64_t iiw = ix * s0 + kx * d0 - p0; const int64_t iiw = ix * s0 + kx * d0 - p0;
const int64_t iih = oh * s1 + ky * d1 - p1; const int64_t iih = oh * s1 + ky * d1 - p1;
@ -7298,19 +7341,33 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, const
concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02); concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
} }
static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) { static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
const int scale_factor, cudaStream_t stream) {
int ne0 = (ne00 * scale_factor); int ne0 = (ne00 * scale_factor);
int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02); dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor); upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
} }
static void pad_f32_cuda(const float * x, float * dst, static void pad_f32_cuda(const float * x, float * dst,
const int ne00, const int ne01, const int ne02, const int ne00, const int ne01, const int ne02, const int ne03,
const int ne0, const int ne1, const int ne2, cudaStream_t stream) { const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne1, ne2); dim3 gridDim(num_blocks, ne1, ne2*ne3);
pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02); pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
}
static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
}
static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
const int dim, const int max_period, cudaStream_t stream) {
int half_ceil = (dim + 1) / 2;
int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne00, 1);
timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
} }
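Both new launchers size a 1D grid with the usual round-up division: `ne0` work items for arange, and `(dim + 1) / 2` columns per timestep row for the embedding. A plain-C sketch of that arithmetic; the block size below is an illustrative stand-in for the real `CUDA_ARANGE_BLOCK_SIZE` / `CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE` constants:

```c
#include <stdio.h>

#define BLOCK_SIZE 256   // illustrative only

// round-up division used to compute the number of blocks
static int num_blocks(int n) {
    return (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
}

int main(void) {
    const int ne0 = 1000;                  // arange length
    const int dim = 321;                   // embedding dimension (odd on purpose)
    const int half_ceil = (dim + 1) / 2;   // rounded-up half, as in the launcher

    printf("arange: %d block(s) of %d threads\n", num_blocks(ne0), BLOCK_SIZE);
    printf("timestep embedding: %d block(s) per row, half_ceil = %d\n",
           num_blocks(half_ceil), half_ceil);
    return 0;
}
```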
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
@ -8443,8 +8500,8 @@ static void soft_max_f32_cuda(const float * x, const float * mask, const float *
template <typename T> template <typename T>
static void im2col_cuda(const float* x, T* dst, static void im2col_cuda(const float* x, T* dst,
int IW, int IH, int OW, int OH, int KW, int KH, int IC, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
int batch, int batch_offset, int offset_delta, int64_t batch, int64_t batch_offset, int64_t offset_delta,
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
const int parallel_elements = OW * KW * KH; const int parallel_elements = OW * KW * KH;
const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE; const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
@ -9123,7 +9180,7 @@ static void ggml_cuda_op_group_norm(
int num_groups = dst->op_params[0]; int num_groups = dst->op_params[0];
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream); group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);
(void) src1; (void) src1;
(void) dst; (void) dst;
@ -9156,7 +9213,7 @@ static void ggml_cuda_op_upscale(
const int scale_factor = dst->op_params[0]; const int scale_factor = dst->op_params[0];
upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream); upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);
(void) src1; (void) src1;
(void) dst; (void) dst;
@ -9172,8 +9229,49 @@ static void ggml_cuda_op_pad(
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
pad_f32_cuda(src0_dd, dst_dd, pad_f32_cuda(src0_dd, dst_dd,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], main_stream); dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);
(void) src1;
(void) dst;
(void) src1_dd;
}
static void ggml_cuda_op_arange(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(dst->type == GGML_TYPE_F32);
float start;
float stop;
float step;
memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
int64_t steps = (int64_t)ceil((stop - start) / step);
GGML_ASSERT(ggml_nelements(dst) == steps);
arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
(void) src0;
(void) src1;
(void) src0_dd;
(void) src1_dd;
}
static void ggml_cuda_op_timestep_embedding(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int dim = dst->op_params[0];
const int max_period = dst->op_params[1];
timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
(void) src1; (void) src1;
(void) dst; (void) dst;
@ -10458,6 +10556,45 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
} }
static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
// dd = data device
float * src0_ddf = nullptr;
float * src1_ddf = nullptr;
float * dst_ddf = nullptr;
cuda_pool_alloc<float> dst_f;
ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
if (dst_on_device) {
dst_ddf = (float *) dst_extra->data_device[g_main_device];
} else {
dst_ddf = dst_f.alloc(ggml_nelements(dst));
}
// do the computation
ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
CUDA_CHECK(cudaGetLastError());
// copy dst to host if necessary
if (!dst_on_device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
}
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
CUDA_CHECK(cudaDeviceSynchronize());
}
}
static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
}
static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
} }
@ -11358,6 +11495,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
case GGML_OP_PAD: case GGML_OP_PAD:
func = ggml_cuda_pad; func = ggml_cuda_pad;
break; break;
case GGML_OP_ARANGE:
func = ggml_cuda_arange;
break;
case GGML_OP_TIMESTEP_EMBEDDING:
func = ggml_cuda_timestep_embedding;
break;
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
func = ggml_cuda_leaky_relu; func = ggml_cuda_leaky_relu;
break; break;
@ -12098,7 +12241,7 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
UNUSED(backend); UNUSED(backend);
} }
GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_main_device(cuda_ctx->device); ggml_cuda_set_main_device(cuda_ctx->device);
@ -12134,7 +12277,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
GGML_ASSERT(ok); GGML_ASSERT(ok);
} }
return true; return GGML_STATUS_SUCCESS;
} }
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
@ -12253,6 +12396,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_GROUP_NORM: case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
return true; return true;
default: default:

View file

@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
return ggml_backend_kompute_buffer_type(ctx->device); return ggml_backend_kompute_buffer_type(ctx->device);
} }
static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
auto * ctx = static_cast<ggml_kompute_context *>(backend->context); auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
ggml_vk_graph_compute(ctx, cgraph); ggml_vk_graph_compute(ctx, cgraph);
return true; return GGML_STATUS_SUCCESS;
} }
static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {

View file

@ -163,6 +163,8 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_IM2COL_F32, GGML_METAL_KERNEL_TYPE_IM2COL_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
@ -569,6 +571,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
@ -697,6 +701,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
return false; return false;
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
return true; return true;
@ -742,7 +748,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
} }
} }
static bool ggml_metal_graph_compute( static enum ggml_status ggml_metal_graph_compute(
struct ggml_metal_context * ctx, struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) { struct ggml_cgraph * gf) {
@ -1091,7 +1097,8 @@ static bool ggml_metal_graph_compute(
{ {
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src0));
const float scale = *(const float *) dst->op_params; float scale;
memcpy(&scale, dst->op_params, sizeof(scale));
int64_t n = ggml_nelements(dst); int64_t n = ggml_nelements(dst);
@ -1250,11 +1257,15 @@ static bool ggml_metal_graph_compute(
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
} }
const float scale = ((float *) dst->op_params)[0]; float scale;
const float max_bias = ((float *) dst->op_params)[1]; float max_bias;
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
const int64_t nrows_x = ggml_nrows(src0); const int64_t nrows_x = ggml_nrows(src0);
const int64_t nrows_y = src0->ne[1]; const int64_t nrows_y = src0->ne[1];
const uint32_t n_head_kv = nrows_x/nrows_y; const uint32_t n_head_kv = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
@ -2086,6 +2097,7 @@ static bool ggml_metal_graph_compute(
//const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1]; const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias; float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
@ -2300,6 +2312,50 @@ static bool ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_ARANGE:
{
GGML_ASSERT(dst->type == GGML_TYPE_F32);
float start;
float step;
memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float));
memcpy(&step, ((int32_t *) dst->op_params) + 2, sizeof(float));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_dst offset:offs_dst atIndex:0];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1];
[encoder setBytes:&start length:sizeof(start) atIndex:2];
[encoder setBytes:&step length:sizeof(step) atIndex:3];
const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);
const int dim = dst->op_params[0];
const int max_period = dst->op_params[1];
const int half = dim / 2;
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2];
[encoder setBytes:&dim length:sizeof(dim) atIndex:3];
[encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
const int nth = MIN(1024, half);
[encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
{ {
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
@ -2428,7 +2484,7 @@ static bool ggml_metal_graph_compute(
MTLCommandBufferStatus status = [command_buffer status]; MTLCommandBufferStatus status = [command_buffer status];
if (status != MTLCommandBufferStatusCompleted) { if (status != MTLCommandBufferStatusCompleted) {
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
return false; return GGML_STATUS_FAILED;
} }
} }
@ -2437,7 +2493,7 @@ static bool ggml_metal_graph_compute(
} }
} }
return true; return GGML_STATUS_SUCCESS;
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -2739,7 +2795,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
UNUSED(backend); UNUSED(backend);
} }
GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
return ggml_metal_graph_compute(metal_ctx, cgraph); return ggml_metal_graph_compute(metal_ctx, cgraph);

View file

@ -1959,6 +1959,49 @@ kernel void kernel_pad_f32(
} }
} }
kernel void kernel_arange_f32(
device char * dst,
constant int64_t & ne0,
constant float & start,
constant float & step,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
device float * dst_ptr = (device float *) dst;
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
dst_ptr[i0] = start + step * i0;
}
}
kernel void kernel_timestep_embedding_f32(
device const char * src0,
device char * dst,
constant uint64_t & nb1,
constant int & dim,
constant int & max_period,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
int i = tgpig.x;
device float * embed_data = (device float *)(dst + i*nb1);
int half_ = dim / 2;
for (int j = tpitg.x; j < half_; j += ntg.x) {
float timestep = ((device float *)src0)[i];
float freq = (float)exp(-log((float)max_period) * j / half_);
float arg = timestep * freq;
embed_data[j ] = cos(arg);
embed_data[j + half_] = sin(arg);
}
if (dim % 2 != 0 && tpitg.x == 0) {
embed_data[dim] = 0.f;
}
}
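A plain-C mirror of the per-row math in the kernel above (not the Metal code itself), handy as a host-side reference; the timestep value and parameters are arbitrary example inputs:

```c
#include <math.h>
#include <stdio.h>

int main(void) {
    const float timestep   = 42.0f;
    const int   dim        = 8;        // even, so no zero padding is needed
    const int   max_period = 10000;
    const int   half       = dim / 2;

    float embed[8];
    for (int j = 0; j < half; ++j) {
        const float freq = expf(-logf((float) max_period) * (float) j / (float) half);
        const float arg  = timestep * freq;
        embed[j]        = cosf(arg);   // cosines in the first half
        embed[j + half] = sinf(arg);   // sines in the second half
    }

    for (int j = 0; j < dim; ++j) {
        printf("%8.4f ", embed[j]);
    }
    printf("\n");
    return 0;
}
```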
// bitonic sort implementation following the CUDA kernels as reference // bitonic sort implementation following the CUDA kernels as reference
typedef void (argsort_t)( typedef void (argsort_t)(
device const float * x, device const float * x,

View file

@ -2231,7 +2231,7 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }
static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
for (int i = 0; i < graph->n_nodes; ++i) { for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * node = graph->nodes[i]; ggml_tensor * node = graph->nodes[i];
switch (node->op) { switch (node->op) {
@ -2246,7 +2246,7 @@ static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgrap
} }
} }
return true; return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }

View file

@ -51,6 +51,7 @@
#define UNUSED GGML_UNUSED #define UNUSED GGML_UNUSED
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
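The quantization code below switches from calling `_mm256_set_m128i` directly to this `MM256_SET_M128I` macro, because the intrinsic is missing on some compilers (e.g. gcc 7). A small standalone check of what the fallback produces; build with `-mavx`, and the input values are arbitrary:

```c
#include <immintrin.h>
#include <stdio.h>

// same fallback as above: low 128 bits come from b, high 128 bits from a
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main(void) {
    const __m128i lo = _mm_set1_epi32(1);
    const __m128i hi = _mm_set1_epi32(2);
    const __m256i v  = MM256_SET_M128I(hi, lo);

    int out[8];
    _mm256_storeu_si256((__m256i *) out, v);
    for (int i = 0; i < 8; ++i) {
        printf("%d ", out[i]);   // 1 1 1 1 2 2 2 2
    }
    printf("\n");
    return 0;
}
```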
@ -9563,7 +9564,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting); const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits); const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits); const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs); const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32)); const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
@ -9585,8 +9586,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1)); const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1)); const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2)); const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
@ -9653,8 +9654,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits); const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1); const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l); const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h); const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
__m256i signs; __m256i signs;
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1); signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
@ -10551,10 +10552,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs); const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs); const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs); const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
@ -10661,10 +10662,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16; const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


View file

@ -10,6 +10,7 @@ extern "C" {
#define GGML_VK_NAME "Vulkan" #define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16 #define GGML_VK_MAX_DEVICES 16
GGML_API void ggml_vk_instance_init(void);
GGML_API void ggml_vk_init_cpu_assist(void); GGML_API void ggml_vk_init_cpu_assist(void);
GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node); GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);

ggml.c (237 changed lines)
View file

@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
// precomputed f32 table for f16 (256 KB) (ggml-impl.h) // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
float ggml_table_f32_f16[1 << 16]; float ggml_table_f32_f16[1 << 16];
const char * ggml_status_to_string(enum ggml_status status) {
switch (status) {
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
case GGML_STATUS_SUCCESS: return "GGML status: success";
case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
}
return "GGML status: unknown";
}
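With the compute entry points now returning `enum ggml_status` instead of a plain int/bool, callers can turn a failure into a readable message via `ggml_status_to_string`. A minimal sketch of the new calling pattern, assuming it is built against the ggml sources in this tree:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // any small graph works for the example
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_set_f32(a, 3.0f);
    struct ggml_tensor * b = ggml_sqr(ctx, a);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, b);

    const enum ggml_status status = ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 2);
    if (status != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "graph compute failed: %s\n", ggml_status_to_string(status));
        ggml_free(ctx);
        return 1;
    }

    printf("b[0] = %g\n", ggml_get_f32_1d(b, 0));   // 9
    ggml_free(ctx);
    return 0;
}
```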
// note: do not use these inside ggml.c // note: do not use these inside ggml.c
// these are meant to be used via the ggml.h API // these are meant to be used via the ggml.h API
float ggml_fp16_to_fp32(ggml_fp16_t x) { float ggml_fp16_to_fp32(ggml_fp16_t x) {
@ -1822,6 +1833,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D", "POOL_2D",
"UPSCALE", "UPSCALE",
"PAD", "PAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT", "ARGSORT",
"LEAKY_RELU", "LEAKY_RELU",
@ -1852,7 +1865,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK", "CROSS_ENTROPY_LOSS_BACK",
}; };
static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none", "none",
@ -1910,6 +1923,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d(x)", "pool_2d(x)",
"upscale(x)", "upscale(x)",
"pad(x)", "pad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)", "argsort(x)",
"leaky_relu(x)", "leaky_relu(x)",
@ -1940,7 +1955,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)", "cross_entropy_loss_back(x,y)",
}; };
static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -2899,11 +2914,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
return ((const int32_t *)(tensor->op_params))[i]; return ((const int32_t *)(tensor->op_params))[i];
} }
static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
return ((const float *)(tensor->op_params))[i];
}
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
((int32_t *)(tensor->op_params))[i] = value; ((int32_t *)(tensor->op_params))[i] = value;
} }
static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
((float *)(tensor->op_params))[i] = value;
}
struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
memset(tensor->data, 0, ggml_nbytes(tensor)); memset(tensor->data, 0, ggml_nbytes(tensor));
return tensor; return tensor;
@ -5902,6 +5927,55 @@ struct ggml_tensor * ggml_upscale(
return ggml_upscale_impl(ctx, a, scale_factor); return ggml_upscale_impl(ctx, a, scale_factor);
} }
struct ggml_tensor * ggml_arange(
struct ggml_context * ctx,
float start,
float stop,
float step) {
GGML_ASSERT(stop > start);
const int64_t steps = (int64_t) ceilf((stop - start) / step);
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
result->op = GGML_OP_ARANGE;
ggml_set_op_params_f32(result, 0, start);
ggml_set_op_params_f32(result, 1, stop);
ggml_set_op_params_f32(result, 2, step);
return result;
}
struct ggml_tensor * ggml_timestep_embedding(
struct ggml_context * ctx,
struct ggml_tensor * timesteps,
int dim,
int max_period) {
bool is_node = false;
if (timesteps->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
int actual_dim = dim;
if (dim % 2 != 0) {
actual_dim = dim + 1;
}
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
result->op = GGML_OP_TIMESTEP_EMBEDDING;
ggml_set_op_params_i32(result, 0, dim);
ggml_set_op_params_i32(result, 1, max_period);
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = timesteps;
return result;
}
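A minimal usage sketch for the new operator, assuming it is built against the ggml sources in this tree; the timesteps here are produced with `ggml_arange`, and in ggml's `ne` ordering the result has `ne[0] = dim` and `ne[1] = number of timesteps` (one extra zero-padded column is allocated when `dim` is odd, as above):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 32*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // 8 timesteps: 0, 1, ..., 7
    struct ggml_tensor * ts  = ggml_arange(ctx, 0.0f, 8.0f, 1.0f);
    // sinusoidal embedding with 64 dimensions and the usual max_period of 10000
    struct ggml_tensor * emb = ggml_timestep_embedding(ctx, ts, 64, 10000);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, emb);

    if (ggml_graph_compute_with_ctx(ctx, gf, 2) == GGML_STATUS_SUCCESS) {
        printf("embedding: ne[0] = %lld (dim), ne[1] = %lld (timesteps)\n",
               (long long) emb->ne[0], (long long) emb->ne[1]);
    }

    ggml_free(ctx);
    return 0;
}
```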
// ggml_argsort // ggml_argsort
struct ggml_tensor * ggml_argsort( struct ggml_tensor * ggml_argsort(
@ -10337,7 +10411,7 @@ static void ggml_compute_forward_group_norm_f32(
int n_channels = src0->ne[2]; int n_channels = src0->ne[2];
int n_groups = dst->op_params[0]; int n_groups = dst->op_params[0];
int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
for (int i = ith; i < n_groups; i+=nth) { for (int i = ith; i < n_groups; i += nth) {
int start = i * n_channels_per_group; int start = i * n_channels_per_group;
int end = start + n_channels_per_group; int end = start + n_channels_per_group;
if (end > n_channels) { if (end > n_channels) {
@ -10351,28 +10425,32 @@ static void ggml_compute_forward_group_norm_f32(
for (int64_t i01 = 0; i01 < ne01; i01++) { for (int64_t i01 = 0; i01 < ne01; i01++) {
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
ggml_float sumr = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) { for (int64_t i00 = 0; i00 < ne00; i00++) {
sum += (ggml_float)x[i00]; sumr += (ggml_float)x[i00];
} }
sum += sumr;
} }
} }
float mean = sum / (ne00 * ne01 * step); const float mean = sum / (ne00 * ne01 * step);
ggml_float sum2 = 0.0;
ggml_float sum2 = 0.0;
for (int64_t i02 = start; i02 < end; i02++) { for (int64_t i02 = start; i02 < end; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) { for (int64_t i01 = 0; i01 < ne01; i01++) {
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
ggml_float sumr = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) { for (int64_t i00 = 0; i00 < ne00; i00++) {
float v = x[i00] - mean; float v = x[i00] - mean;
y[i00] = v; y[i00] = v;
sum2 += (ggml_float)(v * v); sumr += (ggml_float)(v * v);
} }
sum2 += sumr;
} }
} }
float variance = sum2 / (ne00 * ne01 * step); const float variance = sum2 / (ne00 * ne01 * step);
const float scale = 1.0f / sqrtf(variance + eps); const float scale = 1.0f / sqrtf(variance + eps);
for (int64_t i02 = start; i02 < end; i02++) { for (int64_t i02 = start; i02 < end; i02++) {
@ -13653,6 +13731,106 @@ static void ggml_compute_forward_pad(
} }
} }
// ggml_compute_forward_arange
static void ggml_compute_forward_arange_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(dst->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const float start = ggml_get_op_params_f32(dst, 0);
const float stop = ggml_get_op_params_f32(dst, 1);
const float step = ggml_get_op_params_f32(dst, 2);
const int64_t steps = (int64_t) ceilf((stop - start) / step);
GGML_ASSERT(ggml_nelements(dst) == steps);
for (int64_t i = ith; i < steps; i+= nth) {
float value = start + step * i;
((float *)dst->data)[i] = value;
}
}
static void ggml_compute_forward_arange(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
switch (dst->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_arange_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
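The CPU forward above splits the `steps` elements across threads with the usual interleaved stride (worker `ith` fills `ith`, `ith + nth`, `ith + 2*nth`, ...), which is why `ggml_get_n_tasks` further down reports `n_tasks = n_threads` for this op. A standalone sketch of which indices each worker touches:

```c
#include <stdio.h>

int main(void) {
    const int nth   = 4;    // number of workers
    const int steps = 10;   // elements to fill

    for (int ith = 0; ith < nth; ++ith) {
        printf("worker %d:", ith);
        for (int i = ith; i < steps; i += nth) {
            printf(" %d", i);
        }
        printf("\n");
    }
    return 0;
}
```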
static void ggml_compute_forward_timestep_embedding_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
const int dim = ggml_get_op_params_i32(dst, 0);
const int max_period = ggml_get_op_params_i32(dst, 1);
int half = dim / 2;
for (int64_t i = 0; i < ne00; i++) {
float * embed_data = (float *)((char *) dst->data + i*nb1);
for (int64_t j = ith; j < half; j += nth) {
float timestep = ((float *)src0->data)[i];
float freq = (float)expf(-logf(max_period) * j / half);
float arg = timestep * freq;
embed_data[j] = cosf(arg);
embed_data[j + half] = sinf(arg);
}
if (dim % 2 != 0 && ith == 0) {
embed_data[dim] = 0.f;
}
}
}
static void ggml_compute_forward_timestep_embedding(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_timestep_embedding_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_argsort // ggml_compute_forward_argsort
static void ggml_compute_forward_argsort_f32( static void ggml_compute_forward_argsort_f32(
@ -15972,6 +16150,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_pad(params, tensor); ggml_compute_forward_pad(params, tensor);
} break; } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
ggml_compute_forward_timestep_embedding(params, tensor);
} break;
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
{ {
ggml_compute_forward_argsort(params, tensor); ggml_compute_forward_argsort(params, tensor);
@ -16982,6 +17168,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{ {
GGML_ASSERT(false); // TODO: not implemented GGML_ASSERT(false); // TODO: not implemented
} break; } break;
case GGML_OP_ARANGE:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
{ {
GGML_ASSERT(false); // TODO: not implemented GGML_ASSERT(false); // TODO: not implemented
@ -17587,6 +17781,7 @@ struct ggml_compute_state {
ggml_thread_t thrd; ggml_thread_t thrd;
int ith; int ith;
struct ggml_compute_state_shared * shared; struct ggml_compute_state_shared * shared;
enum ggml_status ec;
}; };
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@ -17738,6 +17933,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{ {
n_tasks = n_threads; n_tasks = n_threads;
} break; } break;
case GGML_OP_ARANGE:
{
n_tasks = n_threads;
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
n_tasks = n_threads;
} break;
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
{ {
n_tasks = n_threads; n_tasks = n_threads;
@ -17877,7 +18080,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
while (true) { while (true) {
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
state->shared->node_n += 1; state->shared->node_n += 1;
return (thread_ret_t) GGML_EXIT_ABORTED; state->ec = GGML_STATUS_ABORTED;
return 0;
} }
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@ -17999,7 +18203,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
} }
} }
return GGML_EXIT_SUCCESS; return 0;
} }
struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
@ -18195,7 +18399,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
return cplan; return cplan;
} }
int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
{ {
GGML_ASSERT(cplan); GGML_ASSERT(cplan);
GGML_ASSERT(cplan->n_threads > 0); GGML_ASSERT(cplan->n_threads > 0);
@ -18239,6 +18443,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
.thrd = 0, .thrd = 0,
.ith = j, .ith = j,
.shared = &state_shared, .shared = &state_shared,
.ec = GGML_STATUS_SUCCESS,
}; };
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@ -18249,12 +18454,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
workers[0].ith = 0; workers[0].ith = 0;
workers[0].shared = &state_shared; workers[0].shared = &state_shared;
workers[0].ec = GGML_STATUS_SUCCESS;
const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_cycles = ggml_perf_cycles();
const int64_t perf_start_time_us = ggml_perf_time_us(); const int64_t perf_start_time_us = ggml_perf_time_us();
// this is a work thread too // this is a work thread too
int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); ggml_graph_compute_thread(&workers[0]);
enum ggml_status compute_status = workers[0].ec;
// don't leave affinity set on the main thread // don't leave affinity set on the main thread
clear_numa_thread_affinity(); clear_numa_thread_affinity();
@ -18264,6 +18471,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
for (int j = 1; j < n_threads; j++) { for (int j = 1; j < n_threads; j++) {
const int rc = ggml_thread_join(workers[j].thrd, NULL); const int rc = ggml_thread_join(workers[j].thrd, NULL);
GGML_ASSERT(rc == 0); GGML_ASSERT(rc == 0);
if (workers[j].ec != GGML_STATUS_SUCCESS)
compute_status = workers[j].ec;
} }
} }
@ -18291,14 +18500,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
return compute_status; return compute_status;
} }
void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
ggml_graph_compute(cgraph, &cplan); return ggml_graph_compute(cgraph, &cplan);
} }
struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {

ggml.h (34 changed lines)
View file

@ -315,6 +315,16 @@
extern "C" { extern "C" {
#endif #endif
enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2,
GGML_STATUS_FAILED = -1,
GGML_STATUS_SUCCESS = 0,
GGML_STATUS_ABORTED = 1,
};
// get ggml_status name string
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
typedef uint16_t ggml_fp16_t; typedef uint16_t ggml_fp16_t;
// convert FP16 <-> FP32 // convert FP16 <-> FP32
@ -454,6 +464,8 @@ extern "C" {
GGML_OP_POOL_2D, GGML_OP_POOL_2D,
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
GGML_OP_LEAKY_RELU, GGML_OP_LEAKY_RELU,
@ -1663,6 +1675,15 @@ extern "C" {
int p2, int p2,
int p3); int p3);
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
GGML_API struct ggml_tensor * ggml_timestep_embedding(
struct ggml_context * ctx,
struct ggml_tensor * timesteps,
int dim,
int max_period);
// sort rows // sort rows
enum ggml_sort_order { enum ggml_sort_order {
GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_ASC,
@ -1674,6 +1695,12 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
enum ggml_sort_order order); enum ggml_sort_order order);
GGML_API struct ggml_tensor * ggml_arange(
struct ggml_context * ctx,
float start,
float stop,
float step);
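A short usage sketch for `ggml_arange`, assuming it is built against the ggml sources in this tree; the element count is `ceil((stop - start) / step)`, so a non-integral step still yields a well-defined length:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // ceil((2.0 - 0.0) / 0.3) = 7 elements: 0.0, 0.3, ..., 1.8
    struct ggml_tensor * t = ggml_arange(ctx, 0.0f, 2.0f, 0.3f);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, t);

    if (ggml_graph_compute_with_ctx(ctx, gf, 1) == GGML_STATUS_SUCCESS) {
        for (int i = 0; i < (int) ggml_nelements(t); ++i) {
            printf("%.2f ", ggml_get_f32_1d(t, i));
        }
        printf("\n");
    }

    ggml_free(ctx);
    return 0;
}
```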
// top k elements per row // top k elements per row
GGML_API struct ggml_tensor * ggml_top_k( GGML_API struct ggml_tensor * ggml_top_k(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -1942,12 +1969,11 @@ extern "C" {
// ggml_graph_plan() has to be called before ggml_graph_compute() // ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data // when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context // same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

File diff suppressed because it is too large

View file

@ -15,7 +15,7 @@ array ::=
string ::= string ::=
"\"" ( "\"" (
[^"\\] | [^"\\\x7F\x00-\x1F] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws )* "\"" ws

View file

@ -24,7 +24,7 @@ array ::=
string ::= string ::=
"\"" ( "\"" (
[^"\\] | [^"\\\x7F\x00-\x1F] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws )* "\"" ws

llama.cpp (381 changed lines)
View file

@ -1726,7 +1726,7 @@ struct llama_hparams {
}; };
struct llama_cparams { struct llama_cparams {
uint32_t n_ctx; // context size used during inference uint32_t n_ctx; // context size used during inference
uint32_t n_batch; uint32_t n_batch;
uint32_t n_threads; // number of threads to use for generation uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing uint32_t n_threads_batch; // number of threads to use for batch processing
@ -1743,7 +1743,9 @@ struct llama_cparams {
float yarn_beta_slow; float yarn_beta_slow;
float defrag_thold; float defrag_thold;
bool embeddings;
bool offload_kqv; bool offload_kqv;
enum llama_pooling_type pooling_type; enum llama_pooling_type pooling_type;
ggml_backend_sched_eval_callback cb_eval; ggml_backend_sched_eval_callback cb_eval;
@ -2052,7 +2054,7 @@ struct llama_context {
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
int32_t n_eval = 0; // number of eval calls int32_t n_eval = 0; // number of eval calls
// decode output (2-dimensional array: [n_tokens][n_vocab]) // logits output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits; std::vector<float> logits;
#ifndef NDEBUG #ifndef NDEBUG
// guard against access to unset logits // guard against access to unset logits
@ -2060,8 +2062,13 @@ struct llama_context {
#endif #endif
bool logits_all = false; bool logits_all = false;
// input embedding (1-dimensional array: [n_embd]) // embeddings output (2-dimensional array: [n_tokens][n_embd])
std::vector<float> embedding; // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
std::vector<float> embd;
// sequence embeddings output (map of [n_embd] vectors)
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq;
// memory buffers used to evaluate the model // memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta; std::vector<uint8_t> buf_compute_meta;
@ -5301,8 +5308,8 @@ static struct ggml_tensor * llm_build_kqv(
ggml_mul_mat_set_prec(kq, GGML_PREC_F32); ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
} }
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) #if defined(GGML_USE_KOMPUTE)
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute") #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024") #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488") #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
if (hparams.f_max_alibi_bias > 0.0f) { if (hparams.f_max_alibi_bias > 0.0f) {
@ -5388,6 +5395,7 @@ static struct ggml_tensor * llm_build_kv(
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il); llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
struct ggml_tensor * cur; struct ggml_tensor * cur;
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b, cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il); q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
@ -6402,6 +6410,7 @@ struct llm_build_context {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur; struct ggml_tensor * cur;
@ -6409,9 +6418,10 @@ struct llm_build_context {
// get input vectors with right size // get input vectors with right size
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type); const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0); struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0); struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
// construct input embeddings (token, type, position) // construct input embeddings (token, type, position)
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@ -6429,39 +6439,38 @@ struct llm_build_context {
cb(inpL, "inp_norm", -1); cb(inpL, "inp_norm", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads) // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); struct ggml_tensor * KQ_mask = ggml_cont(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_tokens, n_tokens, n_tokens*ggml_type_size(lctx.inp_KQ_mask->type), 0));
cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens] cb(KQ_mask, "KQ_mask", -1); // [n_tokens, n_tokens]
// iterate layers // iterate layers
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur = inpL; struct ggml_tensor * cur = inpL;
struct ggml_tensor * Qcur;
struct ggml_tensor * Kcur;
struct ggml_tensor * Vcur;
// self-attention // self-attention
if (model.arch == LLM_ARCH_BERT) { if (model.arch == LLM_ARCH_BERT) {
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk); Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv); Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
// seems like we just need to do this for Q? Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
} else { } else {
// compute Q and K and RoPE them // compute Q and K and RoPE them
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
cb(cur, "wqkv", il); cb(cur, "wqkv", il);
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -6480,13 +6489,41 @@ struct llm_build_context {
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
} }
struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
cb(kq, "kq", il);
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il);
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
cb(v, "v", il);
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
cb(kqv, "kqv", il);
struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
cb(kqv_merged, "kqv_merged", il);
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
cb(cur, "kqv_merged_cont", il);
ggml_build_forward_expand(gf, cur);
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
if (model.layers[il].bo) {
cb(cur, "kqv_wo", il);
}
if (model.layers[il].bo) {
cur = ggml_add(ctx0, cur, model.layers[il].bo);
}
cb(cur, "kqv_out", il);
// re-add the layer input // re-add the layer input
cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, inpL);
@ -6526,16 +6563,29 @@ struct llm_build_context {
// final output // final output
cur = inpL; cur = inpL;
cb(cur, "result_embd", -1);
// pooling layer // pooling layer
if (pooling_type == LLAMA_POOLING_TYPE_MEAN) { switch (pooling_type) {
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); case LLAMA_POOLING_TYPE_NONE:
} else if (pooling_type == LLAMA_POOLING_TYPE_CLS) { {
cur = ggml_get_rows(ctx0, cur, inp_cls); // nop
} else { } break;
GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type"); case LLAMA_POOLING_TYPE_MEAN:
{
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
cb(cur, "result_embd_pooled", -1);
} break;
case LLAMA_POOLING_TYPE_CLS:
{
cur = ggml_get_rows(ctx0, cur, inp_cls);
cb(cur, "result_embd_pooled", -1);
} break;
case LLAMA_POOLING_TYPE_UNSPECIFIED:
{
GGML_ASSERT(false && "Invalid pooling type");
} break;
} }
cb(cur, "result_embd", -1);
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
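The pooling switch above only wires in tensors prepared elsewhere: inp_mean encodes per-sequence averaging and inp_cls holds the row index of each sequence's first token. A reference-only sketch of the intended result on plain arrays (the names, shapes and the seq_of/n_seq inputs here are illustrative, not the actual ggml buffers):

#include <cstddef>
#include <vector>

// embd:    n_tokens rows of n_embd floats (token embeddings)
// seq_of:  sequence id of each token in the batch
// returns: one mean-pooled embedding per sequence, indexed by sequence id
static std::vector<std::vector<float>> mean_pool_ref(
        const std::vector<std::vector<float>> & embd,
        const std::vector<int> & seq_of, int n_seq) {
    const size_t n_embd = embd.empty() ? 0 : embd[0].size();
    std::vector<std::vector<float>> out(n_seq, std::vector<float>(n_embd, 0.0f));
    std::vector<int> count(n_seq, 0);
    for (size_t t = 0; t < embd.size(); ++t) {
        count[seq_of[t]] += 1;
        for (size_t e = 0; e < n_embd; ++e) {
            out[seq_of[t]][e] += embd[t][e];
        }
    }
    for (int s = 0; s < n_seq; ++s) {
        for (size_t e = 0; e < n_embd && count[s] > 0; ++e) {
            out[s][e] /= (float) count[s];
        }
    }
    return out;
}
// CLS pooling is simpler still: out[s] is just the embedding row of the token
// with pos == 0 in sequence s (the index stored in inp_cls).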
@ -8467,7 +8517,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
} }
{ if (hparams.causal_attn) {
const int64_t n_kv = kv_self.n; const int64_t n_kv = kv_self.n;
const int64_t n_tokens = batch.n_tokens; const int64_t n_tokens = batch.n_tokens;
@ -8475,7 +8525,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
float * data = (float *) lctx.inp_KQ_mask->data; float * data = (float *) lctx.inp_KQ_mask->data;
// For Transformers, use only the previous KV cells (or all, when non-causal) // For causal attention, use only the previous KV cells
// of the correct sequence for each token of the batch. // of the correct sequence for each token of the batch.
// It's assumed that if a token in the batch has multiple sequences, they are equivalent. // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
for (int h = 0; h < 1; ++h) { for (int h = 0; h < 1; ++h) {
@ -8485,16 +8535,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
for (int i = 0; i < n_kv; ++i) { for (int i = 0; i < n_kv; ++i) {
float f; float f;
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
(hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
f = -INFINITY; f = -INFINITY;
} else { } else {
f = 0; f = 0.0f;
} }
data[h*(n_kv*n_tokens) + j*n_kv + i] = f; data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
} }
} }
} }
} else {
// non-causal attention attends only to the tokens within the batch (i.e. the KV cache is not used)
const int64_t n_tokens = batch.n_tokens;
assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
float * data = (float *) lctx.inp_KQ_mask->data;
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
const llama_seq_id seq_id = batch.seq_id[j][0];
for (int i = 0; i < n_tokens; ++i) {
float f = -INFINITY;
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
if (batch.seq_id[i][s] == seq_id) {
f = 0.0f;
break;
}
}
data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
}
}
}
} }
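For intuition, suppose a hypothetical 4-token batch where tokens 0 and 1 belong to sequence 0 and tokens 2 and 3 to sequence 1. The non-causal branch above then builds an n_tokens x n_tokens mask in which every token attends to all tokens of its own sequence and to nothing else:

// Illustrative KQ_mask contents for that batch (0 = attend, -INF = masked):
//
//           i=0    i=1    i=2    i=3
//   j=0      0      0    -INF   -INF
//   j=1      0      0    -INF   -INF
//   j=2    -INF   -INF     0      0
//   j=3    -INF   -INF     0      0
//
// Since nothing outside the batch is attended, the KV cache is bypassed entirely.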
if (hparams.need_kq_pos) { if (hparams.need_kq_pos) {
@ -8513,13 +8587,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
const int64_t n_tokens = batch.n_tokens; const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
float * data = (float *) lctx.inp_mean->data;
float * data = (float *) lctx.inp_mean->data;
memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
std::vector<uint64_t> sum(n_tokens, 0); std::vector<uint64_t> sum(n_tokens, 0);
for (int i = 0; i < n_tokens; ++i) { for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0]; const llama_seq_id seq_id = batch.seq_id[i][0];
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
sum[seq_id] += 1; sum[seq_id] += 1;
} }
@ -8541,11 +8618,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
const int64_t n_tokens = batch.n_tokens; const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
uint32_t * data = (uint32_t *) lctx.inp_cls->data; uint32_t * data = (uint32_t *) lctx.inp_cls->data;
memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
for (int i = 0; i < n_tokens; ++i) { for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0]; const llama_seq_id seq_id = batch.seq_id[i][0];
const llama_pos pos = batch.pos[i]; const llama_pos pos = batch.pos[i];
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
if (pos == 0) { if (pos == 0) {
data[seq_id] = i; data[seq_id] = i;
} }
@ -8706,27 +8788,30 @@ static int llama_decode_internal(
batch.seq_id = seq_id_arr.data(); batch.seq_id = seq_id_arr.data();
} }
llama_kv_cache_update(&lctx); // non-causal masks do not use the KV cache
if (hparams.causal_attn) {
llama_kv_cache_update(&lctx);
// if we have enough unused cells before the current head -> // if we have enough unused cells before the current head ->
// better to start searching from the beginning of the cache, hoping to fill it // better to start searching from the beginning of the cache, hoping to fill it
if (kv_self.head > kv_self.used + 2*n_tokens) { if (kv_self.head > kv_self.used + 2*n_tokens) {
kv_self.head = 0; kv_self.head = 0;
}
if (!llama_kv_cache_find_slot(kv_self, batch)) {
return 1;
}
if (!kv_self.recurrent) {
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
//kv_self.n = llama_kv_cache_cell_max(kv_self);
}
} }
if (!llama_kv_cache_find_slot(kv_self, batch)) { //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
return 1;
}
if (!kv_self.recurrent) {
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
//kv_self.n = llama_kv_cache_cell_max(kv_self);
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
}
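As a worked example of the kv_self.n heuristic (assuming GGML_PAD rounds its argument up to the given multiple; the numbers are illustrative):

// Suppose llama_kv_cache_cell_max(kv_self) == 45 and kv_self.size == 512:
//   GGML_PAD(45, 32)   -> 64   round the highest used cell up to a multiple of 32
//   std::max(32u, 64)  -> 64   but never attend fewer than 32 cells
//   std::min(512u, 64) -> 64   and never more than the allocated cache
// The graph built for this batch therefore attends 64 KV cells instead of all 512.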
ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_reset(lctx.sched);
ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
@ -8734,20 +8819,26 @@ static int llama_decode_internal(
ggml_cgraph * gf = llama_build_graph(lctx, batch, false); ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
// the output is always the last tensor in the graph // the output is always the last tensor in the graph
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
if (strcmp(res->name, "result_output") == 0) { if (!hparams.causal_attn) {
// the embeddings could be the second to last tensor, or the third to last tensor res = nullptr; // do not extract logits for embedding models such as BERT
if (strcmp(embeddings->name, "result_norm") != 0) {
embeddings = gf->nodes[gf->n_nodes - 3]; // token or sequence embeddings
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); embd = gf->nodes[gf->n_nodes - 1];
}
} else if (strcmp(res->name, "result_embd") == 0) { GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
embeddings = res;
res = nullptr;
} else { } else {
GGML_ASSERT(false); if (strcmp(res->name, "result_output") == 0) {
// the token embeddings could be the second to last tensor, or the third to last tensor
if (strcmp(embd->name, "result_norm") != 0) {
embd = gf->nodes[gf->n_nodes - 3];
GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
}
} else {
GGML_ASSERT(false && "missing result_output tensor");
}
} }
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@ -8814,46 +8905,82 @@ static int llama_decode_internal(
logits_out.clear(); logits_out.clear();
#endif #endif
ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res); ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
GGML_ASSERT(res_backend != nullptr); GGML_ASSERT(backend_res != nullptr);
if (batch.logits) { if (batch.logits) {
logits_out.resize(n_vocab * n_tokens); logits_out.resize(n_vocab * n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) { for (uint32_t i = 0; i < n_tokens; i++) {
if (batch.logits[i] == 0) { if (batch.logits[i] == 0) {
continue; continue;
} }
ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float)); ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
#ifndef NDEBUG #ifndef NDEBUG
logits_valid[i] = true; logits_valid[i] = true;
#endif #endif
} }
} else if (lctx.logits_all) { } else if (lctx.logits_all) {
logits_out.resize(n_vocab * n_tokens); logits_out.resize(n_vocab * n_tokens);
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
#ifndef NDEBUG #ifndef NDEBUG
std::fill(logits_valid.begin(), logits_valid.end(), true); std::fill(logits_valid.begin(), logits_valid.end(), true);
#endif #endif
} else { } else {
logits_out.resize(n_vocab); logits_out.resize(n_vocab);
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float)); ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
#ifndef NDEBUG #ifndef NDEBUG
logits_valid[0] = true; logits_valid[0] = true;
#endif #endif
} }
ggml_backend_synchronize(res_backend); ggml_backend_synchronize(backend_res);
} }
// extract embeddings // extract embeddings
if (!lctx.embedding.empty()) { if (cparams.embeddings && embd) {
auto & embedding_out = lctx.embedding; ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
GGML_ASSERT(backend_embd != nullptr);
const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0; switch (cparams.pooling_type) {
const int64_t embd_size = res ? n_embd : n_embd * n_tokens; case LLAMA_POOLING_TYPE_NONE:
{
// extract token embeddings
auto & embd_out = lctx.embd;
embedding_out.resize(embd_size); if (batch.logits) {
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings); embd_out.resize(n_embd * n_tokens);
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float)); for (uint32_t i = 0; i < n_tokens; i++) {
ggml_backend_synchronize(embeddings_backend); if (batch.logits[i] == 0) {
continue;
}
ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
}
}
} break;
case LLAMA_POOLING_TYPE_CLS:
case LLAMA_POOLING_TYPE_MEAN:
{
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
// extract sequence embeddings
auto & embd_seq_out = lctx.embd_seq;
embd_seq_out.clear();
for (uint32_t i = 0; i < n_tokens; i++) {
const llama_seq_id seq_id = batch.seq_id[i][0];
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
continue;
}
embd_seq_out[seq_id].resize(n_embd);
ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_UNSPECIFIED:
{
GGML_ASSERT(false && "unknown pooling type");
} break;
}
ggml_backend_synchronize(backend_embd);
} }
// measure the performance only for the single-token evals // measure the performance only for the single-token evals
@ -9167,19 +9294,19 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
GGML_ASSERT(llama_is_byte_token(vocab, id)); GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id); const auto& token_data = vocab.id_to_token.at(id);
switch (llama_vocab_get_type(vocab)) { switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: { case LLAMA_VOCAB_TYPE_SPM: {
auto buf = token_data.text.substr(3, 2); auto buf = token_data.text.substr(3, 2);
return strtol(buf.c_str(), NULL, 16); return strtol(buf.c_str(), NULL, 16);
} }
case LLAMA_VOCAB_TYPE_BPE: { case LLAMA_VOCAB_TYPE_BPE: {
GGML_ASSERT(false); GGML_ASSERT(false);
return unicode_to_bytes_bpe(token_data.text); return unicode_to_bytes_bpe(token_data.text);
} }
case LLAMA_VOCAB_TYPE_WPM: { case LLAMA_VOCAB_TYPE_WPM: {
GGML_ASSERT(false); GGML_ASSERT(false);
} }
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
} }
} }
@ -12430,7 +12557,7 @@ struct llama_context_params llama_context_default_params() {
/*.type_k =*/ GGML_TYPE_F16, /*.type_k =*/ GGML_TYPE_F16,
/*.type_v =*/ GGML_TYPE_F16, /*.type_v =*/ GGML_TYPE_F16,
/*.logits_all =*/ false, /*.logits_all =*/ false,
/*.embedding =*/ false, /*.embeddings =*/ false,
/*.offload_kqv =*/ true, /*.offload_kqv =*/ true,
/*.abort_callback =*/ nullptr, /*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr, /*.abort_callback_data =*/ nullptr,
@ -12582,6 +12709,7 @@ struct llama_context * llama_new_context_with_model(
cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_fast = params.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold; cparams.defrag_thold = params.defrag_thold;
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv; cparams.offload_kqv = params.offload_kqv;
cparams.pooling_type = params.pooling_type; cparams.pooling_type = params.pooling_type;
@ -12769,8 +12897,8 @@ struct llama_context * llama_new_context_with_model(
// resized during inference, reserve maximum // resized during inference, reserve maximum
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch); ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
if (params.embedding) { if (params.embeddings) {
ctx->embedding.resize(hparams.n_embd); ctx->embd.reserve(hparams.n_embd*cparams.n_batch);
} }
// graph inputs // graph inputs
@ -13220,7 +13348,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
// assume worst case for logits although only currently set ones are serialized // assume worst case for logits although only currently set ones are serialized
const size_t s_logits = ctx->logits.capacity() * sizeof(float); const size_t s_logits = ctx->logits.capacity() * sizeof(float);
const size_t s_embedding_size = sizeof(size_t); const size_t s_embedding_size = sizeof(size_t);
const size_t s_embedding = ctx->embedding.size() * sizeof(float); const size_t s_embedding = ctx->embd.capacity() * sizeof(float);
const size_t s_kv_buf_size = sizeof(size_t); const size_t s_kv_buf_size = sizeof(size_t);
const size_t s_kv_head = sizeof(uint32_t); const size_t s_kv_head = sizeof(uint32_t);
const size_t s_kv_size = sizeof(uint32_t); const size_t s_kv_size = sizeof(uint32_t);
@ -13329,12 +13457,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
// copy embeddings // copy embeddings
{ {
const size_t embedding_size = ctx->embedding.size(); const size_t embeddings_size = ctx->embd.size();
data_ctx->write(&embedding_size, sizeof(embedding_size)); data_ctx->write(&embeddings_size, sizeof(embeddings_size));
if (embedding_size) { if (embeddings_size) {
data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float)); data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float));
} }
} }
@ -13449,15 +13577,17 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
// set embeddings // set embeddings
{ {
size_t embedding_size; size_t embeddings_size;
memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size); memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
GGML_ASSERT(ctx->embedding.capacity() == embedding_size); GGML_ASSERT(ctx->embd.capacity() == embeddings_size);
if (embedding_size) { if (embeddings_size) {
memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float)); ctx->embd.resize(embeddings_size);
inp += embedding_size * sizeof(float);
memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float));
inp += embeddings_size * sizeof(float);
} }
} }
@ -13717,11 +13847,20 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
} }
float * llama_get_embeddings(struct llama_context * ctx) { float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data(); return ctx->embd.data();
} }
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
return ctx->embedding.data() + i*ctx->model.hparams.n_embd; return ctx->embd.data() + i*ctx->model.hparams.n_embd;
}
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
auto it = ctx->embd_seq.find(seq_id);
if (it == ctx->embd_seq.end()) {
return nullptr;
}
return it->second.data();
} }
const char * llama_token_get_text(const struct llama_model * model, llama_token token) { const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
@ -13895,7 +14034,7 @@ static int32_t llama_chat_apply_template_internal(
std::string & dest, bool add_ass) { std::string & dest, bool add_ass) {
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
std::stringstream ss; std::stringstream ss;
if (tmpl.find("<|im_start|>") != std::string::npos) { if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
// chatml template // chatml template
for (auto message : chat) { for (auto message : chat) {
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n"; ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@ -13903,7 +14042,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|im_start|>assistant\n"; ss << "<|im_start|>assistant\n";
} }
} else if (tmpl.find("[INST]") != std::string::npos) { } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
// llama2 template and its variants // llama2 template and its variants
// [variant] support system message // [variant] support system message
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos; bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
@ -13938,7 +14077,7 @@ static int32_t llama_chat_apply_template_internal(
} }
} }
// llama2 templates seem to not care about "add_generation_prompt" // llama2 templates seem to not care about "add_generation_prompt"
} else if (tmpl.find("<|user|>") != std::string::npos) { } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
// zephyr template // zephyr template
for (auto message : chat) { for (auto message : chat) {
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n"; ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@ -13946,7 +14085,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|assistant|>\n"; ss << "<|assistant|>\n";
} }
} else if (tmpl.find("bos_token + message['role']") != std::string::npos) { } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history) // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
for (auto message : chat) { for (auto message : chat) {
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@ -13955,7 +14094,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<s>assistant\n"; ss << "<s>assistant\n";
} }
} else if (tmpl.find("<start_of_turn>") != std::string::npos) { } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
// google/gemma-7b-it // google/gemma-7b-it
std::string system_prompt = ""; std::string system_prompt = "";
for (auto message : chat) { for (auto message : chat) {
@ -14002,7 +14141,7 @@ LLAMA_API int32_t llama_chat_apply_template(
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
if (res < 0) { if (res < 0) {
// worst case: there is no information about template, we will use chatml by default // worst case: there is no information about template, we will use chatml by default
curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
} else { } else {
curr_tmpl = std::string(model_template.data(), model_template.size()); curr_tmpl = std::string(model_template.data(), model_template.size());
} }
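With this change a caller can also pass a short template name directly ("chatml", "llama2", "zephyr", "monarch", "gemma") instead of a Jinja-style template string. A hedged usage sketch (signature as declared in llama.h at the time of this change; the buffer size and messages are illustrative):

#include <algorithm>
#include <string>
#include <vector>
#include "llama.h"

static std::string chatml_prompt() {
    const llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };
    std::vector<char> buf(4096);
    // model is left as nullptr in this sketch since the template is given explicitly
    const int32_t n = llama_chat_apply_template(
        /*model  =*/ nullptr,
        /*tmpl   =*/ "chatml",          // resolved by the tmpl == "chatml" branch above
        msgs, 2,
        /*add_ass=*/ true,
        buf.data(), (int32_t) buf.size());
    if (n <= 0) {
        return std::string();
    }
    return std::string(buf.data(), std::min<size_t>((size_t) n, buf.size()));
}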

llama.h (18 changed lines)
View file
@ -163,7 +163,7 @@ extern "C" {
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
// - pos : the positions of the respective token in the sequence // - pos : the positions of the respective token in the sequence
// - seq_id : the sequence to which the respective token belongs // - seq_id : the sequence to which the respective token belongs
// - logits : if zero, the logits for the respective token will not be output // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
// //
typedef struct llama_batch { typedef struct llama_batch {
int32_t n_tokens; int32_t n_tokens;
@ -173,7 +173,7 @@ extern "C" {
llama_pos * pos; llama_pos * pos;
int32_t * n_seq_id; int32_t * n_seq_id;
llama_seq_id ** seq_id; llama_seq_id ** seq_id;
int8_t * logits; int8_t * logits; // TODO: rename this to "output"
// NOTE: helpers for smooth API transition - can be deprecated in the future // NOTE: helpers for smooth API transition - can be deprecated in the future
// for future-proof code, use the above fields instead and ignore everything below // for future-proof code, use the above fields instead and ignore everything below
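The renamed meaning matters for embeddings too: a token whose logits flag is 0 gets neither logits nor, with LLAMA_POOLING_TYPE_NONE, a token embedding written out. A sketch, assuming the llama_batch_init/llama_batch_free helpers and a caller-provided token list:

#include <vector>
#include "llama.h"

// Request output only for the last token of a single-sequence prompt.
static llama_batch make_prompt_batch(const std::vector<llama_token> & tokens) {
    const int n_prompt = (int) tokens.size();
    llama_batch batch = llama_batch_init(n_prompt, /*embd=*/0, /*n_seq_max=*/1);
    for (int i = 0; i < n_prompt; ++i) {
        batch.token   [i]    = tokens[i];
        batch.pos     [i]    = i;
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i]    = (int8_t) (i == n_prompt - 1); // output for this token only
    }
    batch.n_tokens = n_prompt;
    return batch; // caller: llama_decode(ctx, batch); ...; llama_batch_free(batch);
}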
@ -261,7 +261,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embedding; // embedding mode only bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
// Abort callback // Abort callback
@ -657,14 +657,20 @@ extern "C" {
// llama_get_logits(ctx) + i*n_vocab // llama_get_logits(ctx) + i*n_vocab
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
// Get the embeddings for the input // Get all output token embeddings
// shape: [n_embd] (1-dimensional) // shape: [n_tokens*n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Get the embeddings for the ith sequence // Get the embeddings for the ith token
// llama_get_embeddings(ctx) + i*n_embd // llama_get_embeddings(ctx) + i*n_embd
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
// Get the embeddings for a sequence id
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
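A hedged usage sketch of the new getter (ctx, model and n_seq are assumed to come from the usual llama.cpp setup; with MEAN or CLS pooling each sequence in the last decoded batch has exactly one pooled vector):

#include <cmath>
#include <cstdio>
#include "llama.h"

static void log_pooled_norms(llama_context * ctx, const llama_model * model, int n_seq) {
    const int32_t n_embd = llama_n_embd(model);
    for (llama_seq_id s = 0; s < n_seq; ++s) {
        const float * e = llama_get_embeddings_seq(ctx, s);
        if (e == nullptr) {
            // pooling_type was LLAMA_POOLING_TYPE_NONE: use the per-token getters
            // llama_get_embeddings_ith(ctx, i) / llama_get_embeddings(ctx) instead
            continue;
        }
        double norm = 0.0;
        for (int32_t j = 0; j < n_embd; ++j) {
            norm += (double) e[j] * e[j];
        }
        printf("seq %d: |embd| = %f\n", s, std::sqrt(norm));
    }
}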
// //
// Vocab // Vocab
// //

View file

@ -1 +1 @@
b458250b736a7473f7ff3560d47c93f1644f3290 8695910a39102609073d0e099aa7c97d6bcb3bf9

View file

@ -1412,6 +1412,50 @@ struct test_pad : public test_case {
} }
}; };
// GGML_OP_ARANGE
struct test_arange : public test_case {
const ggml_type type;
const float start;
const float stop;
const float step;
std::string vars() override {
return VARS_TO_STR4(type, start, stop, step);
}
test_arange(ggml_type type = GGML_TYPE_F32,
float start = 0.f, float stop = 10.f, float step = 1.f)
: type(type), start(start), stop(stop), step(step) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * out = ggml_arange(ctx, start, stop, step);
return out;
}
};
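For reference (hedged, based on the usual arange semantics):

// ggml_arange(ctx, start, stop, step) is expected to yield a 1-D F32 tensor with
// the values start, start + step, start + 2*step, ... strictly below stop.
// With the defaults above (start = 0, stop = 10, step = 1) that is the 10 values
// 0, 1, 2, ..., 9.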
// GGML_OP_TIMESTEP_EMBEDDING
struct test_timestep_embedding : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne_a;
const int dim;
const int max_period;
std::string vars() override {
return VARS_TO_STR4(type, ne_a, dim, max_period);
}
test_timestep_embedding(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_a = {2, 1, 1, 1},
int dim = 320, int max_period=10000)
: type(type), ne_a(ne_a), dim(dim), max_period(max_period) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period);
return out;
}
};
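For context, a reference-only sketch of the sinusoidal timestep embedding this op is modeled on (the common diffusion-model convention; ggml's exact output layout and odd-dim handling may differ):

#include <cmath>
#include <vector>

// One timestep t mapped to a dim-sized vector: half cosines and half sines over
// geometrically spaced frequencies derived from max_period.
static std::vector<float> timestep_embedding_ref(float t, int dim, int max_period) {
    const int half = dim / 2;
    std::vector<float> out(dim, 0.0f);
    for (int i = 0; i < half; ++i) {
        const float freq = std::exp(-std::log((float) max_period) * (float) i / (float) half);
        out[i]        = std::cos(t * freq);
        out[half + i] = std::sin(t * freq);
    }
    return out;
}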
// GGML_OP_LEAKY_RELU // GGML_OP_LEAKY_RELU
struct test_leaky_relu : public test_case { struct test_leaky_relu : public test_case {
const ggml_type type; const ggml_type type;
@ -2126,6 +2170,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_group_norm()); test_cases.emplace_back(new test_group_norm());
test_cases.emplace_back(new test_acc()); test_cases.emplace_back(new test_acc());
test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_pad());
test_cases.emplace_back(new test_arange());
test_cases.emplace_back(new test_timestep_embedding());
test_cases.emplace_back(new test_leaky_relu()); test_cases.emplace_back(new test_leaky_relu());
// these tests are disabled to save execution time, but they can be handy for debugging // these tests are disabled to save execution time, but they can be handy for debugging