Merge branch 'master' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.gitignore
#	CMakeLists.txt
#	Makefile
#	README.md
Commit: a0aa620718
Author: Concedo
Date:   2023-09-05 21:49:24 +08:00

28 changed files with 3165 additions and 2285 deletions


@@ -17,3 +17,6 @@ indent_style = tab
[prompts/*.txt]
insert_final_newline = unset
+[examples/server/public/*]
+indent_size = 2

.github/workflows/code-coverage.yml (new file)

@@ -0,0 +1,36 @@
name: Code Coverage
on: [push, pull_request]

env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1

jobs:
  run:
    runs-on: ubuntu-20.04
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential gcc-8 lcov
      - name: Build
        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
      - name: Run tests
        run: CC=gcc-8 make test
      - name: Generate coverage report
        run: |
          make coverage
          make lcov-report
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        with:
          files: lcov-report/coverage.info


@@ -12,9 +12,18 @@ let package = Package(
        name: "llama",
        path: ".",
        exclude: ["ggml-metal.metal"],
-       sources: ["ggml.c", "llama.cpp"],
+       sources: [
+           "ggml.c",
+           "llama.cpp",
+           "ggml-alloc.c",
+           "k_quants.c"
+       ],
        publicHeadersPath: "spm-headers",
-       cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
+       cSettings: [
+           .unsafeFlags(["-Wno-shorten-64-to-32"]),
+           .define("GGML_USE_K_QUANTS"),
+           .define("GGML_USE_ACCELERATE")
+       ],
        linkerSettings: [
            .linkedFramework("Accelerate")
        ]

codecov.yml (new file)

@@ -0,0 +1,14 @@
comment: off

coverage:
  status:
    project:
      default:
        target: auto
        threshold: 0
        base: auto
    patch:
      default:
        target: auto
        threshold: 0
        base: auto


@@ -305,6 +305,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_keep = std::stoi(argv[i]);
+       } else if (arg == "--draft") {
+           if (++i >= argc) {
+               invalid_param = true;
+               break;
+           }
+           params.n_draft = std::stoi(argv[i]);
        } else if (arg == "--chunks") {
            if (++i >= argc) {
                invalid_param = true;
@@ -317,6 +323,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.model = argv[i];
+       } else if (arg == "-md" || arg == "--model-draft") {
+           if (++i >= argc) {
+               invalid_param = true;
+               break;
+           }
+           params.model_draft = argv[i];
        } else if (arg == "-a" || arg == "--alias") {
            if (++i >= argc) {
                invalid_param = true;
@@ -638,6 +650,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
    fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
    fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+   fprintf(stdout, " --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
    fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
    if (llama_mlock_supported()) {
        fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
@@ -669,6 +682,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
    fprintf(stdout, " -m FNAME, --model FNAME\n");
    fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
+   fprintf(stdout, " -md FNAME, --model-draft FNAME\n");
+   fprintf(stdout, " draft model for speculative decoding (default: %s)\n", params.model.c_str());
    fprintf(stdout, " -ld LOGDIR, --logdir LOGDIR\n");
    fprintf(stdout, " path under which to save YAML logs (no logging if unset)\n");
    fprintf(stdout, "\n");
@@ -702,7 +717,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    lparams.n_ctx = params.n_ctx;
    lparams.n_batch = params.n_batch;
+   if (params.n_gpu_layers != -1) {
        lparams.n_gpu_layers = params.n_gpu_layers;
+   }
    lparams.main_gpu = params.main_gpu;
    lparams.tensor_split = params.tensor_split;
    lparams.low_vram = params.low_vram;
@@ -752,6 +769,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
    }
+
+   {
+       LOG("warming up the model with an empty run\n");
+
+       const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
+       llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads);
+       llama_reset_timings(lctx);
+   }
+
    return std::make_tuple(model, lctx);
}
@@ -824,6 +849,130 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
    return result;
}

+//
+// Sampling utils
+//
+
+llama_token llama_sample_token(
+        struct llama_context * ctx,
+        struct llama_context * ctx_guidance,
+        struct llama_grammar * grammar,
+        const struct gpt_params & params,
+        const std::vector<llama_token> & last_tokens,
+        std::vector<llama_token_data> & candidates,
+        int idx) {
+    const int n_ctx = llama_n_ctx(ctx);
+    const int n_vocab = llama_n_vocab(ctx);
+
+    const float temp = params.temp;
+    const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
+    const float top_p = params.top_p;
+    const float tfs_z = params.tfs_z;
+    const float typical_p = params.typical_p;
+    const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+    const float repeat_penalty = params.repeat_penalty;
+    const float alpha_presence = params.presence_penalty;
+    const float alpha_frequency = params.frequency_penalty;
+    const int mirostat = params.mirostat;
+    const float mirostat_tau = params.mirostat_tau;
+    const float mirostat_eta = params.mirostat_eta;
+    const bool penalize_nl = params.penalize_nl;
+
+    llama_token id = 0;
+
+    float * logits = llama_get_logits(ctx) + idx * n_vocab;
+
+    // Apply params.logit_bias map
+    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+        logits[it->first] += it->second;
+    }
+
+    candidates.clear();
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+    }
+
+    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
+
+    if (ctx_guidance) {
+        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
+    }
+
+    // apply penalties
+    if (!last_tokens.empty()) {
+        const float nl_logit = logits[llama_token_nl(ctx)];
+        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
+
+        llama_sample_repetition_penalty(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, repeat_penalty);
+        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, alpha_frequency, alpha_presence);
+
+        if (!penalize_nl) {
+            for (size_t idx = 0; idx < cur_p.size; idx++) {
+                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
+                    cur_p.data[idx].logit = nl_logit;
+                    break;
+                }
+            }
+        }
+    }
+
+    if (grammar != NULL) {
+        llama_sample_grammar(ctx, &cur_p, grammar);
+    }
+
+    if (temp <= 0) {
+        // Greedy sampling
+        id = llama_sample_token_greedy(ctx, &cur_p);
+    } else {
+        if (mirostat == 1) {
+            static float mirostat_mu = 2.0f * mirostat_tau;
+            const int mirostat_m = 100;
+            llama_sample_temperature(ctx, &cur_p, temp);
+            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+        } else if (mirostat == 2) {
+            static float mirostat_mu = 2.0f * mirostat_tau;
+            llama_sample_temperature(ctx, &cur_p, temp);
+            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+        } else {
+            // Temperature sampling
+            llama_sample_top_k (ctx, &cur_p, top_k, 1);
+            llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
+            llama_sample_typical (ctx, &cur_p, typical_p, 1);
+            llama_sample_top_p (ctx, &cur_p, top_p, 1);
+            llama_sample_temperature(ctx, &cur_p, temp);
+
+            {
+                const int n_top = 10;
+                LOG("top %d candidates:\n", n_top);
+
+                for (int i = 0; i < n_top; i++) {
+                    const llama_token id = cur_p.data[i].id;
+                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
+                }
+            }
+
+            id = llama_sample_token(ctx, &cur_p);
+
+            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
+        }
+    }
+    // printf("`%d`", candidates_p.size);
+
+    if (grammar != NULL) {
+        llama_grammar_accept_token(ctx, grammar, id);
+    }
+
+    return id;
+}
+
+//
+// YAML utils
+//
+
// returns true if successful, false otherwise
bool create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
@@ -1062,9 +1211,10 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
    fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
+   fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
    fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-   fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);
+   fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");


@@ -32,8 +32,9 @@ struct gpt_params {
    int32_t n_ctx = 512; // context size
    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+   int32_t n_draft = 16; // number of tokens to draft during speculative decoding
    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
-   int32_t n_gpu_layers = 0; // number of layers to store in VRAM
+   int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
    float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.

@@ -63,6 +64,7 @@ struct gpt_params {
    float cfg_scale = 1.f; // How strong is guidance
    std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+   std::string model_draft = ""; // draft model for speculative decoding
    std::string model_alias = "unknown"; // model alias
    std::string prompt = "";
    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state

@@ -156,6 +158,40 @@ std::string llama_detokenize_bpe(
    llama_context * ctx,
    const std::vector<llama_token> & tokens);

+//
+// Sampling utils
+//
+
+// this is a common sampling function used across the examples for convenience
+// it can serve as a starting point for implementing your own sampling function
+//
+// required:
+//  - ctx: context to use for sampling
+//  - params: sampling parameters
+//
+// optional:
+//  - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
+//  - grammar: grammar to use for sampling, ignore if NULL
+//  - last_tokens: needed for repetition penalty, ignore if empty
+//  - idx: sample from llama_get_logits(ctx) + idx * n_vocab
+//
+// returns:
+//  - token: sampled token
+//  - candidates: vector of candidate tokens
+//
+llama_token llama_sample_token(
+        struct llama_context * ctx,
+        struct llama_context * ctx_guidance,
+        struct llama_grammar * grammar,
+        const struct gpt_params & params,
+        const std::vector<llama_token> & last_tokens,
+        std::vector<llama_token_data> & candidates,
+        int idx = 0);
+
+//
+// YAML utils
+//
+
bool create_directory_with_parents(const std::string & path);
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
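For orientation, here is a minimal sketch of how the new helper is meant to be called, based only on the declaration above and the way the updated examples/main loop below uses it; the llama_context * ctx, the populated gpt_params, and the surrounding generation loop are assumed to already exist:

    // per-step sampling with the shared helper; guidance and grammar unused here (NULL)
    std::vector<llama_token> last_tokens(llama_n_ctx(ctx), 0);   // recent-token window for repetition penalties

    std::vector<llama_token_data> candidates;
    candidates.reserve(llama_n_vocab(ctx));                      // reused between calls, filled inside the helper

    const llama_token id = llama_sample_token(ctx, NULL, NULL, params, last_tokens, candidates);

    last_tokens.erase(last_tokens.begin());                      // slide the window by one token
    last_tokens.push_back(id);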


@@ -58,7 +58,7 @@ def parse_args() -> argparse.Namespace:
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
-   parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
+   parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
    return parser.parse_args()

args = parse_args()


@@ -323,15 +323,27 @@ class BpeVocab:
        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
+           # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
+           # Fall back to trying to find the added tokens in tokenizer.json
+           tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
+           if not tokenizer_json_file.is_file():
                added_tokens = {}
+           else:
+               tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
+               added_tokens = dict(
+                   (item['content'], item['id'])
+                   for item in tokenizer_json.get('added_tokens', [])
+                   # Added tokens here can be duplicates of the main vocabulary.
+                   if item['content'] not in self.bpe_tokenizer )

        vocab_size: int = len(self.bpe_tokenizer)
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids = sorted(added_tokens.values())
        if expected_ids != actual_ids:
-           raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+           expected_end_id = vocab_size + len(actual_ids) - 1
+           raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_list = [text for (text, idx) in items]
@@ -345,10 +357,22 @@ class BpeVocab:
        from transformers.models.gpt2 import tokenization_gpt2  # type: ignore[import]
        byte_encoder = tokenization_gpt2.bytes_to_unicode()
        byte_decoder = {v: k for k, v in byte_encoder.items()}
+       score = 0.0
        for i, item in enumerate(tokenizer):
            text: bytes = item.encode("utf-8")
-           score: float = -i
-           yield text, score, gguf.TokenType.USER_DEFINED
+           # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
+           if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
+               if i == 0 and text == b'<unk>':
+                   toktype = gguf.TokenType.UNKNOWN
+               elif i == 1 or i == 2:
+                   toktype = gguf.TokenType.CONTROL
+               elif i >= 3 and text.startswith(b'<0x'):
+                   toktype = gguf.TokenType.BYTE
+               else:
+                   toktype = gguf.TokenType.NORMAL
+           else:
+               toktype = gguf.TokenType.NORMAL
+           yield text, score, toktype

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:


@@ -23,6 +23,7 @@ else()
    add_subdirectory(train-text-from-scratch)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(simple)
+   add_subdirectory(speculative)
    add_subdirectory(embd-input)
    add_subdirectory(llama-bench)
    add_subdirectory(beam-search)


@@ -660,9 +660,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
ggml_tensor * gpt_neox_ff(
    const gpt_neox_block &block,
    ggml_context * ctx0,
-   ggml_tensor * inp) {
+   ggml_tensor * inp,
+   const gpt_neox_hparams &hparams) {
-   ggml_tensor * cur = ggml_norm(ctx0, inp);
+   ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps);

    cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
    cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);

@@ -753,7 +754,7 @@ bool gpt_neox_eval(
        // self-attention
        {
            {
-               cur = ggml_norm(ctx0, inpL);
+               cur = ggml_norm(ctx0, inpL, hparams.norm_eps);

                cur = ggml_add(ctx0,
                    ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),

@@ -844,7 +845,7 @@ bool gpt_neox_eval(
        if (hparams.par_res == 0) {
            struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);

-           cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF);
+           cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams);

            // input for next layer
            inpL = ggml_add(ctx0, cur, inpFF);

@@ -853,7 +854,7 @@ bool gpt_neox_eval(
            // this is independent of the self-attention result, so it could be done in parallel to the self-attention
            // note here we pass inpL instead of cur
-           cur = gpt_neox_ff(model.blocks[il], ctx0, inpL);
+           cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams);

            // layer input + FF
            cur = ggml_add(ctx0, cur, inpFF);

@@ -867,7 +868,7 @@ bool gpt_neox_eval(
    // norm
    {
-       inpL = ggml_norm(ctx0, inpL);
+       inpL = ggml_norm(ctx0, inpL, hparams.norm_eps);

        // inpL = ln_f_g*inpL + ln_f_b
        inpL = ggml_add(ctx0,

examples/llama-bench/llama-bench.cpp (mode changed: Executable file → Normal file; no content changes)


@@ -151,14 +151,6 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
    }

-   if (params.n_ctx > 2048) {
-       // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
-       LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
-   } else if (params.n_ctx < 8) {
-       LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
-       params.n_ctx = 8;
-   }
-
    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

    if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -194,6 +186,13 @@ int main(int argc, char ** argv) {
        return 1;
    }

+   if (params.n_ctx > llama_n_ctx(ctx)) {
+       LOG_TEE("%s: warning: base model only supports context sizes no greater than %d tokens (%d specified)\n", __func__, llama_n_ctx(ctx), params.n_ctx);
+   } else if (params.n_ctx < 8) {
+       LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+       params.n_ctx = 8;
+   }
+
    // print system information
    {
        LOG_TEE("\n");
@@ -425,8 +424,9 @@ int main(int argc, char ** argv) {
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");

+   struct llama_grammar * grammar = NULL;
    grammar_parser::parse_state parsed_grammar;
-   llama_grammar * grammar = NULL;
+
    if (!params.grammar.empty()) {
        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
        // will be empty (default) if there are parse errors
@@ -450,8 +450,8 @@ int main(int argc, char ** argv) {
    }

    // TODO: replace with ring-buffer
-   std::vector<llama_token> last_n_tokens(n_ctx);
-   std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+   std::vector<llama_token> last_tokens(n_ctx);
+   std::fill(last_tokens.begin(), last_tokens.end(), 0);

    if (params.interactive) {
        const char *control_message;
@@ -492,13 +492,10 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;

-   {
-       LOG("warming up the model with an empty run\n");
-       const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
-       llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
-       llama_reset_timings(ctx);
-   }
+   const int n_vocab = llama_n_vocab(ctx);
+
+   std::vector<llama_token_data> candidates;
+   candidates.reserve(n_vocab);

    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
@@ -537,8 +534,8 @@ int main(int argc, char ** argv) {
                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);

-               // insert n_left/2 tokens at the start of embd from last_n_tokens
-               embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
+               // insert n_left/2 tokens at the start of embd from last_tokens
+               embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size());

                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
@@ -637,20 +634,6 @@ int main(int argc, char ** argv) {
            embd_guidance.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-           const float temp = params.temp;
-           const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
-           const float top_p = params.top_p;
-           const float tfs_z = params.tfs_z;
-           const float typical_p = params.typical_p;
-           const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-           const float repeat_penalty = params.repeat_penalty;
-           const float alpha_presence = params.presence_penalty;
-           const float alpha_frequency = params.frequency_penalty;
-           const int mirostat = params.mirostat;
-           const float mirostat_tau = params.mirostat_tau;
-           const float mirostat_eta = params.mirostat_eta;
-           const bool penalize_nl = params.penalize_nl;

            // optionally save the session on first sample (for faster prompt loading next time)
            if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
                need_to_save_session = false;
@@ -659,98 +642,12 @@ int main(int argc, char ** argv) {
                LOG("saved session to %s\n", path_session.c_str());
            }

-           llama_token id = 0;
-
-           {
-               auto logits = llama_get_logits(ctx);
-               auto n_vocab = llama_n_vocab(ctx);
-
-               // Apply params.logit_bias map
-               for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-                   logits[it->first] += it->second;
-               }
-
-               std::vector<llama_token_data> candidates;
-               candidates.reserve(n_vocab);
-               for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                   candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-               }
-
-               llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
-
-               if (ctx_guidance) {
-                   llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
-               }
-
-               // Apply penalties
-               float nl_logit = logits[llama_token_nl(ctx)];
-               auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-               llama_sample_repetition_penalty(ctx, &cur_p,
-                   last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                   last_n_repeat, repeat_penalty);
-               llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
-                   last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                   last_n_repeat, alpha_frequency, alpha_presence);
-               if (!penalize_nl) {
-                   for (size_t idx = 0; idx < cur_p.size; idx++) {
-                       if (cur_p.data[idx].id == llama_token_nl(ctx)) {
-                           cur_p.data[idx].logit = nl_logit;
-                           break;
-                       }
-                   }
-               }
-
-               if (grammar != NULL) {
-                   llama_sample_grammar(ctx, &cur_p, grammar);
-               }
-
-               if (temp <= 0) {
-                   // Greedy sampling
-                   id = llama_sample_token_greedy(ctx, &cur_p);
-               } else {
-                   if (mirostat == 1) {
-                       static float mirostat_mu = 2.0f * mirostat_tau;
-                       const int mirostat_m = 100;
-                       llama_sample_temperature(ctx, &cur_p, temp);
-                       id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-                   } else if (mirostat == 2) {
-                       static float mirostat_mu = 2.0f * mirostat_tau;
-                       llama_sample_temperature(ctx, &cur_p, temp);
-                       id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-                   } else {
-                       // Temperature sampling
-                       llama_sample_top_k (ctx, &cur_p, top_k, 1);
-                       llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
-                       llama_sample_typical (ctx, &cur_p, typical_p, 1);
-                       llama_sample_top_p (ctx, &cur_p, top_p, 1);
-                       llama_sample_temperature(ctx, &cur_p, temp);
-
-                       {
-                           const int n_top = 10;
-                           LOG("top %d candidates:\n", n_top);
-
-                           for (int i = 0; i < n_top; i++) {
-                               const llama_token id = cur_p.data[i].id;
-                               LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
-                           }
-                       }
-
-                       id = llama_sample_token(ctx, &cur_p);
-
-                       LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
-                   }
-               }
-               // printf("`%d`", candidates_p.size);
-
-               if (grammar != NULL) {
-                   llama_grammar_accept_token(ctx, grammar, id);
-               }
-
-               last_n_tokens.erase(last_n_tokens.begin());
-               last_n_tokens.push_back(id);
-
-               LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_n_tokens));
-           }
+           const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
+
+           last_tokens.erase(last_tokens.begin());
+           last_tokens.push_back(id);
+
+           LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));

            embd.push_back(id);
@@ -766,8 +663,8 @@ int main(int argc, char ** argv) {
            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);
-               last_n_tokens.erase(last_n_tokens.begin());
-               last_n_tokens.push_back(embd_inp[n_consumed]);
+               last_tokens.erase(last_tokens.begin());
+               last_tokens.push_back(embd_inp[n_consumed]);
                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
@@ -800,7 +697,7 @@ int main(int argc, char ** argv) {
        // check for reverse prompt
        if (params.antiprompt.size()) {
            std::string last_output;
-           for (auto id : last_n_tokens) {
+           for (auto id : last_tokens) {
                last_output += llama_token_to_piece(ctx, id);
            }
@@ -831,7 +728,7 @@ int main(int argc, char ** argv) {
        }

        // deal with end of text token in interactive mode
-       if (last_n_tokens.back() == llama_token_eos(ctx)) {
+       if (last_tokens.back() == llama_token_eos(ctx)) {
            LOG("found EOS token\n");

            if (params.interactive) {


@@ -368,7 +368,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens. Then, we split the input up into context window size chunks to
        // process the entire prompt.
-       const int first = std::min(512, params.n_ctx/2);
+       const int first = params.n_ctx/2;
        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
        count += params.n_ctx - first - 1;

@@ -668,11 +668,6 @@ int main(int argc, char ** argv) {
        params.n_ctx += params.ppl_stride/2;
    }

-   if (params.n_ctx > 2048) {
-       fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
-               "expect poor results\n", __func__, params.n_ctx);
-   }
-
    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

    if (params.seed == LLAMA_DEFAULT_SEED) {

@@ -698,6 +693,11 @@ int main(int argc, char ** argv) {
        return 1;
    }

+   if (params.n_ctx > llama_n_ctx(ctx)) {
+       fprintf(stderr, "%s: warning: model might not support context sizes greater than %d tokens (%d specified);"
+               "expect poor results\n", __func__, llama_n_ctx(ctx), params.n_ctx);
+   }
+
    // print system information
    {
        fprintf(stderr, "\n");

File diff suppressed because it is too large.


@@ -145,7 +145,29 @@
      color: #888;
    }

+   @keyframes loading-bg-wipe {
+     0% {
+       background-position: 0%;
+     }
+     100% {
+       background-position: 100%;
+     }
+   }
+
+   .loading {
+     --loading-color-1: #eeeeee00;
+     --loading-color-2: #eeeeeeff;
+     background-size: 50% 100%;
+     background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
+     animation: loading-bg-wipe 2s linear infinite;
+   }
+
    @media (prefers-color-scheme: dark) {
+     .loading {
+       --loading-color-1: #22222200;
+       --loading-color-2: #222222ff;
+     }
      .popover-content {
        background-color: black;
      }
@@ -321,7 +343,10 @@
      const llamaStats = signal(null)
      const controller = signal(null)

-     const generating = computed(() => controller.value == null )
+     // currently generating a completion?
+     const generating = computed(() => controller.value != null)
+
+     // has the user started a chat?
      const chatStarted = computed(() => session.value.transcript.length > 0)

      const transcriptUpdate = (transcript) => {
@@ -430,11 +455,19 @@
      return html`
        <form onsubmit=${submit}>
          <div>
-           <textarea type="text" rows=2 onkeypress=${enterSubmits} value="${message}" oninput=${(e) => message.value = e.target.value} placeholder="Say something..."/>
+           <textarea
+              className=${generating.value ? "loading" : null}
+              oninput=${(e) => message.value = e.target.value}
+              onkeypress=${enterSubmits}
+              placeholder="Say something..."
+              rows=2
+              type="text"
+              value="${message}"
+           />
          </div>
          <div class="right">
-           <button type="submit" disabled=${!generating.value} >Send</button>
-           <button onclick=${stop} disabled=${generating}>Stop</button>
+           <button type="submit" disabled=${generating.value}>Send</button>
+           <button onclick=${stop} disabled=${!generating.value}>Stop</button>
            <button onclick=${reset}>Reset</button>
          </div>
        </form>


@@ -0,0 +1,8 @@
set(TARGET speculative)
add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@@ -0,0 +1,292 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "build-info.h"

#include "common.h"
#include "llama.h"
#include "grammar-parser.h"

#include <cmath>
#include <cstdio>
#include <string>
#include <vector>

int main(int argc, char ** argv) {
    gpt_params params;

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    if (params.model_draft.empty()) {
        fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
        return 1;
    }

#ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("speculative", "log"));
    LOG_TEE("Log start\n");
    log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS

    // init llama.cpp
    llama_backend_init(params.numa);

    llama_model * model_tgt = NULL;
    llama_model * model_dft = NULL;

    llama_context * ctx_tgt = NULL;
    llama_context * ctx_dft = NULL;

    // load the target model
    params.perplexity = true; // HACK: enable logits_all = true
    std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);

    // load the draft model
    params.model = params.model_draft;
    std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);

    // tokenize the prompt
    std::vector<llama_token> inp;
    inp = ::llama_tokenize(ctx_tgt, params.prompt, true);

    const int max_context_size = llama_n_ctx(ctx_tgt);
    const int max_tokens_list_size = max_context_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
        return 1;
    }

    fprintf(stderr, "\n\n");

    for (auto id : inp) {
        fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
    }

    fflush(stderr);

    const int n_input = inp.size();

    const auto t_enc_start = ggml_time_us();

    // eval the prompt with both models
    llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads);
    llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads);
    llama_eval(ctx_dft, inp.data(), int(inp.size()), 0, params.n_threads);

    const auto t_enc_end = ggml_time_us();

    // the 2 models should have the same vocab
    const int n_ctx = llama_n_ctx(ctx_tgt);
    const int n_vocab = llama_n_vocab(ctx_tgt);
    //GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));

    // how many tokens to draft each time
    const int n_draft = params.n_draft;

    int n_predict = 0;
    int n_drafted = 0;
    int n_accept = 0;

    int n_past_tgt = inp.size();
    int n_past_dft = inp.size();

    std::vector<llama_token> drafted;

    std::vector<llama_token> last_tokens(n_ctx);
    std::fill(last_tokens.begin(), last_tokens.end(), 0);

    for (auto & id : inp) {
        last_tokens.erase(last_tokens.begin());
        last_tokens.push_back(id);
    }

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);

    // used to determine end of generation
    bool has_eos = false;

    // grammar stuff
    struct llama_grammar * grammar_dft = NULL;
    struct llama_grammar * grammar_tgt = NULL;

    grammar_parser::parse_state parsed_grammar;

    // if requested - load the grammar, error checking is omitted for brevity
    if (!params.grammar.empty()) {
        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
        // will be empty (default) if there are parse errors
        if (parsed_grammar.rules.empty()) {
            return 1;
        }

        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
        grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
    }

    const auto t_dec_start = ggml_time_us();

    while (true) {
        LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));

        int i_dft = 0;
        while (true) {
            // sample from the target model
            const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);

            // remember which tokens were sampled - used for repetition penalties during sampling
            last_tokens.erase(last_tokens.begin());
            last_tokens.push_back(id);

            //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));

            const std::string token_str = llama_token_to_piece(ctx_tgt, id);
            printf("%s", token_str.c_str());
            fflush(stdout);

            if (id == llama_token_eos(ctx_tgt)) {
                has_eos = true;
            }

            ++n_predict;

            // check if the draft matches the target
            if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
                LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
                ++n_accept;
                ++n_past_tgt;
                ++n_past_dft;
                ++i_dft;

                continue;
            }

            // the drafted token was rejected or we are out of drafted tokens
            if (i_dft < (int) drafted.size()) {
                LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
                        i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
            } else {
                LOG("out of drafted tokens\n");
            }

            llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
            ++n_past_dft;

            drafted.clear();
            drafted.push_back(id);

            break;
        }

        if (n_predict > params.n_predict || has_eos) {
            break;
        }

        if (grammar_tgt) {
            if (grammar_dft) {
                llama_grammar_free(grammar_dft);
            }
            grammar_dft = llama_grammar_copy(grammar_tgt);

            LOG("copied target grammar to draft grammar\n");
        }

        // sample n_draft tokens from the draft model using greedy decoding
        int n_past_cur = n_past_dft;
        for (int i = 0; i < n_draft; ++i) {
            float * logits = llama_get_logits(ctx_dft);

            candidates.clear();
            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
            }

            llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };

            if (grammar_dft != NULL) {
                llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
            }

            // computes softmax and sorts the candidates
            llama_sample_softmax(ctx_dft, &cur_p);

            for (int i = 0; i < 3; ++i) {
                LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str());
            }

            // TODO: better logic?
            if (cur_p.data[0].p < 2*cur_p.data[1].p) {
                LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
                break;
            }

            // drafted token
            const llama_token id = cur_p.data[0].id;

            drafted.push_back(id);
            ++n_drafted;

            // no need to evaluate the last drafted token, since we won't use the result
            if (i == n_draft - 1) {
                break;
            }

            // evaluate the drafted token on the draft model
            llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
            ++n_past_cur;

            if (grammar_dft != NULL) {
                llama_grammar_accept_token(ctx_dft, grammar_dft, id);
            }
        }

        // evaluate the target model on the drafted tokens
        llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
        ++n_past_tgt;

        // the first token is always proposed by the traget model before the speculation loop
        drafted.erase(drafted.begin());
    }

    auto t_dec_end = ggml_time_us();

    LOG_TEE("\n\n");

    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

    // TODO: make sure these numbers are computed correctly
    LOG_TEE("\n");
    LOG_TEE("n_draft = %d\n", n_draft);
    LOG_TEE("n_predict = %d\n", n_predict);
    LOG_TEE("n_drafted = %d\n", n_drafted);
    LOG_TEE("n_accept = %d\n", n_accept);
    LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);

    LOG_TEE("\ndraft:\n");
    llama_print_timings(ctx_dft);

    LOG_TEE("\ntarget:\n");
    llama_print_timings(ctx_tgt);

    llama_free(ctx_tgt);
    llama_free_model(model_tgt);

    llama_free(ctx_dft);
    llama_free_model(model_dft);

    if (grammar_dft != NULL) {
        llama_grammar_free(grammar_dft);
        llama_grammar_free(grammar_tgt);
    }
    llama_backend_free();

    fprintf(stderr, "\n\n");

    return 0;
}


@@ -1,3 +1,8 @@
+// defines MAP_ANONYMOUS
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
#include "ggml-alloc.h"
#include "ggml.h"
#include <assert.h>

@@ -6,6 +11,26 @@
#include <stdlib.h>
#include <string.h>

+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/types.h>
+            #include <sys/mman.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #include <memoryapi.h>
+#endif
+
#define UNUSED(x) (void)(x)
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -99,19 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
}
#endif

-static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    return ggml_nbytes(tensor);

    UNUSED(alloc);
}

+// check if a tensor is allocated by this buffer
+static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+}
+
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
#ifdef GGML_ALLOCATOR_DEBUG
    GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
#endif
-   size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+   size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
    size = aligned_offset(NULL, size, alloc->alignment);
    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -177,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
}

// this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    void * ptr = tensor->data;

-   if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+   if (ggml_allocr_is_own(alloc, tensor) == false) {
        // the tensor was not allocated in this buffer
        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
        // the easiest way to deal with this is just to ignore it
        return;
    }

-   size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+   size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
    size = aligned_offset(NULL, size, alloc->alignment);
    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
@@ -281,24 +311,64 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
    return alloc;
}

-// address and size of the buffer when measuring
-// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
-static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
-#if defined(__ARM_NEON) && !defined(__aarch64__)
-// 32-bit
-// TODO: Use for 32-bit x86 as well
-static const size_t MEASURE_MAX_SIZE = (1ULL<<32) - 1; // 4 GB
-#else
-// 64-bit
-static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
-#endif
+// OS specific functions to allocate and free uncommitted virtual memory
+static void * alloc_vmem(size_t size) {
+#if defined(_WIN32)
+    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+#elif defined(_POSIX_MAPPED_FILES)
+    return mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+#else
+    // use a fixed address for other platforms
+    uintptr_t base_addr = (uintptr_t)-size - 0x100;
+    return (void *)base_addr;
+#endif
+}
+
+static void free_vmem(void * base_addr, size_t size) {
+#if defined(_WIN32)
+    VirtualFree(base_addr, 0, MEM_RELEASE);
+    UNUSED(size);
+#elif defined(_POSIX_MAPPED_FILES)
+    munmap(base_addr, size);
+#else
+    // nothing to do
+    UNUSED(base_addr);
+    UNUSED(size);
+#endif
+}
+
+// allocate uncommitted virtual memory to measure the size of the graph
+static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+    // 1TB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+    do {
+        *base_addr = alloc_vmem(*size);
+        if (*base_addr != NULL) {
+            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+            return;
+        }
+        // try again with half the size
+        *size /= 2;
+    } while (*size > 0);
+
+    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+}
+
+static void free_measure_vmem(void * base_addr, size_t size) {
+    free_vmem(base_addr, size);
+}

struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);

+   void * base_addr;
+   size_t size;
+
+   alloc_measure_vmem(&base_addr, &size);
+
    *alloc = (struct ggml_allocr){
-       /*.data = */ MEASURE_BASE_ADDR,
-       /*.size = */ MEASURE_MAX_SIZE,
+       /*.data = */ base_addr,
+       /*.size = */ size,
        /*.alignment = */ alignment,
        /*.n_free_blocks = */ 0,
        /*.free_blocks = */ {{0}},
@@ -318,6 +388,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
}

void ggml_allocr_free(struct ggml_allocr * alloc) {
+   if (alloc->measure) {
+       free_measure_vmem(alloc->data, alloc->size);
+   }
    free(alloc);
}
@@ -387,8 +460,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
            }

            // if the node's data is external, then we cannot re-use it
-           if ((char *) parent->data < (char *) alloc->data ||
-               (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+           if (ggml_allocr_is_own(alloc, parent) == false) {
                AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                continue;
            }

@@ -422,7 +494,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
        }
    }

-static size_t ggml_allocator_alloc_graph_tensors_n(
+static size_t ggml_allocr_alloc_graph_tensors_n(
    struct ggml_allocr * alloc,
    struct ggml_cgraph ** graphs, int n_graphs,
    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -500,7 +572,6 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                AT_PRINTF("\n");
            }
-
            // update parents
            // update immediately if there is no parse_seq
            // update only at barriers if there is parse_seq
@@ -528,12 +599,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                        view_src_hn->n_views -= 1;
                        AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                        if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                           ggml_allocator_free_tensor(alloc, view_src);
+                           ggml_allocr_free_tensor(alloc, view_src);
                        }
                    }
                    else {
                        if (parent->data != node->data) {
-                           ggml_allocator_free_tensor(alloc, parent);
+                           ggml_allocr_free_tensor(alloc, parent);
                        }
                    }
                }

@@ -550,7 +621,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
            for (int i = 0; outputs[g][i] != NULL; i++) {
                struct ggml_tensor * output = outputs[g][i];
                AT_PRINTF("output: %s\n", output->name);
-               ggml_allocator_free_tensor(alloc, output);
+               ggml_allocr_free_tensor(alloc, output);
            }
        }
    }

@@ -559,5 +630,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
}

size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-   return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+   return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
}
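For context, this is the measure-allocator workflow that the renamed functions support: a minimal sketch, assuming a hypothetical build_graph() that constructs the same ggml_cgraph both times and an alignment of 32 bytes (the function names and signatures are the ones shown in this diff; everything else is illustrative):

    // 1) measure: the allocator is backed by reserved (uncommitted) virtual memory
    struct ggml_allocr * alloc = ggml_allocr_new_measure(32);
    const size_t mem_size = ggml_allocr_alloc_graph(alloc, build_graph());
    ggml_allocr_free(alloc);   // releases the reserved virtual memory

    // 2) allocate for real, using the measured size
    void * buf = malloc(mem_size);
    alloc = ggml_allocr_new(buf, mem_size, 32);
    ggml_allocr_alloc_graph(alloc, build_graph());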


@ -464,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
dst[i] = x[i] / (1.0f + expf(-x[i])); dst[i] = x[i] / (1.0f + expf(-x[i]));
} }
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
}
return a;
}
template <int block_size>
static __global__ void norm_f32(const float * x, float * dst, const int ncols) { static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
const int row = blockIdx.x*blockDim.y + threadIdx.y; const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x; const int tid = threadIdx.x;
const float eps = 1e-5f; const float eps = 1e-5f;
float mean = 0.0f; float2 mean_var = make_float2(0.f, 0.f);
float var = 0.0f;
for (int col = tid; col < ncols; col += WARP_SIZE) { for (int col = tid; col < ncols; col += block_size) {
const float xi = x[row*ncols + col]; const float xi = x[row*ncols + col];
mean += xi; mean_var.x += xi;
var += xi * xi; mean_var.y += xi * xi;
} }
// sum up partial sums // sum up partial sums
mean_var = warp_reduce_sum(mean_var);
if (block_size > WARP_SIZE) {
__shared__ float2 s_sum[32];
int warp_id = threadIdx.x / WARP_SIZE;
int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = mean_var;
}
__syncthreads();
mean_var = s_sum[lane_id];
mean_var = warp_reduce_sum(mean_var);
}
const float mean = mean_var.x / ncols;
const float var = mean_var.y / ncols - mean * mean;
const float inv_std = rsqrtf(var + eps);
for (int col = tid; col < ncols; col += block_size) {
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
}
}
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll #pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) { for (int mask = 16; mask > 0; mask >>= 1) {
mean += __shfl_xor_sync(0xffffffff, mean, mask, 32); x += __shfl_xor_sync(0xffffffff, x, mask, 32);
var += __shfl_xor_sync(0xffffffff, var, mask, 32); }
} return x;
mean /= ncols;
var = var / ncols - mean * mean;
const float inv_var = rsqrtf(var + eps);
for (int col = tid; col < ncols; col += WARP_SIZE) {
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
}
} }
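Numerically, the reworked norm_f32 kernel still performs an ordinary per-row normalization: one pass accumulates sum(x) and sum(x^2), mean and variance follow, and each element is scaled by rsqrt(var + eps); only the thread-level reduction changed. A scalar C++ reference of that arithmetic, with illustrative names that are not part of ggml:

// CPU reference for what the reworked norm_f32 kernel computes per row:
// a single pass accumulates sum(x) and sum(x^2), then mean/variance follow.
#include <cmath>
#include <cstdio>
#include <vector>

static void norm_row(const float * x, float * dst, int ncols, float eps = 1e-5f) {
    double sum = 0.0, sumsq = 0.0;
    for (int i = 0; i < ncols; ++i) {
        sum   += x[i];
        sumsq += (double) x[i] * x[i];
    }
    const float mean    = (float) (sum / ncols);
    const float var     = (float) (sumsq / ncols) - mean * mean;
    const float inv_std = 1.0f / std::sqrt(var + eps);
    for (int i = 0; i < ncols; ++i) {
        dst[i] = (x[i] - mean) * inv_std;
    }
}

int main() {
    std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f}, y(4);
    norm_row(x.data(), y.data(), 4);
    for (float v : y) std::printf("%.4f ", v);
    std::printf("\n");
    return 0;
}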
template <int block_size>
static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) { static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
const int row = blockIdx.x*blockDim.y + threadIdx.y; const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x; const int tid = threadIdx.x;
float tmp = 0.0f; // partial sum for thread in warp float tmp = 0.0f; // partial sum for thread in warp
for (int col = tid; col < ncols; col += WARP_SIZE) { for (int col = tid; col < ncols; col += block_size) {
const float xi = x[row*ncols + col]; const float xi = x[row*ncols + col];
tmp += xi * xi; tmp += xi * xi;
} }
// sum up partial sums // sum up partial sums
#pragma unroll tmp = warp_reduce_sum(tmp);
for (int mask = 16; mask > 0; mask >>= 1) { if (block_size > WARP_SIZE) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); __shared__ float s_sum[32];
int warp_id = threadIdx.x / WARP_SIZE;
int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
__syncthreads();
tmp = s_sum[lane_id];
tmp = warp_reduce_sum(tmp);
} }
const float mean = tmp / ncols; const float mean = tmp / ncols;
const float scale = rsqrtf(mean + eps); const float scale = rsqrtf(mean + eps);
for (int col = tid; col < ncols; col += WARP_SIZE) { for (int col = tid; col < ncols; col += block_size) {
dst[row*ncols + col] = scale * x[row*ncols + col]; dst[row*ncols + col] = scale * x[row*ncols + col];
} }
} }
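rms_norm_f32 follows the same block-size template pattern but with simpler math: no mean subtraction, just a scale of 1/sqrt(mean(x^2) + eps). A scalar reference, again with illustrative names:

// CPU reference for rms_norm_f32: scale each element by 1/sqrt(mean(x^2) + eps).
#include <cmath>
#include <cstdio>
#include <vector>

static void rms_norm_row(const float * x, float * dst, int ncols, float eps) {
    double sumsq = 0.0;
    for (int i = 0; i < ncols; ++i) {
        sumsq += (double) x[i] * x[i];
    }
    const float mean  = (float) (sumsq / ncols);
    const float scale = 1.0f / std::sqrt(mean + eps);
    for (int i = 0; i < ncols; ++i) {
        dst[i] = scale * x[i];
    }
}

int main() {
    std::vector<float> x = {1.0f, -2.0f, 3.0f, -4.0f}, y(4);
    rms_norm_row(x.data(), y.data(), 4, 1e-6f);
    for (float v : y) std::printf("%.4f ", v);
    std::printf("\n");
    return 0;
}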
@ -4203,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0); GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1); const dim3 block_dims(WARP_SIZE, 1, 1);
norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols); norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
} else {
const dim3 block_dims(1024, 1, 1);
norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
}
} }
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0); GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1); const dim3 block_dims(WARP_SIZE, 1, 1);
rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps); rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
}
} }
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) { static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {

View file

@ -76,6 +76,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(norm); GGML_METAL_DECL_KERNEL(norm);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@ -219,6 +220,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(norm);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@ -284,6 +286,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(rms_norm); GGML_METAL_DEL_KERNEL(rms_norm);
GGML_METAL_DEL_KERNEL(norm); GGML_METAL_DEL_KERNEL(norm);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32); GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32); GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32); GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@ -868,7 +871,11 @@ void ggml_metal_graph_compute(
{ {
nth0 = 32; nth0 = 32;
nth1 = 1; nth1 = 1;
if (ne11 * ne12 < 4) {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
} else {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
}
} break; } break;
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
{ {
@ -920,8 +927,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1); GGML_ASSERT(ne12 == 1);
nth0 = 2; nth0 = 4; //1;
nth1 = 32; nth1 = 8; //32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
} break; } break;
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
@ -969,9 +976,12 @@ void ggml_metal_graph_compute(
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 || if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) { src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q3_K) { else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64 #ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@ -985,8 +995,8 @@ void ggml_metal_graph_compute(
else if (src0t == GGML_TYPE_Q6_K) { else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; int64_t ny = (ne11 + 3)/4;
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
} }
} break; } break;

View file

@ -133,19 +133,24 @@ kernel void kernel_soft_max(
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
// broadcast //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
if (tpitg[0] == 0) { // the loop, and when that is done, buf[0] has the correct (synchronized) value
buf[0] = buf[0]; //if (tpitg[0] == 0) {
} // buf[0] = buf[0];
//}
threadgroup_barrier(mem_flags::mem_threadgroup); //threadgroup_barrier(mem_flags::mem_threadgroup);
const float max = buf[0]; const float max = buf[0];
// parallel sum // parallel sum
buf[tpitg[0]] = 0.0f; buf[tpitg[0]] = 0.0f;
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
buf[tpitg[0]] += exp(psrc0[i00] - max); const float exp_psrc0 = exp(psrc0[i00] - max);
buf[tpitg[0]] += exp_psrc0;
// Remember the result of exp here. exp is expensive, so we really do not
// wish to compute it twice.
pdst[i00] = exp_psrc0;
} }
// reduce // reduce
@ -157,17 +162,18 @@ kernel void kernel_soft_max(
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
// broadcast // broadcast - not needed, see above
if (tpitg[0] == 0) { //// broadcast
buf[0] = buf[0]; //if (tpitg[0] == 0) {
} // buf[0] = buf[0];
//}
threadgroup_barrier(mem_flags::mem_threadgroup); //threadgroup_barrier(mem_flags::mem_threadgroup);
const float sum = buf[0]; const float sum = buf[0];
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
pdst[i00] = exp(psrc0[i00] - max) / sum; pdst[i00] /= sum;
} }
} }
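The kernel_soft_max change is a caching optimization: exp(psrc0[i] - max) is stored into pdst during the summation pass, so the second pass only divides by the sum instead of recomputing the exponential. A scalar C++ reference of that pattern (the Metal threadgroup details are omitted):

// Single-row softmax mirroring the updated kernel_soft_max: exp(x - max) is
// computed once, cached in the output, then rescaled by the sum.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static void soft_max_row(const float * x, float * dst, int n) {
    const float max = *std::max_element(x, x + n);   // for numerical stability
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        const float e = std::exp(x[i] - max);        // computed exactly once
        dst[i] = e;                                  // cache it in dst ...
        sum   += e;
    }
    for (int i = 0; i < n; ++i) {
        dst[i] /= sum;                               // ... then just rescale
    }
}

int main() {
    std::vector<float> x = {1.0f, 2.0f, 3.0f}, y(3);
    soft_max_row(x.data(), y.data(), 3);
    std::printf("%.4f %.4f %.4f\n", y[0], y[1], y[2]);
    return 0;
}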
@ -214,25 +220,27 @@ kernel void kernel_norm(
} }
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
// broadcast //// broadcast
if (tpitg == 0) { //if (tpitg == 0) {
sum[0] /= ne00; // sum[0] /= ne00;
} //}
threadgroup_barrier(mem_flags::mem_threadgroup); //threadgroup_barrier(mem_flags::mem_threadgroup);
const float mean = sum[0]; const float mean = sum[0];
// recenter // recenter and VARIANCE
device float * y = dst + tgpig*ne00; device float * y = dst + tgpig*ne00;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
y[i00] = x[i00] - mean;
}
// VARIANCE
// parallel sum
sum[tpitg] = 0.0f; sum[tpitg] = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) { for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
y[i00] = x[i00] - mean;
sum[tpitg] += y[i00] * y[i00]; sum[tpitg] += y[i00] * y[i00];
} }
//// VARIANCE
//// parallel sum
//sum[tpitg] = 0.0f;
//for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
// sum[tpitg] += y[i00] * y[i00];
//}
// reduce // reduce
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg/2; i > 0; i /= 2) { for (uint i = ntg/2; i > 0; i /= 2) {
@ -241,11 +249,11 @@ kernel void kernel_norm(
} }
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
// broadcast //// broadcast
if (tpitg == 0) { //if (tpitg == 0) {
sum[0] /= ne00; // sum[0] /= ne00;
} //}
threadgroup_barrier(mem_flags::mem_threadgroup); //threadgroup_barrier(mem_flags::mem_threadgroup);
const float variance = sum[0]; const float variance = sum[0];
const float scale = 1.0f/sqrt(variance + eps); const float scale = 1.0f/sqrt(variance + eps);
@ -435,6 +443,8 @@ kernel void kernel_mul_mat_q4_1_f32(
mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
} }
#define NB_Q8_0 8
kernel void kernel_mul_mat_q8_0_f32( kernel void kernel_mul_mat_q8_0_f32(
device const void * src0, device const void * src0,
device const float * src1, device const float * src1,
@ -463,30 +473,30 @@ kernel void kernel_mul_mat_q8_0_f32(
device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0; device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
float yl[16]; float yl[NB_Q8_0];
float sumf[nr]={0.f}; float sumf[nr]={0.f};
const int ix = tiisg/2; const int ix = tiisg/4;
const int il = tiisg%2; const int il = tiisg%4;
device const float * yb = y + ix * QK8_0 + 16*il; device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
// each thread in a SIMD group deals with half a block. // each thread in a SIMD group deals with NB_Q8_0 quants at a time
for (int ib = ix; ib < nb; ib += nw/2) { for (int ib = ix; ib < nb; ib += nw/4) {
for (int i = 0; i < 16; ++i) { for (int i = 0; i < NB_Q8_0; ++i) {
yl[i] = yb[i]; yl[i] = yb[i];
} }
for (int row = 0; row < nr; row++) { for (int row = 0; row < nr; row++) {
device const int8_t * qs = x[ib+row*nb].qs + 16*il; device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
float sumq = 0.f; float sumq = 0.f;
for (int iq = 0; iq < 16; ++iq) { for (int iq = 0; iq < NB_Q8_0; ++iq) {
sumq += qs[iq] * yl[iq]; sumq += qs[iq] * yl[iq];
} }
sumf[row] += sumq*x[ib+row*nb].d; sumf[row] += sumq*x[ib+row*nb].d;
} }
yb += QK8_0 * 16; yb += NB_Q8_0 * nw;
} }
for (int row = 0; row < nr; ++row) { for (int row = 0; row < nr; ++row) {
@ -497,6 +507,60 @@ kernel void kernel_mul_mat_q8_0_f32(
} }
} }
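In q8_0, every block of QK8_0 = 32 int8 quants carries one scale d, and the kernel accumulates sum(qs[i]*y[i]) per block before multiplying by d; the change above only redistributes the work so that each thread of a 32-wide simdgroup handles NB_Q8_0 = 8 quants per iteration. A scalar C++ reference of the dot product itself, with a simplified block layout (float scale instead of ggml's fp16) that is illustrative only:

// Scalar reference for a q8_0 dot product: 32 int8 quants per block share one
// scale d; accumulate sum(qs[i]*y[i]) per block, then scale. Simplified layout.
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int QK8_0 = 32;

struct BlockQ8_0 {
    float  d;            // per-block scale
    int8_t qs[QK8_0];    // quantized values
};

static float dot_q8_0_f32(const BlockQ8_0 * x, const float * y, int nblocks) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nblocks; ++ib) {
        float sumq = 0.0f;
        for (int i = 0; i < QK8_0; ++i) {
            sumq += x[ib].qs[i] * y[ib*QK8_0 + i];
        }
        sumf += sumq * x[ib].d;
    }
    return sumf;
}

int main() {
    BlockQ8_0 blk = {0.05f, {}};
    std::vector<float> y(QK8_0, 1.0f);
    for (int i = 0; i < QK8_0; ++i) blk.qs[i] = (int8_t) (i - 16);
    std::printf("%.4f\n", dot_q8_0_f32(&blk, y.data(), 1));
    return 0;
}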
kernel void kernel_mul_mat_f16_f32_1row(
device const char * src0,
device const char * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
const int64_t im = tgpig.z;
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
float sumf = 0;
if (ne00 < 128) {
for (int i = tiisg; i < ne00; i += 32) {
sumf += (float) x[i] * (float) y[i];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
} else {
device const half4 * x4 = (device const half4 *) x;
device const float4 * y4 = (device const float4 *) y;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
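kernel_mul_mat_f16_f32_1row covers the small-batch case (the ggml-metal.m hunk above routes to it when ne11 * ne12 < 4): for long rows it reads half4/float4 vectors and lets lane 0 pick up the ne00 % 4 tail after the simd reduction. A plain C++ analogue of the vectorize-then-handle-the-tail pattern, using float in place of Metal's half:

// C++ analogue of the vectorize-then-handle-the-tail pattern: process 4
// elements per step, then a scalar loop covers the ne00 % 4 leftovers.
#include <cstdio>
#include <vector>

static float dot_vec4_with_tail(const float * x, const float * y, int n) {
    float sum = 0.0f;
    const int n4 = n / 4;
    for (int i = 0; i < n4; ++i) {            // "half4/float4" main loop
        for (int k = 0; k < 4; ++k) {
            sum += x[4*i + k] * y[4*i + k];
        }
    }
    for (int i = 4*n4; i < n; ++i) {          // scalar tail, handled once at the end
        sum += x[i] * y[i];
    }
    return sum;
}

int main() {
    std::vector<float> x(130, 0.5f), y(130, 2.0f);
    std::printf("%.1f\n", dot_vec4_with_tail(x.data(), y.data(), (int) x.size()));  // 130.0
    return 0;
}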
#define N_F16_F32 4
kernel void kernel_mul_mat_f16_f32( kernel void kernel_mul_mat_f16_f32(
device const char * src0, device const char * src0,
device const char * src1, device const char * src1,
@ -515,55 +579,58 @@ kernel void kernel_mul_mat_f16_f32(
constant uint64_t & nb12, constant uint64_t & nb12,
constant int64_t & ne0, constant int64_t & ne0,
constant int64_t & ne1, constant int64_t & ne1,
threadgroup float * sum [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpig[[thread_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) {
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 tptg[[threads_per_threadgroup]]) {
const int64_t r0 = tgpig.x; const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y; const int64_t rb = tgpig.y*N_F16_F32;
const int64_t im = tgpig.z; const int64_t im = tgpig.z;
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
if (ne00 < 128) {
for (int row = 0; row < N_F16_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
uint ith = tpitg.x; float sumf = 0;
uint nth = tptg.x; for (int i = tiisg; i < ne00; i += 32) {
sumf += (float) x[i] * (float) y[i];
sum[ith] = 0.0f;
for (int i = ith; i < ne00; i += nth) {
sum[ith] += (float) x[i] * (float) y[i];
} }
// accumulate the sum from all threads in the threadgroup float all_sum = simd_sum(sumf);
threadgroup_barrier(mem_flags::mem_threadgroup); if (tiisg == 0) {
if (ith%4 == 0) { dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i]; }
}
} else {
device const half4 * x4 = (device const half4 *)x;
for (int row = 0; row < N_F16_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
device const float4 * y4 = (device const float4 *) y;
float sumf = 0;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
} }
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%16 == 0) {
for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
} }
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) {
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
} }
// Original implementation. Left behind commented out for now
//threadgroup_barrier(mem_flags::mem_threadgroup);
//for (uint i = tptg.x/2; i > 0; i /= 2) {
// if (tpitg.x < i) {
// sum[tpitg.x] += sum[tpitg.x + i];
// }
// threadgroup_barrier(mem_flags::mem_threadgroup);
//}
//
//if (tpitg.x == 0) {
// dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
//}
} }
kernel void kernel_alibi_f32( kernel void kernel_alibi_f32(
@ -1262,7 +1329,8 @@ kernel void kernel_mul_mat_q4_K_f32(
const int r0 = tgpig.x; const int r0 = tgpig.x;
const int r1 = tgpig.y; const int r1 = tgpig.y;
const int r2 = tgpig.z; const int r2 = tgpig.z;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int first_row = r0 * N_DST;
const int ib_row = first_row * nb; const int ib_row = first_row * nb;
const uint offset0 = r2/gqa*(nb*ne0); const uint offset0 = r2/gqa*(nb*ne0);
device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;

View file

@ -1342,7 +1342,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
return; return;
} }
cl_mem mem = (cl_mem)tensor->data; cl_mem mem = (cl_mem)tensor->extra;
clReleaseMemObject(mem); clReleaseMemObject(mem);
} }
@ -1401,7 +1401,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
size_t d_size; size_t d_size;
cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0 cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted. cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
@ -1499,9 +1499,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
size_t d_size; size_t d_size;
cl_mem d_X; cl_mem d_X;
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data; d_X = (cl_mem) src0->extra;
} else { } else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
} }
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@ -1576,7 +1576,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
size_t d_size; size_t d_size;
cl_mem d_X; cl_mem d_X;
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data; d_X = (cl_mem) src0->extra;
} else { } else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
} }
@ -1707,7 +1707,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
events.emplace_back(); events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
} else if (src0->backend == GGML_BACKEND_GPU) { } else if (src0->backend == GGML_BACKEND_GPU) {
d_Q = (cl_mem) src0->data; d_Q = (cl_mem) src0->extra;
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
} }
@ -1880,6 +1880,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
CL_CHECK(clFinish(queue)); CL_CHECK(clFinish(queue));
tensor->data = dst; tensor->extra = dst;
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
} }
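The OpenCL hunks stop overloading tensor->data for the device buffer and keep the cl_mem handle in tensor->extra instead, so data retains its host-pointer meaning. A minimal sketch of that separation, assuming illustrative types that do not mirror ggml's structs:

// Sketch of keeping a backend handle in `extra` instead of overloading `data`.
// Types and field names are illustrative; they do not mirror ggml's structs.
#include <cstdio>

struct DeviceBuffer { int id; };          // stand-in for cl_mem

enum Backend { BACKEND_CPU, BACKEND_GPU };

struct Tensor {
    Backend backend;
    void *  data;    // host memory (CPU backend); not a device handle
    void *  extra;   // backend-specific handle, e.g. the device buffer
};

static DeviceBuffer * device_handle(const Tensor & t) {
    // GPU paths read the handle from `extra`, never from `data`
    return t.backend == BACKEND_GPU ? static_cast<DeviceBuffer *>(t.extra) : nullptr;
}

int main() {
    DeviceBuffer buf{42};
    Tensor t{BACKEND_GPU, nullptr, &buf};
    std::printf("device buffer id: %d\n", device_handle(t)->id);
    return 0;
}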

View file

@ -801,7 +801,7 @@ class SpecialVocab:
else: else:
continue continue
for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content): for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
if isinstance(maybe_token_id, int): if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
self.special_token_ids[typ] = maybe_token_id self.special_token_ids[typ] = maybe_token_id
break break
return True return True
@ -814,7 +814,7 @@ class SpecialVocab:
config = json.load(f) config = json.load(f)
for typ in self.special_token_types: for typ in self.special_token_types:
maybe_token_id = config.get(f'{typ}_token_id') maybe_token_id = config.get(f'{typ}_token_id')
if isinstance(maybe_token_id, int): if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
self.special_token_ids[typ] = maybe_token_id self.special_token_ids[typ] = maybe_token_id
return True return True

View file

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "gguf" name = "gguf"
version = "0.3.1" version = "0.3.2"
description = "Write ML models in GGUF for GGML" description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"] authors = ["GGML <ggml@ggml.ai>"]
packages = [ packages = [

34
grammars/json_arr.gbnf Normal file
View file

@ -0,0 +1,34 @@
# This is the same as json.gbnf but we restrict whitespace at the end of the root array # Useful for generating JSON arrays
# Useful for generating JSON arrays
root ::= arr
value ::= object | array | string | number | ("true" | "false" | "null") ws
arr ::=
"[\n" ws (
value
(",\n" ws value)*
)? "]"
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?

View file

@ -83,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
float ax = fabsf(x[i]); float ax = fabsf(x[i]);
if (ax > amax) { amax = ax; max = x[i]; } if (ax > amax) { amax = ax; max = x[i]; }
} }
if (!amax) { // all zero if (amax < 1e-30f) { // all zero
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
L[i] = 0; L[i] = 0;
} }
@ -1086,6 +1086,12 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
} }
if (!max_abs_scale) {
memset(&y[i], 0, sizeof(block_q6_K));
y[i].d = ggml_fp32_to_fp16(0.f);
continue;
}
float iscale = -128.f/max_scale; float iscale = -128.f/max_scale;
y[i].d = ggml_fp32_to_fp16(1/iscale); y[i].d = ggml_fp32_to_fp16(1/iscale);
for (int ib = 0; ib < QK_K/16; ++ib) { for (int ib = 0; ib < QK_K/16; ++ib) {
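Both k_quants.c hunks harden quantization against blocks that are (almost) all zero: make_qx_quants now treats any amax below 1e-30f as zero rather than only an exact 0.0f, and quantize_row_q6_K writes an explicit zero block instead of dividing by a vanishing max scale. A compact sketch of that guard, assuming a simplified symmetric scheme that is illustrative only:

// Sketch of the "all (near) zero" guard: when the largest magnitude in a block
// is tiny, emit a zero block instead of dividing by it. Simplified block type.
#include <cmath>
#include <cstdio>
#include <cstring>

struct QuantBlock {
    float  d;          // scale
    int8_t q[32];      // quantized values
};

static void quantize_block(const float * x, QuantBlock * out, int n) {
    float amax = 0.0f;
    for (int i = 0; i < n; ++i) {
        amax = std::fmax(amax, std::fabs(x[i]));
    }
    if (amax < 1e-30f) {                    // all zero (or denormal): avoid 1/amax
        std::memset(out, 0, sizeof(*out));
        return;
    }
    const float iscale = 127.0f / amax;
    out->d = 1.0f / iscale;
    for (int i = 0; i < n; ++i) {
        out->q[i] = (int8_t) std::lround(x[i] * iscale);
    }
}

int main() {
    float zeros[32] = {0.0f};
    QuantBlock blk;
    quantize_block(zeros, &blk, 32);
    std::printf("scale for all-zero block: %g\n", blk.d);   // 0, no inf/NaN
    return 0;
}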

View file

@ -326,6 +326,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
}, },
{
LLM_ARCH_GPT2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
{
LLM_ARCH_GPTJ,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
{
LLM_ARCH_GPTNEOX,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_MPT,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
{
LLM_ARCH_UNKNOWN,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
}; };
static llm_arch llm_arch_from_string(const std::string & name) { static llm_arch llm_arch_from_string(const std::string & name) {
@ -1611,10 +1649,14 @@ static void llm_load_hparams(
GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd / hparams.n_head) { if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head)); throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
} }
} }
// gpt-neox n_rot = rotary_pct * (n_embd / n_head)
// gpt-j n_rot = rotary_dim
}
// arch-specific KVs // arch-specific KVs
switch (model.arch) { switch (model.arch) {
@ -2910,7 +2952,12 @@ static bool llama_eval_internal(
// for big prompts, if BLAS is enabled, it is better to use only one thread // for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
// we still need some threads to process all non-mul_mat ops, but not too many, to avoid interfering
// with the BLAS calls. need a better solution
if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
n_threads = std::min(4, n_threads);
}
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@ -3334,10 +3381,16 @@ struct llm_tokenizer_bpe {
std::string byte_str(1, *j); std::string byte_str(1, *j);
auto token_multibyte = vocab.token_to_id.find(byte_str); auto token_multibyte = vocab.token_to_id.find(byte_str);
if (token_multibyte == vocab.token_to_id.end()) { if (token_multibyte == vocab.token_to_id.end()) {
try {
llama_token token_byte = llama_byte_to_token(vocab, *j);
output.push_back(token_byte);
} catch (const std::out_of_range & err) {
fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str()); fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
} }
} else {
output.push_back((*token_multibyte).second); output.push_back((*token_multibyte).second);
} }
}
} else { } else {
output.push_back((*token).second); output.push_back((*token).second);
} }
@ -3812,6 +3865,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
delete grammar; delete grammar;
} }
struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
// redirect elements in stacks to point to new rules
for (size_t is = 0; is < result->stacks.size(); is++) {
for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
result->stacks[is][ie] = &result->rules[ir0][ir1];
}
}
}
}
}
return result;
}
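llama_grammar_copy copies rules and stacks by value and then rewrites every stack entry that pointed into the old rules so it points at the corresponding element of the copied rules. A self-contained sketch of that pointer-fixup pattern with generic types (not the llama internals):

// Sketch of the pointer-fixup pattern used when deep-copying a structure whose
// `stacks` hold pointers into its own `rules`. Generic types, not llama's.
#include <cassert>
#include <vector>

struct Holder {
    std::vector<std::vector<int>>         rules;   // owned storage
    std::vector<std::vector<const int *>> stacks;  // pointers into `rules`
};

static Holder copy_holder(const Holder & src) {
    Holder dst{src.rules, src.stacks};   // copies values, but stack pointers
                                         // still reference src.rules
    for (size_t is = 0; is < dst.stacks.size(); ++is) {
        for (size_t ie = 0; ie < dst.stacks[is].size(); ++ie) {
            for (size_t ir0 = 0; ir0 < src.rules.size(); ++ir0) {
                for (size_t ir1 = 0; ir1 < src.rules[ir0].size(); ++ir1) {
                    if (src.stacks[is][ie] == &src.rules[ir0][ir1]) {
                        dst.stacks[is][ie] = &dst.rules[ir0][ir1];  // redirect
                    }
                }
            }
        }
    }
    return dst;
}

int main() {
    Holder h;
    h.rules  = {{10, 20}, {30}};
    h.stacks = {{&h.rules[0][1], &h.rules[1][0]}};
    Holder c = copy_holder(h);
    assert(c.stacks[0][0] == &c.rules[0][1]);    // now points into the copy
    assert(*c.stacks[0][0] == 20);
    return 0;
}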
// //
// sampling // sampling
// //
@ -5305,7 +5377,7 @@ struct llama_context_params llama_context_default_params() {
/*.seed =*/ LLAMA_DEFAULT_SEED, /*.seed =*/ LLAMA_DEFAULT_SEED,
/*.n_ctx =*/ 512, /*.n_ctx =*/ 512,
/*.n_batch =*/ 512, /*.n_batch =*/ 512,
/*.gpu_layers =*/ 0, /*.n_gpu_layers =*/ 0,
/*.main_gpu =*/ 0, /*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr, /*.tensor_split =*/ nullptr,
/*.rope_freq_base =*/ 10000.0f, /*.rope_freq_base =*/ 10000.0f,
@ -5322,6 +5394,10 @@ struct llama_context_params llama_context_default_params() {
/*.embedding =*/ false, /*.embedding =*/ false,
}; };
#ifdef GGML_USE_METAL
result.n_gpu_layers = 1;
#endif
return result; return result;
} }
@ -5514,7 +5590,6 @@ struct llama_context * llama_new_context_with_model(
} }
#endif #endif
} }
}
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
if (params.n_gpu_layers > 0) { if (params.n_gpu_layers > 0) {
@ -5551,6 +5626,7 @@ struct llama_context * llama_new_context_with_model(
#undef LLAMA_METAL_CHECK_BUF #undef LLAMA_METAL_CHECK_BUF
} }
#endif #endif
}
#ifdef GGML_USE_MPI #ifdef GGML_USE_MPI
ctx->ctx_mpi = ggml_mpi_init(); ctx->ctx_mpi = ggml_mpi_init();

View file

@ -410,6 +410,8 @@ extern "C" {
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar); LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
// //
// Sampling functions // Sampling functions
// //