Merge remote-tracking branch 'origin/master' into vulkan

2023-10-25 18:27:24 +02:00 · 2023-10-25 18:27:24 +02:00 · d130fe6d6b
commit d130fe6d6b
parent 0230981649 6961c4bd0b
87 changed files with 6813 additions and 8594 deletions
--- a/.github/ISSUE_TEMPLATE/custom.md
+++ b/.github/ISSUE_TEMPLATE/custom.md
@ -1,8 +1,7 @@
 ---
-name: Issue and enhancement template
+name: Bug template
-about: Used to report issues and request enhancements for llama.cpp
+about: Used to report bugs in llama.cpp
-title: "[User] Insert summary of your issue or enhancement.."
+labels: ["bug"]
 labels: ''
 assignees: ''
 ---
@ -46,7 +45,7 @@ $ g++ --version
 # Failure Information (for bugs)
-Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
+Please help provide information about the failure / bug.
 # Steps to Reproduce
--- a/.github/ISSUE_TEMPLATE/enhancement.md
+++ b/.github/ISSUE_TEMPLATE/enhancement.md
@ -0,0 +1,28 @@
 ---
 name: Enhancement template
 about: Used to request enhancements for llama.cpp
 labels: ["enhancement"]
 assignees: ''
 ---
 # Prerequisites
 Please answer the following questions for yourself before submitting an issue.
 - [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
 - [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
 - [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
 - [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
 # Feature Description
 Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
 # Motivation
 Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
 # Possible Implementation
 If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
--- a/.gitignore
+++ b/.gitignore
@ -10,6 +10,7 @@
 *.gcno
 *.gcda
 *.dot
 *.bat
 *.metallib
 .DS_Store
 .build/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -332,6 +332,7 @@ if (LLAMA_CUBLAS)
            set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
        else()
            set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
            #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
        endif()
    endif()
    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
--- a/15
+++ b/15
@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search  \
+	simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search  \
 	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
 # Binaries only useful for tests
@ -560,7 +560,7 @@ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h l
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
-COMMON_DEPS   = $(COMMON_H_DEPS) common.o sampling.o
+COMMON_DEPS   = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o
 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@ -620,15 +620,8 @@ embedding: examples/embedding/embedding.cpp                   build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
 embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
--- a/README.md
+++ b/README.md
@ -11,12 +11,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 ### Hot topics
- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
+- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436
- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
+- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
  **Devs should become familiar with the new API**
 - Local Falcon 180B inference on Mac Studio
  https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e
 ----
@ -89,21 +85,23 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
+- [X] [Pygmalion/Metharme](#using-pygmalion-7b--metharme-7b)
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
+- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
+- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
 - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
 - [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
+- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
 **Bindings:**
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
+- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
@ -206,7 +204,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
 ## Usage
-Here are the steps for the LLaMA-7B model.
+Here are the end-to-end binary build and model conversion steps for the LLaMA-7B model.
 ### Get the Code
@ -573,6 +571,18 @@ python3 convert.py models/7B/
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
 ### Running on Windows with prebuilt binaries
 You will find prebuilt Windows binaries on the release page.
 Simply download and extract the latest zip package of choice: (e.g. `llama-b1380-bin-win-avx2-x64.zip`)
 From the unzipped folder, open a terminal/cmd window and place a pre-converted `.gguf` model file in that folder. Test out the main example like so:
 ```
 .\main -m llama-2-7b.Q4_0.gguf -n 128
 ```
 ### Memory/Disk Requirements
 As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
@ -952,7 +962,6 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
 - [main](./examples/main/README.md)
 - [server](./examples/server/README.md)
 - [embd-input](./examples/embd-input/README.md)
 - [jeopardy](./examples/jeopardy/README.md)
 - [BLIS](./docs/BLIS.md)
 - [Performance troubleshooting](./docs/token_generation_performance_tips.md)
--- a/build.zig
+++ b/build.zig
@ -131,6 +131,7 @@ pub fn build(b: *std.build.Builder) !void {
    const sampling = make.obj("sampling", "common/sampling.cpp");
    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
    const train = make.obj("train", "common/train.cpp");
    const clip = make.obj("clip", "examples/llava/clip.cpp");
    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
@ -139,7 +140,7 @@ pub fn build(b: *std.build.Builder) !void {
    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
--- a/ci/run.sh
+++ b/ci/run.sh
@ -208,6 +208,8 @@ function gg_run_open_llama_3b_v2 {
    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
@ -296,6 +298,7 @@ function gg_sum_open_llama_3b_v2 {
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
    gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
@ -382,6 +385,8 @@ function gg_run_open_llama_7b_v2 {
    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
@ -470,6 +475,7 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
    #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
--- a/common/common.cpp
+++ b/common/common.cpp
@ -107,7 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    std::string arg;
    gpt_params default_params;
    const std::string arg_prefix = "--";
-    llama_sampling_params & sparams = params.sampling_params;
+    llama_sampling_params & sparams = params.sparams;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];
@ -241,25 +241,26 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            sparams.repeat_last_n = std::stoi(argv[i]);
+            sparams.penalty_last_n = std::stoi(argv[i]);
            sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
        } else if (arg == "--repeat-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            sparams.repeat_penalty = std::stof(argv[i]);
+            sparams.penalty_repeat = std::stof(argv[i]);
        } else if (arg == "--frequency-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            sparams.frequency_penalty = std::stof(argv[i]);
+            sparams.penalty_freq = std::stof(argv[i]);
        } else if (arg == "--presence-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            sparams.presence_penalty = std::stof(argv[i]);
+            sparams.penalty_present = std::stof(argv[i]);
        } else if (arg == "--mirostat") {
            if (++i >= argc) {
                invalid_param = true;
@ -572,7 +573,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            params.grammar = argv[i];
+            sparams.grammar = argv[i];
        } else if (arg == "--grammar-file") {
            if (++i >= argc) {
                invalid_param = true;
@ -587,7 +588,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            std::copy(
                std::istreambuf_iterator<char>(file),
                std::istreambuf_iterator<char>(),
-                std::back_inserter(params.grammar)
+                std::back_inserter(sparams.grammar)
            );
 #ifndef LOG_DISABLE_LOGS
        // Parse args for logging parameters
@ -631,6 +632,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        process_escapes(params.prompt);
        process_escapes(params.input_prefix);
        process_escapes(params.input_suffix);
        process_escapes(sparams.cfg_negative_prompt);
        for (auto & antiprompt : params.antiprompt) {
            process_escapes(antiprompt);
        }
@ -640,7 +642,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    const llama_sampling_params & sparams = params.sampling_params;
+    const llama_sampling_params & sparams = params.sparams;
    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
@ -678,10 +680,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
-    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
+    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
-    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
+    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
-    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
+    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
-    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
+    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
    printf("  --mirostat N          use Mirostat sampling.\n");
    printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
@ -820,6 +822,27 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    return cparams;
 }
 // Reset the batch to empty by setting its token count to zero.
 // Only n_tokens is modified; the per-token arrays are left untouched.
 void llama_batch_clear(struct llama_batch & batch) {
    batch.n_tokens = 0;
 }
 // Append one token entry at index batch.n_tokens and then increment n_tokens.
 //
 //   batch   - batch to append to
 //   id      - token id stored in batch.token
 //   pos     - position stored in batch.pos
 //   seq_ids - sequence ids copied into batch.seq_id for this entry;
 //             their count is stored in batch.n_seq_id
 //   logits  - flag stored in batch.logits for this entry
 //
 // No bounds checking is performed here.
 // NOTE(review): presumably the caller must have allocated the batch with room
 // for one more token and for seq_ids.size() sequence ids per token - confirm
 // against the batch allocation code.
 void llama_batch_add(
                 struct llama_batch & batch,
                        llama_token   id,
                          llama_pos   pos,
    const std::vector<llama_seq_id> & seq_ids,
                               bool   logits) {
    batch.token   [batch.n_tokens] = id;
    // each field assignment is its own statement (previously joined to the
    // next one by a stray comma operator)
    batch.pos     [batch.n_tokens] = pos;
    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
    for (size_t i = 0; i < seq_ids.size(); ++i) {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
    }
    batch.logits  [batch.n_tokens] = logits;
    batch.n_tokens++;
 }
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
    auto mparams = llama_model_params_from_gpt_params(params);
@ -857,13 +880,13 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    }
    if (params.ignore_eos) {
-        params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
    }
    {
        LOG("warming up the model with an empty run\n");
-        std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
+        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
        llama_kv_cache_tokens_rm(lctx, -1, -1);
        llama_reset_timings(lctx);
@ -879,21 +902,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 std::vector<llama_token> llama_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
-                        bool   add_bos) {
+                        bool   add_bos,
-    return llama_tokenize(llama_get_model(ctx), text, add_bos);
+                        bool   special) {
    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
 }
 std::vector<llama_token> llama_tokenize(
    const struct llama_model * model,
           const std::string & text,
-                        bool   add_bos) {
+                        bool   add_bos,
                        bool   special) {
    // upper limit for the number of tokens
    int n_tokens = text.length() + add_bos;
    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@ -916,7 +941,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
 }
 std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    const llama_token bos_id = llama_token_bos(ctx);
+    const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
    std::string piece;
    std::string result;
@ -1100,28 +1125,28 @@ std::string get_sortable_timestamp() {
 void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    const llama_sampling_params & sparams = params.sampling_params;
+    const llama_sampling_params & sparams = params.sparams;
    fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
    fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
-    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_arm_fma: %s\n",     ggml_cpu_has_arm_fma()     ? "true" : "false");
-    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx: %s\n",         ggml_cpu_has_avx()         ? "true" : "false");
-    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx2: %s\n",        ggml_cpu_has_avx2()        ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
+    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
-    fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
+    fprintf(stream, "cpu_has_cublas: %s\n",      ggml_cpu_has_cublas()      ? "true" : "false");
-    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
+    fprintf(stream, "cpu_has_clblast: %s\n",     ggml_cpu_has_clblast()     ? "true" : "false");
-    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_fma: %s\n",         ggml_cpu_has_fma()         ? "true" : "false");
-    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
+    fprintf(stream, "cpu_has_gpublas: %s\n",     ggml_cpu_has_gpublas()     ? "true" : "false");
-    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
+    fprintf(stream, "cpu_has_neon: %s\n",        ggml_cpu_has_neon()        ? "true" : "false");
-    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
+    fprintf(stream, "cpu_has_f16c: %s\n",        ggml_cpu_has_f16c()        ? "true" : "false");
-    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+    fprintf(stream, "cpu_has_fp16_va: %s\n",     ggml_cpu_has_fp16_va()     ? "true" : "false");
-    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
+    fprintf(stream, "cpu_has_wasm_simd: %s\n",   ggml_cpu_has_wasm_simd()   ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
+    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
-    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
+    fprintf(stream, "cpu_has_sse3: %s\n",        ggml_cpu_has_sse3()        ? "true" : "false");
-    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
+    fprintf(stream, "cpu_has_vsx: %s\n",         ggml_cpu_has_vsx()         ? "true" : "false");
 #ifdef NDEBUG
    fprintf(stream, "debug: false\n");
@ -1155,13 +1180,13 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
+    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
-    dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
+    dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
+    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
    const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
    fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
@ -1215,14 +1240,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
+    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
    dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
    dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
    fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
+    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
    fprintf(stream, "reverse_prompt:\n");
    for (std::string ap : params.antiprompt) {
--- a/common/common.h
+++ b/common/common.h
@ -56,7 +56,7 @@ struct gpt_params {
    float   rope_freq_scale                 = 0.0f; // RoPE frequency scaling factor
    // sampling parameters
-    struct llama_sampling_params sampling_params;
+    struct llama_sampling_params sparams;
    std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
    std::string model_draft       = "";                              // draft model for speculative decoding
@ -66,10 +66,10 @@ struct gpt_params {
    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
    std::string input_prefix      = "";  // string to prefix user inputs with
    std::string input_suffix      = "";  // string to suffix user inputs with
    std::string grammar           = "";  // optional BNF-like grammar to constrain sampling
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir            = "";  // directory in which to save YAML log files
    // TODO: avoid tuple, use struct
    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
    std::string lora_base  = "";                              // base model path for the lora adapter
@ -124,10 +124,23 @@ void process_escapes(std::string& input);
 // Model utils
 //
 // TODO: avoid tuple, use struct
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
-struct llama_model_params   llama_model_params_from_gpt_params(const gpt_params & params);
+
 struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
 // Batch utils
 // Reset the batch's token count (n_tokens) to zero
 void llama_batch_clear(struct llama_batch & batch);
 // Append a single token entry (id, pos, seq_ids, logits flag) at index
 // batch.n_tokens and increment n_tokens
 void llama_batch_add(
                 struct llama_batch & batch,
                        llama_token   id,
                          llama_pos   pos,
    const std::vector<llama_seq_id> & seq_ids,
                               bool   logits);
 //
 // Vocab utils
 //
@ -137,12 +150,14 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 std::vector<llama_token> llama_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
-                        bool   add_bos);
+                        bool   add_bos,
                        bool   special = false);
 std::vector<llama_token> llama_tokenize(
    const struct llama_model * model,
           const std::string & text,
-                        bool   add_bos);
+                        bool   add_bos,
                        bool   special = false);
 // tokenizes a token into a piece
 // should work similar to Python's `tokenizer.id_to_piece`
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@ -399,7 +399,7 @@ namespace grammar_parser {
    void print_grammar(FILE * file, const parse_state & state) {
        try {
            std::map<uint32_t, std::string> symbol_id_names;
-            for (auto kv : state.symbol_ids) {
+            for (const auto & kv : state.symbol_ids) {
                symbol_id_names[kv.second] = kv.first;
            }
            for (size_t i = 0, end = state.rules.size(); i < end; i++) {
--- a/common/log.h
+++ b/common/log.h
@ -97,22 +97,23 @@
    #define LOG_TEE_TARGET stderr
 #endif
 // NOTE: currently disabled as it produces too many log files
 // Utility to obtain "pid" like unique process id and use it when creating log files.
-inline std::string log_get_pid()
+//inline std::string log_get_pid()
-{
+//{
-    static std::string pid;
+//    static std::string pid;
-    if (pid.empty())
+//    if (pid.empty())
-    {
+//    {
-        // std::this_thread::get_id() is the most portable way of obtaining a "process id"
+//        // std::this_thread::get_id() is the most portable way of obtaining a "process id"
-        //  it's not the same as "pid" but is unique enough to solve multiple instances
+//        //  it's not the same as "pid" but is unique enough to solve multiple instances
-        //  trying to write to the same log.
+//        //  trying to write to the same log.
-        std::stringstream ss;
+//        std::stringstream ss;
-        ss << std::this_thread::get_id();
+//        ss << std::this_thread::get_id();
-        pid = ss.str();
+//        pid = ss.str();
-    }
+//    }
-
+//
-    return pid;
+//    return pid;
-}
+//}
 // Utility function for generating log file names with unique id based on thread id.
 //  invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
@ -126,8 +127,8 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
    std::stringstream buf;
    buf << log_file_basename;
-    buf << ".";
+    //buf << ".";
-    buf << log_get_pid();
+    //buf << log_get_pid();
    buf << ".";
    buf << log_file_extension;
@ -579,38 +580,75 @@ inline std::string log_var_to_string_impl(const std::vector<int> & var)
    return buf.str();
 }
-#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens)                                 \
+template <typename C, typename T>
-    [&tokens, &ctx]()                                                        \
+inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
-    {                                                                        \
+{
-        std::stringstream buf;                                               \
+    std::stringstream buf;
-        buf << "[ ";                                                         \
+    buf << "[ ";
-                                                                             \
+
-        bool first = true;                                                   \
+    bool first = true;
-        for (const auto &token : tokens)                                     \
+    for (const auto &token : tokens)
-        {                                                                    \
+    {
-            if (!first)                                                      \
+        if (!first) {
-                buf << ", ";                                                 \
+            buf << ", ";
-            else                                                             \
+        } else {
-                first = false;                                               \
+            first = false;
-                                                                             \
+        }
-            auto detokenized = llama_token_to_piece(ctx, token);             \
+
-                                                                             \
+        auto detokenized = llama_token_to_piece(ctx, token);
-            detokenized.erase(                                               \
+
-                std::remove_if(                                              \
+        detokenized.erase(
-                    detokenized.begin(),                                     \
+            std::remove_if(
-                    detokenized.end(),                                       \
+                detokenized.begin(),
-                    [](const unsigned char c) { return !std::isprint(c); }), \
+                detokenized.end(),
-                detokenized.end());                                          \
+                [](const unsigned char c) { return !std::isprint(c); }),
-                                                                             \
+            detokenized.end());
-            buf                                                              \
+
-                << "'" << detokenized << "'"                                 \
+        buf
-                << ":" << std::to_string(token);                             \
+            << "'" << detokenized << "'"
-        }                                                                    \
+            << ":" << std::to_string(token);
-        buf << " ]";                                                         \
+    }
-                                                                             \
+    buf << " ]";
-        return buf.str();                                                    \
+
-    }()                                                                      \
+    return buf.str();
-        .c_str()
+}
 // Builds a printable description of every entry in a batch, for logging.
 //
 // For each index in [0, batch.n_tokens) the output contains the index, the
 // token's text piece (with non-printable bytes stripped), its position, the
 // number of sequence ids, the first sequence id and the logits flag.
 //
 //  - ctx:   forwarded to llama_token_to_piece to turn a token into text
 //  - batch: must expose n_tokens, token, pos, n_seq_id, seq_id and logits
 //
 // returns: "[ " + comma-separated entries (each starting on a new line) + " ]"
 template <typename C, typename B>
 inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
 {
    // selects the bytes that are removed from a piece before it is printed
    const auto is_unprintable = [](const unsigned char ch) { return !std::isprint(ch); };

    std::stringstream out;
    out << "[ ";

    // empty before the first entry, ", " before every following one
    const char * separator = "";
    for (int idx = 0; idx < batch.n_tokens; ++idx)
    {
        out << separator;
        separator = ", ";

        auto piece = llama_token_to_piece(ctx, batch.token[idx]);
        piece.erase(std::remove_if(piece.begin(), piece.end(), is_unprintable), piece.end());

        // std::to_string is used for every numeric field so that narrow integer
        // types are printed as numbers rather than as characters
        out << "\n" << std::to_string(idx);
        out << ":token '" << piece << "'";
        out << ":pos " << std::to_string(batch.pos[idx]);
        out << ":n_seq_id  " << std::to_string(batch.n_seq_id[idx]);
        out << ":seq_id " << std::to_string(batch.seq_id[idx][0]);
        out << ":logits " << std::to_string(batch.logits[idx]);
    }

    out << " ]";
    return out.str();
 }
 #ifdef LOG_DISABLE_LOGS
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -1,113 +1,161 @@
 #include "sampling.h"
-llama_sampling_context::~llama_sampling_context() {
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
-    for (auto & it : sequence_contexts) {
+    struct llama_sampling_context * result = new llama_sampling_context();
-        if (it.second.grammar != NULL) {
+
-            llama_grammar_free(it.second.grammar);
+    result->params  = params;
-            it.second.grammar = NULL;
+    result->grammar = nullptr;
    // if there is a grammar, parse it
    if (!params.grammar.empty()) {
        result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
        // will be empty (default) if there are parse errors
        if (result->parsed_grammar.rules.empty()) {
            fprintf(stderr, "%s: failed to parse grammar\n", __func__);
            return nullptr;
        }
        std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
        result->grammar = llama_grammar_init(
                grammar_rules.data(),
                grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
    }
    result->prev.resize(params.n_prev);
    return result;
 }
-llama_sampling_context llama_sampling_context_init(
+void llama_sampling_free(struct llama_sampling_context * ctx) {
-        const struct gpt_params & params,
+    if (ctx->grammar != NULL) {
-                  llama_grammar * grammar) {
+        llama_grammar_free(ctx->grammar);
-  llama_sampling_context result;
+    }
-  result.params = params.sampling_params;
+    delete ctx;
  result.grammar = grammar;
  return result;
 }
-// Note: Creates the context if it doesn't exist, so this always return something.
+void llama_sampling_reset(llama_sampling_context * ctx) {
-llama_sampler_sequence_context & llama_sampling_get_sequence_context(
+    if (ctx->grammar != NULL) {
-              llama_sampling_context & ctx_sampling,
+        llama_grammar_free(ctx->grammar);
        const llama_seq_id             seq) {
    const auto it = ctx_sampling.sequence_contexts.find(seq);
    if (it != ctx_sampling.sequence_contexts.end()) {
        return it->second;
    }
-    llama_sampler_sequence_context new_ctx = {
+
-        2.0f * ctx_sampling.params.mirostat_tau,
+    if (!ctx->parsed_grammar.rules.empty()) {
-        ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL,
+        std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
-    };
+
-    return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second;
+        ctx->grammar = llama_grammar_init(
                grammar_rules.data(),
                grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
    }
    std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
    ctx->cur.clear();
 }
-bool llama_sampling_context_reset(
+void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
-              llama_sampling_context & ctx_sampling,
+    if (dst->grammar) {
-        const llama_seq_id             seq) {
+        llama_grammar_free(dst->grammar);
-    const auto it = ctx_sampling.sequence_contexts.find(seq);
+        dst->grammar = nullptr;
    if (it == ctx_sampling.sequence_contexts.end()) return false;
    if (it->second.grammar != NULL) {
        llama_grammar_free(it->second.grammar);
        it->second.grammar = NULL;
    }
-    ctx_sampling.sequence_contexts.erase(it);
+
-    return true;
+    if (src->grammar) {
        dst->grammar = llama_grammar_copy(src->grammar);
    }
    dst->prev = src->prev;
 }
 // Returns the most recent entry of the sampling context's token history (ctx->prev).
 // The history must not be empty: std::vector::back() on an empty vector is undefined.
 llama_token llama_sampling_last(llama_sampling_context * ctx) {
    const auto & history = ctx->prev;
    return history.back();
 }
 // Concatenates the text pieces of the last n tokens in the sampling history.
 //
 //  - ctx_sampling: sampling context whose history (prev) is read
 //  - ctx_main:     forwarded to llama_token_to_piece to turn each token into text
 //  - n:            number of trailing tokens to include; clamped to the history size
 //
 // returns: the pieces joined in history order (oldest of the selected tokens first)
 std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
    const auto & history = ctx_sampling->prev;

    const int n_history = history.size();
    const int n_take    = std::min(n, n_history);

    std::string pieces;

    // walk the tail of the history, starting n_take entries before the end
    int pos = n_history - n_take;
    while (pos < n_history) {
        pieces += llama_token_to_piece(ctx_main, history[pos]);
        ++pos;
    }

    return pieces;
 }
 // Formats the sampling parameters as a three-line, tab-indented summary string.
 //
 // Note on labels: the "repeat_last_n" / "repeat_penalty" / "frequency_penalty" /
 // "presence_penalty" labels print the penalty_* fields, and "mirostat_lr" /
 // "mirostat_ent" print mirostat_eta / mirostat_tau respectively.
 //
 // The text is formatted into a fixed 1024-byte buffer; snprintf truncates if
 // the formatted text would not fit.
 std::string llama_sampling_print(const llama_sampling_params & params) {
    char text[1024];

    snprintf(text, sizeof(text),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            // line 1: repetition penalties
            params.penalty_last_n,
            params.penalty_repeat,
            params.penalty_freq,
            params.penalty_present,
            // line 2: truncation samplers and temperature
            params.top_k,
            params.tfs_z,
            params.top_p,
            params.typical_p,
            params.temp,
            // line 3: mirostat settings
            params.mirostat,
            params.mirostat_eta,
            params.mirostat_tau);

    return text;
 }
 llama_token llama_sampling_sample(
-                  struct llama_context * ctx,
+                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_guidance,
+                  struct llama_context * ctx_main,
-                  struct llama_sampling_context & ctx_sampling,
+                  struct llama_context * ctx_cfg,
-        const std::vector<llama_token> & last_tokens,
+                  const int idx) {
-         std::vector<llama_token_data> & candidates,
+    const llama_sampling_params & params = ctx_sampling->params;
-        const                      int   idx,
+
-                          llama_seq_id   seq) {
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
    const int n_ctx   = llama_n_ctx(ctx);
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const llama_sampling_params & params = ctx_sampling.params;
    const float   temp            = params.temp;
    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
    const float   top_p           = params.top_p;
    const float   tfs_z           = params.tfs_z;
    const float   typical_p       = params.typical_p;
-    const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
-    const float   repeat_penalty  = params.repeat_penalty;
+    const float   penalty_repeat  = params.penalty_repeat;
-    const float   alpha_presence  = params.presence_penalty;
+    const float   penalty_freq    = params.penalty_freq;
-    const float   alpha_frequency = params.frequency_penalty;
+    const float   penalty_present = params.penalty_present;
    const int     mirostat        = params.mirostat;
    const float   mirostat_tau    = params.mirostat_tau;
    const float   mirostat_eta    = params.mirostat_eta;
    const bool    penalize_nl     = params.penalize_nl;
    auto & prev = ctx_sampling->prev;
    auto & cur  = ctx_sampling->cur;
    llama_token id = 0;
-    float * logits = llama_get_logits_ith(ctx, idx);
+    float * logits = llama_get_logits_ith(ctx_main, idx);
-    // Apply params.logit_bias map
+    // apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
        logits[it->first] += it->second;
    }
-    candidates.clear();
+    cur.clear();
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
-    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
-    if (ctx_guidance) {
+    if (ctx_cfg) {
-        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
+        llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
    }
    // apply penalties
-    if (!last_tokens.empty()) {
+    if (!prev.empty()) {
-        const float nl_logit = logits[llama_token_nl(ctx)];
+        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
-        llama_sample_repetition_penalty(ctx, &cur_p,
+        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                prev.data() + prev.size() - penalty_last_n,
-                last_n_repeat, repeat_penalty);
+                penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
                last_tokens.data() + last_tokens.size() - last_n_repeat,
                last_n_repeat, alpha_frequency, alpha_presence);
        if (!penalize_nl) {
            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
+                if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
                    cur_p.data[idx].logit = nl_logit;
                    break;
                }
@ -115,52 +163,60 @@ llama_token llama_sampling_sample(
        }
    }
-    llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq);
+    if (ctx_sampling->grammar != NULL) {
-
+        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
    if (ctx_seq.grammar != NULL) {
        llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
    }
    if (temp <= 0) {
-        // Greedy sampling
+        // greedy sampling
-        id = llama_sample_token_greedy(ctx, &cur_p);
+        id = llama_sample_token_greedy(ctx_main, &cur_p);
    } else {
        if (mirostat == 1) {
            const int mirostat_m = 100;
-            llama_sample_temp(ctx, &cur_p, temp);
+            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu);
+            id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
        } else if (mirostat == 2) {
-            llama_sample_temp(ctx, &cur_p, temp);
+            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu);
+            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
        } else {
-            // Temperature sampling
+            // temperature sampling
            size_t min_keep = std::max(1, params.n_probs);
            llama_sample_top_k      (ctx, &cur_p, top_k, min_keep);
            llama_sample_tail_free  (ctx, &cur_p, tfs_z, min_keep);
            llama_sample_typical    (ctx, &cur_p, typical_p, min_keep);
            llama_sample_top_p      (ctx, &cur_p, top_p, min_keep);
            llama_sample_temp(ctx, &cur_p, temp);
-            {
+            llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep);
-                const int n_top = 10;
+            llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
-                LOG("top %d candidates:\n", n_top);
+            llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
            llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
            llama_sample_temp     (ctx_main, &cur_p, temp);
-                for (int i = 0; i < n_top; i++) {
+            id = llama_sample_token(ctx_main, &cur_p);
                    const llama_token id = cur_p.data[i].id;
                    (void)id; // To avoid a warning that id is unused when logging is disabled.
                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
                }
            }
-            id = llama_sample_token(ctx, &cur_p);
+            //{
            //    const int n_top = 10;
            //    LOG("top %d candidates:\n", n_top);
-            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
+            //    for (int i = 0; i < n_top; i++) {
            //        const llama_token id = cur_p.data[i].id;
            //        (void)id; // To avoid a warning that id is unused when logging is disabled.
            //        LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
            //    }
            //}
            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
        }
    }
    if (ctx_seq.grammar != NULL) {
        llama_grammar_accept_token(ctx, ctx_seq.grammar, id);
    }
    return id;
 }
 // Records an accepted token in the sampling context.
 //
 // The token history (prev) is treated as a sliding window: the oldest entry is
 // dropped and the new token is appended, so a non-empty history keeps its size.
 //
 //  - ctx_sampling:  sampling context whose history and grammar state are updated
 //  - ctx_main:      forwarded to llama_grammar_accept_token
 //  - id:            the token that was accepted
 //  - apply_grammar: when true and a grammar is set, the grammar is advanced with id
 void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        llama_token id,
        bool apply_grammar) {
    auto & prev = ctx_sampling->prev;

    // erase(begin()) on an empty vector is undefined behavior (begin() == end()
    // is not dereferenceable), so only drop the oldest entry when there is one;
    // an empty history then simply starts remembering the latest token
    if (!prev.empty()) {
        prev.erase(prev.begin());
    }
    prev.push_back(id);

    if (ctx_sampling->grammar != NULL && apply_grammar) {
        llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
    }
 }
--- a/common/sampling.h
+++ b/common/sampling.h
@ -2,107 +2,108 @@
 #include "llama.h"
 #include "grammar-parser.h"
 #include <string>
 #include <vector>
 #include <unordered_map>
 // sampling parameters
 typedef struct llama_sampling_params {
    int32_t n_prev            = 64;    // number of previous tokens to remember
    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
    float   typical_p         = 1.00f; // 1.0 = disabled
    float   temp              = 0.80f; // 1.0 = disabled
-    float   repeat_penalty    = 1.10f; // 1.0 = disabled
+    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat    = 1.10f; // 1.0 = disabled
-    float   frequency_penalty = 0.00f; // 0.0 = disabled
+    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   presence_penalty  = 0.00f; // 0.0 = disabled
+    float   penalty_present   = 0.00f; // 0.0 = disabled
    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate
    bool    penalize_nl       = true;  // consider newlines as a repeatable token
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    std::string grammar;  // optional BNF-like grammar to constrain sampling
    // Classifier-Free Guidance
    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt;   // string to help guidance
+    std::string cfg_negative_prompt; // string to help guidance
-    float       cfg_scale     = 1.f;   // How strong is guidance
+    float       cfg_scale     = 1.f; // how strong is guidance
    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
 } llama_sampling_params;
 // per-sequence sampler context
 // (instances are stored per llama_seq_id in a map inside llama_sampling_context)
 typedef struct llama_sampler_sequence_context {
    float mirostat_mu; // mirostat sampler state; passed by pointer to the mirostat token samplers
    llama_grammar * grammar; // grammar used for this sequence; may be NULL (callers NULL-check before use)
 } llama_sampler_sequence_context;
 // general sampler context
-typedef struct llama_sampling_context {
+// TODO: move to llama.h
-    ~llama_sampling_context();
+struct llama_sampling_context {
-
+    // parameters that will be used for sampling
    // parameters that will be used for sampling and when creating
    // new llama_sampler_sequence_context instances
    llama_sampling_params params;
-    // map of sequence ids to sampler contexts
+    // mirostat sampler state
-    std::unordered_map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts;
+    float mirostat_mu;
    // when non-NULL, new instances of llama_sampler_sequence_context
    // will get a copy of the grammar here
    // note: only the pointer is stored here, it is not a copy of
    //       the grammar and shouldn't be freed
    llama_grammar * grammar;
-} llama_sampling_context;
+
    // internal
    grammar_parser::parse_state parsed_grammar;
    // TODO: replace with ring-buffer
    std::vector<llama_token>      prev;
    std::vector<llama_token_data> cur;
 };
 #include "common.h"
 // Create a new sampling context instance.
-llama_sampling_context llama_sampling_context_init(
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
        const struct gpt_params & params,
                  llama_grammar * grammar = NULL);
-// Fetches the sampler context for the specified sequence id (defaults to 0).
+void llama_sampling_free(struct llama_sampling_context * ctx);
 // If the context for that sequence id doesn't already exist, it will be created with
 // default values based on the parameters in the ctx_sampling argument.
 llama_sampler_sequence_context & llama_sampling_get_sequence_context(
              llama_sampling_context & ctx_sampling,
        const llama_seq_id             seq = 0);
-// Reset the sampler context for the supplied sequence id (defaults to 0).
+// Reset the sampler context
-// This is necessary to reuse a sequence id or free memory used by sequences
+// - clear prev tokens
-// that are no longer required.
+// - reset grammar
-bool llama_sampling_context_reset(
+void llama_sampling_reset(llama_sampling_context * ctx);
-              llama_sampling_context & ctx_sampling,
+
-        const llama_seq_id             seq = 0);
+// Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
 // Get the last sampled token
 llama_token llama_sampling_last(llama_sampling_context * ctx);
 // Get a string representation of the last sampled tokens
 std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
 // Print sampling parameters into a string
 std::string llama_sampling_print(const llama_sampling_params & params);
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
-//       llama_sampling_context_reset when a sequence ends
+//       llama_sampling_reset when a sequence ends
 //
 // required:
-//  - ctx:          context to use for sampling
+//  - ctx_main:     context to use for sampling
 //  - ctx_sampling: sampling-specific context
 //
 // optional:
-//  - ctx_guidance:  context to use for classifier-free guidance, ignore if NULL
+//  - ctx_cfg:      context to use for classifier-free guidance
-//  - last_tokens:   needed for repetition penalty, ignore if empty
+//  - idx:          sample from llama_get_logits_ith(ctx, idx)
 //  - idx:           sample from llama_get_logits_ith(ctx, idx)
 //  - seq:           sequence id to associate sampler state with
 //
 // returns:
 //  - token:      sampled token
 //  - candidates: vector of candidate tokens
 //
 llama_token llama_sampling_sample(
-                  struct llama_context * ctx,
+        struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_guidance,
+        struct llama_context * ctx_main,
-                  struct llama_sampling_context & ctx_sampling,
+        struct llama_context * ctx_cfg,
-        const std::vector<llama_token> & last_tokens,
+        int idx = 0);
-         std::vector<llama_token_data> & candidates,
+
-        const                      int   idx = 0,
+void llama_sampling_accept(
-                          llama_seq_id   seq = 0);
+        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        llama_token id,
        bool apply_grammar);
--- a/common/train.cpp
+++ b/common/train.cpp
@ -236,8 +236,8 @@ int64_t get_example_targets_batch(
    int64_t used_samples = 0;
    ggml_set_f32(target_probs, 0.0f);
-    llama_token bos = llama_token_bos(lctx);
+    llama_token bos = llama_token_bos(llama_get_model(lctx));
-    llama_token eos = llama_token_eos(lctx);
+    llama_token eos = llama_token_eos(llama_get_model(lctx));
    // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
    for (int k=0; k<n_batch; ++k) {
        // printf("%s: batch %d\n", __func__, k);
@ -863,7 +863,7 @@ size_t tokenize_file(
            (int) buf.size(),
            out_tokens.data(),
            (int) out_tokens.size(),
-            false);
+            false, false);
        if (n_tokens < 0) {
            out_tokens.resize(-n_tokens);
            n_tokens = llama_tokenize(
@ -872,7 +872,7 @@ size_t tokenize_file(
                (int) buf.size(),
                out_tokens.data(),
                (int) out_tokens.size(),
-                false);
+                false, false);
        }
        if (n_tokens >= 0) {
            out_tokens.resize(n_tokens);
@ -924,7 +924,7 @@ size_t tokenize_file(
        for (llama_token token=0; token < n_vocab; ++token) {
            max_token_text_size = std::max(
                max_token_text_size,
-                strlen(llama_token_get_text(lctx, token)));
+                strlen(llama_token_get_text(llama_get_model(lctx), token)));
        }
        // upper bound of context byte length.
@ -966,7 +966,7 @@ size_t tokenize_file(
                    (int) buf_sample.size(),
                    tok_sample.data(),
                    (int) tok_sample.size(),
-                    false);
+                    false, false);
                if (n_tokens < 0) {
                    tok_sample.resize(-n_tokens);
                    n_tokens = llama_tokenize(llama_get_model(lctx),
@ -974,7 +974,7 @@ size_t tokenize_file(
                        (int) buf_sample.size(),
                        tok_sample.data(),
                        (int) tok_sample.size(),
-                        false);
+                        false, false);
                    GGML_ASSERT(n_tokens >= 0);
                }
                GGML_ASSERT(n_tokens <= (int) tok_sample.size());
@ -1425,7 +1425,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * canc
        int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
        if (impr_plot > 0) impr_plot = 0;
-        if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
+        if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
        printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
            __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
            *sched, opt->loss_after);
--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace:
        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
        help="output format - use 0 for float32, 1 for float16",
    )
    parser.add_argument("--bigendian",   action="store_true",    help="model is executed on big endian machine")
    return parser.parse_args()
 args = parse_args()
@ -86,6 +87,11 @@ if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)
 endianess = gguf.GGUFEndian.LITTLE
 if args.bigendian:
    endianess = gguf.GGUFEndian.BIG
 endianess_str = "Big Endian" if args.bigendian else "Little Endian"
 print(f"gguf: Conversion Endianess {endianess}")
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
@ -104,7 +110,7 @@ print("gguf: loading model "+dir_model.name)
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)
 print("hello print: ",hparams["architectures"][0])
-if hparams["architectures"][0] != "BaichuanForCausalLM":
+if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit()
@ -113,7 +119,7 @@ if hparams["architectures"][0] != "BaichuanForCausalLM":
 num_parts = count_model_parts(dir_model)
 print(f"num_parts:{num_parts}\n")
 ARCH=gguf.MODEL_ARCH.BAICHUAN
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
 print("gguf: get model metadata")
@ -224,7 +230,7 @@ gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
-special_vocab = gguf.SpecialVocab(dir_model)
+special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens))
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
--- a/convert-bloom-hf-to-gguf.py
+++ b/convert-bloom-hf-to-gguf.py
@ -118,18 +118,27 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    if i not in reverse_vocab:
-    scores.append(0.0)  # dummy
+        tokens.append(f"[PAD{i}]")
-    toktypes.append(gguf.TokenType.NORMAL)
+        toktypes.append(gguf.TokenType.USER_DEFINED)
    elif reverse_vocab[i] in added_vocab:
        tokens.append(reverse_vocab[i])
        if tokenizer.added_tokens_decoder[i].special:
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            toktypes.append(gguf.TokenType.USER_DEFINED)
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
-special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@ -78,7 +78,7 @@ print("gguf: loading model "+dir_model.name)
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)
-if hparams["architectures"][0] != "FalconForCausalLM":
+if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"):
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)
@ -97,7 +97,17 @@ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 print("gguf: get model metadata")
-block_count = hparams["num_hidden_layers"]
+block_count = hparams.get("num_hidden_layers")
 if block_count is None:
    block_count = hparams["n_layer"]  # old name
 n_head = hparams.get("num_attention_heads")
 if n_head is None:
    n_head = hparams["n_head"]  # old name
 n_head_kv = hparams.get("num_kv_heads")
 if n_head_kv is None:
    n_head_kv = hparams.get("n_head_kv", 1)  # old name
 gguf_writer.add_name("Falcon")
 gguf_writer.add_context_length(2048) # not in config.json
@ -105,11 +115,8 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
 gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(hparams["num_attention_heads"])
+gguf_writer.add_head_count(n_head)
-if "num_kv_heads" in hparams:
+gguf_writer.add_head_count_kv(n_head_kv)
    gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
 else:
    gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
 gguf_writer.add_file_type(ftype)
@ -145,17 +152,13 @@ gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
-special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
 tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
 # params for qkv transform
 n_head    = hparams["num_attention_heads"]
 n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1
 head_dim = hparams["hidden_size"] // n_head
 # tensor info
--- a/convert-gptneox-hf-to-gguf.py
+++ b/convert-gptneox-hf-to-gguf.py
@ -123,18 +123,27 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    if i not in reverse_vocab:
-    scores.append(0.0) # dummy
+        tokens.append(f"[PAD{i}]")
-    toktypes.append(gguf.TokenType.NORMAL)
+        toktypes.append(gguf.TokenType.USER_DEFINED)
    elif reverse_vocab[i] in added_vocab:
        tokens.append(reverse_vocab[i])
        if tokenizer.added_tokens_decoder[i].special:
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            toktypes.append(gguf.TokenType.USER_DEFINED)
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
-special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@ -388,7 +388,9 @@ def handle_metadata(cfg, hp):
        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
        cfg.vocabtype )
    # FIXME: Respect cfg.vocab_dir?
-    svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
+    svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
        load_merges = cfg.vocabtype == 'bpe',
        n_vocab = vocab.vocab_size)
    convert.check_vocab_size(params, vocab)
    return (params, vocab, svocab)
--- a/convert-mpt-hf-to-gguf.py
+++ b/convert-mpt-hf-to-gguf.py
@ -98,6 +98,8 @@ gguf_writer.add_embedding_length(hparams["d_model"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
 gguf_writer.add_head_count(hparams["n_heads"])
 if kv_n_heads := hparams["attn_config"].get("kv_n_heads"):
    gguf_writer.add_head_count_kv(kv_n_heads)
 gguf_writer.add_layer_norm_eps(1e-05)
 if hparams["attn_config"]["clip_qkv"] is not None:
    gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
@ -126,18 +128,27 @@ vocab_size = hparams["vocab_size"]
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    if i not in reverse_vocab:
-    scores.append(0.0) # dummy
+        tokens.append(f"[PAD{i}]")
-    toktypes.append(gguf.TokenType.NORMAL)
+        toktypes.append(gguf.TokenType.USER_DEFINED)
    elif reverse_vocab[i] in added_vocab:
        tokens.append(reverse_vocab[i])
        if tokenizer.added_tokens_decoder[i].special:
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            toktypes.append(gguf.TokenType.USER_DEFINED)
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
-special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
--- a/convert-refact-hf-to-gguf.py
+++ b/convert-refact-hf-to-gguf.py
@ -139,18 +139,27 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    if i not in reverse_vocab:
-    scores.append(0.0) # dummy
+        tokens.append(f"[PAD{i}]")
-    toktypes.append(gguf.TokenType.NORMAL)
+        toktypes.append(gguf.TokenType.USER_DEFINED)
    elif reverse_vocab[i] in added_vocab:
        tokens.append(reverse_vocab[i])
        if tokenizer.added_tokens_decoder[i].special:
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            toktypes.append(gguf.TokenType.USER_DEFINED)
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
-special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
--- a/convert-starcoder-hf-to-gguf.py
+++ b/convert-starcoder-hf-to-gguf.py
@ -111,18 +111,26 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    if i not in reverse_vocab:
-    scores.append(0.0) # dummy
+        tokens.append(f"[PAD{i}]")
-    toktypes.append(gguf.TokenType.NORMAL)
+        toktypes.append(gguf.TokenType.USER_DEFINED)
    elif reverse_vocab[i] in added_vocab:
        tokens.append(reverse_vocab[i])
        if tokenizer.added_tokens_decoder[i].special:
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            toktypes.append(gguf.TokenType.USER_DEFINED)
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
-
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
--- a/convert.py
+++ b/convert.py
@ -369,7 +369,7 @@ class SentencePieceVocab:
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids   = sorted(added_tokens.values())
        if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+            raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_list = [text for (text, idx) in items]
@ -803,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
 class OutputFile:
-    def __init__(self, fname_out: Path) -> None:
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
    def add_meta_arch(self, params: Params) -> None:
        name = "LLaMA"
@ -875,10 +875,10 @@ class OutputFile:
        self.gguf.close()
    @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
        check_vocab_size(params, vocab)
-        of = OutputFile(fname_out)
+        of = OutputFile(fname_out, endianess=endianess)
        # meta data
        of.add_meta_arch(params)
@ -903,10 +903,10 @@ class OutputFile:
        return dt.quantize(arr)
    @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
        check_vocab_size(params, vocab)
-        of = OutputFile(fname_out)
+        of = OutputFile(fname_out, endianess=endianess)
        # meta data
        of.add_meta_arch(params)
@ -1123,8 +1123,9 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
-    args = parser.parse_args(args_in)
+    parser.add_argument("--bigendian",   action="store_true",    help="model is executed on big endian machine")
    args = parser.parse_args(args_in)
    if args.dump_single:
        model_plus = lazy_load_file(args.model)
        do_dump_model(model_plus)
@ -1138,6 +1139,9 @@ def main(args_in: list[str] | None = None) -> None:
    if args.dump:
        do_dump_model(model_plus)
        return
    endianess = gguf.GGUFEndian.LITTLE
    if args.bigendian:
        endianess = gguf.GGUFEndian.BIG
    params = Params.load(model_plus)
    if params.n_ctx == -1:
@ -1159,10 +1163,13 @@ def main(args_in: list[str] | None = None) -> None:
    vocab: Vocab
    if args.vocab_only:
-        assert args.outfile, "need --outfile if using --vocab-only"
+        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        # FIXME: Try to respect vocab_dir somehow?
        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
-        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
+        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
            load_merges = args.vocabtype == 'bpe',
            n_vocab = vocab.vocab_size)
        outfile = args.outfile
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
        print(f"Wrote {outfile}")
@ -1174,7 +1181,9 @@ def main(args_in: list[str] | None = None) -> None:
        vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
        vocab = load_vocab(vocab_dir, args.vocabtype)
    # FIXME: Try to respect vocab_dir somehow?
-    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
+    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
        load_merges = args.vocabtype == 'bpe',
        n_vocab = vocab.vocab_size)
    model   = model_plus.model
    model   = convert_model_names(model, params)
@ -1185,7 +1194,7 @@ def main(args_in: list[str] | None = None) -> None:
    params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
    print(f"Wrote {outfile}")
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -12,26 +12,26 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
    add_subdirectory(main)
    add_subdirectory(quantize)
    add_subdirectory(quantize-stats)
    add_subdirectory(perplexity)
    add_subdirectory(embedding)
    add_subdirectory(save-load-state)
    add_subdirectory(benchmark)
    add_subdirectory(baby-llama)
    add_subdirectory(train-text-from-scratch)
    add_subdirectory(finetune)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(simple)
    add_subdirectory(batched)
    add_subdirectory(batched-bench)
    add_subdirectory(speculative)
    add_subdirectory(parallel)
    add_subdirectory(embd-input)
    add_subdirectory(llava)
    add_subdirectory(llama-bench)
    add_subdirectory(beam-search)
    add_subdirectory(benchmark)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(embedding)
    add_subdirectory(finetune)
    add_subdirectory(infill)
    add_subdirectory(llama-bench)
    add_subdirectory(llava)
    add_subdirectory(main)
    add_subdirectory(parallel)
    add_subdirectory(perplexity)
    add_subdirectory(quantize)
    add_subdirectory(quantize-stats)
    add_subdirectory(save-load-state)
    add_subdirectory(simple)
    add_subdirectory(speculative)
    add_subdirectory(train-text-from-scratch)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@ -114,7 +114,7 @@ int main(int argc, char ** argv) {
        return 1;
    }
-    llama_batch batch = llama_batch_init(n_kv_max, 0);
+    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
    // decode in batches of ctx_params.n_batch tokens
    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
@ -123,11 +123,12 @@ int main(int argc, char ** argv) {
            llama_batch batch_view = {
                n_tokens,
-                batch.token  + i,
+                batch.token    + i,
                nullptr,
-                batch.pos    + i,
+                batch.pos      + i,
-                batch.seq_id + i,
+                batch.n_seq_id + i,
-                batch.logits + i,
+                batch.seq_id   + i,
                batch.logits   + i,
                0, 0, 0, // unused
            };
@ -143,13 +144,8 @@ int main(int argc, char ** argv) {
    // warm up
    {
-        batch.n_tokens = 16;
+        for (int i = 0; i < 16; ++i) {
-
+            llama_batch_add(batch, 0, i, { 0 }, false);
        for (int i = 0; i < batch.n_tokens; ++i) {
            batch.token[i]  = 0;
            batch.pos[i]    = i;
            batch.seq_id[i] = 0;
            batch.logits[i] = false;
        }
        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
@ -158,6 +154,10 @@ int main(int argc, char ** argv) {
        }
    }
    LOG_TEE("\n");
    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
    LOG_TEE("\n");
    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
@ -174,13 +174,12 @@ int main(int argc, char ** argv) {
                    continue;
                }
-                batch.n_tokens = is_pp_shared ? pp : pl*pp;
+                llama_batch_clear(batch);
-                for (int i = 0; i < batch.n_tokens; ++i) {
+                const int n_tokens = is_pp_shared ? pp : pl*pp;
-                    batch.token[i]  = 0;
+
-                    batch.pos[i]    = i;
+                for (int i = 0; i < n_tokens; ++i) {
-                    batch.seq_id[i] = 0;
+                    llama_batch_add(batch, 0, i, { 0 }, false);
                    batch.logits[i] = false;
                }
                batch.logits[batch.n_tokens - 1] = true;
@ -204,13 +203,10 @@ int main(int argc, char ** argv) {
                const auto t_tg_start = ggml_time_us();
                for (int i = 0; i < tg; ++i) {
-                    batch.n_tokens = pl;
+                    llama_batch_clear(batch);
                    for (int j = 0; j < pl; ++j) {
-                        batch.token[j]  = 0;
+                        llama_batch_add(batch, 0, pp + i, { j }, true);
                        batch.pos[j]    = pp + i;
                        batch.seq_id[j] = j;
                        batch.logits[j] = true;
                    }
                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@ -69,7 +69,7 @@ for id: llama_token in tokens {
 print("\n")
-var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0)
+var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1)
 defer {
    llama_batch_free(batch)
 }
@ -80,7 +80,12 @@ batch.n_tokens = Int32(tokens.count)
 for (i, token) in tokens.enumerated() {
    batch.token[i] = token
    batch.pos[i] = Int32(i)
-    batch.seq_id[i] = 0
+    batch.n_seq_id[i] = 1
    // batch.seq_id[i][0] = 0
    // TODO: is this the proper way to do this?
    if let seq_id = batch.seq_id[i] {
        seq_id[0] = 0
    }
    batch.logits[i] = 0
 }
@ -169,7 +174,10 @@ while n_cur <= n_len {
        // push this new token for next evaluation
        batch.token[Int(batch.n_tokens)] = new_token_id
        batch.pos[Int(batch.n_tokens)] = n_cur
-        batch.seq_id[Int(batch.n_tokens)] = Int32(i)
+        batch.n_seq_id[Int(batch.n_tokens)] = 1
        if let seq_id = batch.seq_id[Int(batch.n_tokens)] {
            seq_id[0] = Int32(i)
        }
        batch.logits[Int(batch.n_tokens)] = 1
        i_batch[i] = batch.n_tokens
@ -209,7 +217,7 @@ llama_print_timings(context)
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let n_tokens = text.count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
+    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -11,12 +11,19 @@ int main(int argc, char ** argv) {
    gpt_params params;
    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
        return 1 ;
    }
    // number of parallel batches
    int n_parallel = 1;
    // total length of the sequences including the prompt
    int n_len = 32;
    // number of layers to offload to the GPU
    int n_gpu_layers = 0;
    if (argc >= 2) {
        params.model = argv[1];
    }
@ -29,13 +36,18 @@ int main(int argc, char ** argv) {
        n_parallel = std::atoi(argv[3]);
    }
    if (argc >= 5) {
        n_len = std::atoi(argv[4]);
    }
    if (argc >= 6) {
        n_gpu_layers = std::atoi(argv[5]);
    }
    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }
    // total length of the sequences including the prompt
    const int n_len = 32;
    // init LLM
    llama_backend_init(params.numa);
@ -44,7 +56,7 @@ int main(int argc, char ** argv) {
    llama_model_params model_params = llama_model_default_params();
-    // model_params.n_gpu_layers = 99; // offload all layers to the GPU
+    model_params.n_gpu_layers = n_gpu_layers;
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -97,20 +109,15 @@ int main(int argc, char ** argv) {
    fflush(stderr);
-    // create a llama_batch with size 512
+    // create a llama_batch
    // we use this object to submit token data for decoding
-
+    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0);
    // evaluate the initial prompt
-    batch.n_tokens = tokens_list.size();
+    for (size_t i = 0; i < tokens_list.size(); ++i) {
-
+        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
    for (int32_t i = 0; i < batch.n_tokens; i++) {
        batch.token[i]  = tokens_list[i];
        batch.pos[i]    = i;
        batch.seq_id[i] = 0;
        batch.logits[i] = false;
    }
    GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;
@ -146,7 +153,7 @@ int main(int argc, char ** argv) {
    while (n_cur <= n_len) {
        // prepare the next batch
-        batch.n_tokens = 0;
+        llama_batch_clear(batch);
        // sample the next token for each parallel sequence / stream
        for (int32_t i = 0; i < n_parallel; ++i) {
@ -180,7 +187,7 @@ int main(int argc, char ** argv) {
            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
            // is it an end of stream? -> mark the stream as finished
-            if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
+            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
                i_batch[i] = -1;
                LOG_TEE("\n");
                if (n_parallel > 1) {
@ -198,15 +205,10 @@ int main(int argc, char ** argv) {
            streams[i] += llama_token_to_piece(ctx, new_token_id);
            // push this new token for next evaluation
            batch.token [batch.n_tokens] = new_token_id;
            batch.pos   [batch.n_tokens] = n_cur;
            batch.seq_id[batch.n_tokens] = i;
            batch.logits[batch.n_tokens] = true;
            i_batch[i] = batch.n_tokens;
-            batch.n_tokens += 1;
+            // push this new token for next evaluation
            llama_batch_add(batch, new_token_id, n_cur, { i }, true);
            n_decode += 1;
        }
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@ -47,7 +47,7 @@ struct beam_search_callback_data {
 // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
 // For example, eob can be flagged due to maximum token length, stop words, etc.
 static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-    return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
+    return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
 }
 // Function matching type llama_beam_search_callback_fn_t.
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
    if (file.size < 4) {
        return false;
    }
-    uint32_t magic = file.read_u32();
+    std::string magic = file.read_string(4);
    return magic == GGUF_MAGIC;
 }
--- a/examples/embd-input/.gitignore
+++ b/examples/embd-input/.gitignore
@ -1,4 +0,0 @@
 PandaGPT
 MiniGPT-4
 *.pth
--- a/examples/embd-input/CMakeLists.txt
+++ b/examples/embd-input/CMakeLists.txt
@ -1,17 +0,0 @@
 set(TARGET embdinput)
 add_library(${TARGET} embd-input-lib.cpp embd-input.h)
 install(TARGETS ${TARGET} LIBRARY)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
 endif()
 set(TARGET embd-input-test)
 add_executable(${TARGET} embd-input-test.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
 endif()
--- a/examples/embd-input/README.md
+++ b/examples/embd-input/README.md
@ -1,63 +0,0 @@
 ### Examples for input embedding directly
 ## Requirement
 Build `libembdinput.so`.
 Run the following command in the main directory (../../).
 ```
 make
 ```
 ## [LLaVA](https://github.com/haotian-liu/LLaVA/) example  (llava.py)
 1. Obtain the LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
 2. Convert it to ggml format.
 3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
 ```
 import torch
 bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
 pth_path = "./examples/embd-input/llava_projection.pth"
 dic = torch.load(bin_path)
 used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
 torch.save({k: dic[k] for k in used_key}, pth_path)
 ```
 4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`.
 ## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
 1. Obtain the PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
 The `adapter_config.json` is
 ```
 {
  "peft_type": "LORA",
  "fan_in_fan_out": false,
  "bias": null,
  "modules_to_save": null,
  "r": 32,
  "lora_alpha": 32,
  "lora_dropout": 0.1,
  "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
 }
 ```
 2. Prepare the `vicuna` v0 model.
 3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
 4. Clone the PandaGPT source.
 ```
 git clone https://github.com/yxuansu/PandaGPT
 ```
 5. Install the requirement of PandaGPT.
 6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
 ## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
 1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
 2. Clone the MiniGPT-4 source.
 ```
 git clone https://github.com/Vision-CAIR/MiniGPT-4/
 ```
 3. Install the requirement of PandaGPT.
 4. Prepare the `vicuna` v0 model.
 5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@ -1,221 +0,0 @@
 #include "build-info.h"
 #include "common.h"
 #include "embd-input.h"
 #include <cassert>
 #include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <fstream>
 #include <iostream>
 #include <string>
 #include <vector>
 static llama_context ** g_ctx;
 extern "C" {
 /**
  * Parse llama.cpp style command-line arguments, load the model and wrap the
  * resulting context in a heap-allocated MyModel handle.
  *
  * @param argc number of entries in argv
  * @param argv argument vector handed to gpt_params_parse
  * @return a new MyModel (release it with free_mymodel), or nullptr when the
  *         arguments cannot be parsed or the model cannot be loaded.
  */
 struct MyModel* create_mymodel(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return nullptr;
    }
    print_build_info();
    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = uint32_t(time(NULL));
    }
    // the seed is assigned from a uint32_t above, so print it as unsigned
    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
    llama_backend_init(params.numa);
    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
    // load the model and apply lora adapter, if any
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return nullptr;
    }
    // print system information
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }
    struct MyModel * ret = new MyModel();
    ret->ctx = ctx;
    ret->params = params;
    ret->n_past = 0;
    // Point the file-level alias at the heap-owned slot. Taking the address of
    // the local `ctx` would leave g_ctx dangling as soon as this function returns.
    g_ctx = &ret->ctx;
    // NOTE(review): `model` is not stored in MyModel, so nothing in this file
    // can release it later — confirm whether that is intentional.
    return ret;
 }
 void free_mymodel(struct MyModel * mymodel) {
    llama_context * ctx = mymodel->ctx;
    llama_print_timings(ctx);
    llama_free(ctx);
    delete mymodel;
 }
 bool eval_float(void * model, float * input, int N){
    MyModel * mymodel = (MyModel*)model;
    llama_context * ctx = mymodel->ctx;
    gpt_params params = mymodel->params;
    int n_emb = llama_n_embd(llama_get_model(ctx));
    int n_past = mymodel->n_past;
    int n_batch = N; // params.n_batch;
    for (int i = 0; i < (int) N; i += n_batch) {
        int n_eval = (int) N - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        llama_batch batch = {  int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
        if (llama_decode(ctx, batch)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        n_past += n_eval;
    }
    mymodel->n_past = n_past;
    return true;
 }
 bool eval_tokens(void * model, std::vector<llama_token> tokens) {
    MyModel * mymodel = (MyModel* )model;
    llama_context * ctx;
    ctx = mymodel->ctx;
    gpt_params params = mymodel->params;
    int n_past = mymodel->n_past;
    for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > params.n_batch) {
            n_eval = params.n_batch;
        }
        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        n_past += n_eval;
    }
    mymodel->n_past = n_past;
    return true;
 }
 bool eval_id(struct MyModel* mymodel, int id) {
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    return eval_tokens(mymodel, tokens);
 }
 /**
  * Tokenize a C string and evaluate the resulting tokens.
  *
  * The tokenizer is called with its third argument (add_bos) set to true on
  * every call.
  *
  * @param mymodel handle returned by create_mymodel
  * @param str     NUL-terminated text to evaluate
  * @return the result of eval_tokens; false as well when str is null.
  */
 bool eval_string(struct MyModel * mymodel,const char* str){
    if (str == nullptr) {
        // constructing std::string from a null pointer is undefined behaviour
        fprintf(stderr, "%s : null input string\n", __func__);
        return false;
    }
    llama_context * ctx = mymodel->ctx;
    const std::string text = str;
    const std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, text, true);
    // report evaluation failures to the caller instead of always returning true
    return eval_tokens(mymodel, embd_inp);
 }
 /**
  * Sample the next token id from the context's current logits.
  *
  * Strategy, driven by the handle's sampling parameters:
  *   - temp <= 0      : greedy
  *   - mirostat == 1  : temperature + mirostat v1
  *   - mirostat == 2  : temperature + mirostat v2
  *   - otherwise      : top-k, tail-free, typical, top-p, temperature, then sample
  *
  * The logit_bias map is added to the context's logits in place before sampling.
  * Repetition / frequency / presence penalties are not applied.
  *
  * @param mymodel handle returned by create_mymodel
  * @return the sampled token id
  */
 llama_token sampling_id(struct MyModel* mymodel) {
    llama_context* ctx = mymodel->ctx;
    // Bind by reference: copying the whole gpt_params for every sampled token
    // is wasted work, and nothing here modifies the parameters.
    const llama_sampling_params & sparams = mymodel->params.sampling_params;
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    // out of user input, sample next token
    const float   temp            = sparams.temp;
    const int32_t top_k           = sparams.top_k <= 0 ? n_vocab : sparams.top_k;
    const float   top_p           = sparams.top_p;
    const float   tfs_z           = sparams.tfs_z;
    const float   typical_p       = sparams.typical_p;
    const int     mirostat        = sparams.mirostat;
    const float   mirostat_tau    = sparams.mirostat_tau;
    const float   mirostat_eta    = sparams.mirostat_eta;
    auto logits = llama_get_logits(ctx);
    // Apply params.logit_bias map; ids outside [0, n_vocab) are skipped so a bad
    // key cannot write past the logits buffer.
    for (const auto & bias : sparams.logit_bias) {
        if (bias.first >= 0 && bias.first < n_vocab) {
            logits[bias.first] += bias.second;
        }
    }
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
    // TODO: Apply penalties (repetition, frequency/presence, and restoring the
    //       newline logit when penalize_nl is off) — this needs a history of
    //       recent tokens, which this wrapper does not keep.
    if (temp <= 0) {
        // Greedy sampling
        return llama_sample_token_greedy(ctx, &candidates_p);
    }
    if (mirostat == 1) {
        // function-local static: initialised on the first call only and shared
        // by every later call, whichever handle is passed
        static float mirostat_mu = 2.0f * mirostat_tau;
        const int mirostat_m = 100;
        llama_sample_temp(ctx, &candidates_p, temp);
        return llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
    }
    if (mirostat == 2) {
        // separate static from the v1 branch, with the same first-call initialisation
        static float mirostat_mu = 2.0f * mirostat_tau;
        llama_sample_temp(ctx, &candidates_p, temp);
        return llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
    }
    // Temperature sampling
    llama_sample_top_k(ctx, &candidates_p, top_k, 1);
    llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
    llama_sample_typical(ctx, &candidates_p, typical_p, 1);
    llama_sample_top_p(ctx, &candidates_p, top_p, 1);
    llama_sample_temp(ctx, &candidates_p, temp);
    return llama_sample_token(ctx, &candidates_p);
 }
 /**
  * Sample one token, feed it back into the model and return its text.
  *
  * The returned pointer refers to a function-local static string, so it is
  * overwritten by the next call. The end-of-sequence token is reported as
  * the literal "</s>".
  */
 const char * sampling(struct MyModel * mymodel) {
    static std::string ret;
    llama_context * ctx = mymodel->ctx;
    const int next_id = sampling_id(mymodel);
    const bool is_eos = (next_id == llama_token_eos(ctx));
    ret = is_eos ? std::string("</s>") : llama_token_to_piece(ctx, next_id);
    // evaluate the sampled token so the next call continues from it
    eval_id(mymodel, next_id);
    return ret.c_str();
 }
 }
--- a/examples/embd-input/embd-input-test.cpp
+++ b/examples/embd-input/embd-input-test.cpp
@ -1,35 +0,0 @@
 #include "embd-input.h"
 #include <stdlib.h>
 #include <random>
 #include <string.h>
 /**
  * Smoke test for the embd-input library: evaluates a text prompt, N random
  * embedding vectors and the user prompt, then streams up to max_tgt_len
  * sampled tokens to stdout.
  *
  * @return 0 on success, 1 when the model cannot be created.
  */
 int main(int argc, char** argv) {
    auto mymodel = create_mymodel(argc, argv);
    if (mymodel == nullptr) {
        // create_mymodel returns nullptr on bad arguments or a failed model load
        fprintf(stderr, "%s: failed to create model\n", __func__);
        return 1;
    }
    int N = 10;
    int max_tgt_len = 500;
    int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
    // add random float embd to test evaluation
    float * data = new float[N*n_embd];
    std::default_random_engine e;
    std::uniform_real_distribution<float>  u(0,1);
    for (int i=0;i<N*n_embd;i++) {
        data[i] = u(e);
    }
    eval_string(mymodel, "user: what is the color of the flag of UN?");
    eval_float(mymodel, data, N);
    // the embeddings have been consumed by eval_float; release the buffer
    delete[] data;
    eval_string(mymodel, "assistant:");
    eval_string(mymodel, mymodel->params.prompt.c_str());
    const char* tmp;
    for (int i=0; i<max_tgt_len; i++) {
        tmp = sampling(mymodel);
        if (strcmp(tmp, "</s>")==0) break;
        printf("%s", tmp);
        fflush(stdout);
    }
    printf("\n");
    free_mymodel(mymodel);
    return 0;
 }
--- a/examples/embd-input/embd-input.h
+++ b/examples/embd-input/embd-input.h
@ -1,27 +0,0 @@
 #ifndef _EMBD_INPUT_H_
 #define _EMBD_INPUT_H_ 1
 #include "common.h"
 #include "llama.h"
 // C-linkage entry points of the embd-input library (loaded from Python via
 // ctypes in embd_input.py).
 extern "C" {
 // Handle passed to every function below.
 typedef struct MyModel {
    llama_context* ctx;   // context the evaluation and sampling calls operate on
    gpt_params params;    // parsed command-line parameters the handle was created with
    int n_past = 0;       // running count of evaluated positions
 } MyModel;
 // Parse arguments, load the model and return a new handle (nullptr on failure).
 struct MyModel* create_mymodel(int argc, char ** argv);
 // Evaluate N embedding vectors stored consecutively in `input`.
 bool eval_float(void* model, float* input, int N);
 // Evaluate a sequence of token ids.
 bool eval_tokens(void* model, std::vector<llama_token> tokens);
 // Evaluate a single token id.
 bool eval_id(struct MyModel* mymodel, int id);
 // Tokenize `str` and evaluate the resulting tokens.
 bool eval_string(struct MyModel* mymodel, const char* str);
 // Sample one token, evaluate it, and return its text ("</s>" for end of sequence).
 const char * sampling(struct MyModel* mymodel);
 // Sample one token id from the current logits without evaluating it.
 llama_token sampling_id(struct MyModel* mymodel);
 // Print timings, free the context and delete the handle.
 void free_mymodel(struct MyModel* mymodel);
 }
 #endif
--- a/examples/embd-input/embd_input.py
+++ b/examples/embd-input/embd_input.py
@ -1,72 +0,0 @@
 #!/usr/bin/env python3
 import ctypes
 from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
 import numpy as np
 import os
 libc = cdll.LoadLibrary("./libembdinput.so")
 libc.sampling.restype=c_char_p
 libc.create_mymodel.restype=c_void_p
 libc.eval_string.argtypes=[c_void_p, c_char_p]
 libc.sampling.argtypes=[c_void_p]
 libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
 class MyModel:
    """ctypes wrapper around a native ``MyModel`` handle from libembdinput.

    ``args`` is an argv-style list (program name first) forwarded to the
    native ``create_mymodel``.
    """
    def __init__(self, args):
        # set first so __del__ is safe even if construction fails below
        self.model = None
        argc = len(args)
        c_str = [c_char_p(i.encode()) for i in args]
        args_c = (c_char_p * argc)(*c_str)
        handle = libc.create_mymodel(argc, args_c)
        if not handle:
            # a NULL handle would otherwise be dereferenced by the next native call
            raise RuntimeError("create_mymodel failed: bad arguments or the model could not be loaded")
        self.model = c_void_p(handle)
        self.max_tgt_len = 512
        self.print_string_eval = True
    def __del__(self):
        # free only a live handle, and only once
        model = getattr(self, "model", None)
        if model:
            libc.free_mymodel(model)
            self.model = None
    def eval_float(self, x):
        """Evaluate the embeddings in ``x``; ``x.shape[1]`` is passed as the count.

        NOTE(review): callers in this directory pass a transposed array, and
        ``astype`` keeps the source memory order by default, so the native side
        presumably sees ``x.shape[1]`` consecutive embedding vectors. Confirm
        before adding any contiguity conversion here.
        """
        libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
    def eval_string(self, x):
        """Tokenize and evaluate the text ``x``; echo it when print_string_eval is set."""
        libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
        if self.print_string_eval:
            print(x)
    def eval_token(self, x):
        """Evaluate a single token id."""
        libc.eval_id(self.model, x)
    def sampling(self):
        """Sample and evaluate one token; returns its text as ``bytes``."""
        s = libc.sampling(self.model)
        return s
    def stream_generate(self, end="</s>"):
        """Yield sampled pieces (``bytes``) until the accumulated output ends
        with ``end`` or ``max_tgt_len`` pieces have been produced."""
        ret = b""
        end = end.encode()
        for _ in range(self.max_tgt_len):
            tmp = self.sampling()
            ret += tmp
            yield tmp
            if ret.endswith(end):
                break
    def generate_with_print(self, end="</s>"):
        """Generate while printing each piece as it arrives; returns the full text."""
        ret = b""
        for i in self.stream_generate(end=end):
            ret += i
            print(i.decode(errors="replace"), end="", flush=True)
        print("")
        return ret.decode(errors="replace")
    def generate(self, end="</s>"):
        """Generate silently and return the full decoded text as ``str``."""
        text = b"".join(self.stream_generate(end=end))
        return text.decode(errors="replace")
 if __name__ == "__main__":
    # Smoke test: text prompt, random embeddings, then streamed generation.
    model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
    model.eval_string("""user: what is the color of the flag of UN?""")
    x = np.random.random((5120,10))# , dtype=np.float32)
    model.eval_float(x)
    model.eval_string("""assistant:""")
    # stream_generate() yields bytes pieces. generate() returns one decoded str,
    # and iterating that gives characters, which have no .decode().
    for i in model.stream_generate():
        print(i.decode(errors="replace"), end="", flush=True)
--- a/examples/embd-input/llava.py
+++ b/examples/embd-input/llava.py
@ -1,71 +0,0 @@
 #!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from embd_input import MyModel
 import numpy as np
 from torch import nn
 import torch
 from transformers import CLIPVisionModel,  CLIPImageProcessor
 from PIL import Image
 # model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
 vision_tower = "openai/clip-vit-large-patch14"
 select_hidden_state_layer = -2
 # (vision_config.image_size // vision_config.patch_size) ** 2
 image_token_len = (224//14)**2
 class Llava:
    """LLaVA-style chat: a CLIP vision tower plus a linear projector feeding
    image embeddings into the native model wrapped by ``MyModel``."""
    def __init__(self, args):
        """Load the CLIP processor and vision model named by ``vision_tower``
        and create the native model from ``args`` (llama.cpp CLI arguments)."""
        self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
        self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
        # 1024 -> 5120 projection; presumably CLIP hidden size -> LLaMA-13B
        # embedding width — TODO confirm
        self.mm_projector = nn.Linear(1024, 5120)
        self.model = MyModel(["main", *args])
    def load_projection(self, path):
        """Load projector weights from the checkpoint at ``path``, reading the
        ``model.mm_projector.weight`` and ``model.mm_projector.bias`` entries."""
        state = torch.load(path)
        self.mm_projector.load_state_dict({
            "weight": state["model.mm_projector.weight"],
            "bias": state["model.mm_projector.bias"]})
    def chat(self, question):
        """Ask a text-only question; prints and returns the generated answer."""
        self.model.eval_string("user: ")
        self.model.eval_string(question)
        self.model.eval_string("\nassistant: ")
        return self.model.generate_with_print()
    def chat_with_image(self, image, question):
        """Ask ``question`` about ``image``; prints and returns the answer.

        The image embeddings are evaluated between the im_start and im_end
        tokens, padded with im_patch tokens up to ``image_token_len`` positions.
        """
        with torch.no_grad():
            embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
            image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
            select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
            # drop the first position along axis 1 (presumably the CLS token — confirm)
            image_feature = select_hidden_state[:, 1:]
            embd_image = self.mm_projector(image_feature)
            # take batch element 0 as a numpy array
            embd_image = embd_image.cpu().numpy()[0]
        self.model.eval_string("user: ")
        self.model.eval_token(32003-2) # im_start
        # MyModel.eval_float uses shape[1] as the embedding count, hence the transpose
        self.model.eval_float(embd_image.T)
        for i in range(image_token_len-embd_image.shape[0]):
            self.model.eval_token(32003-3) # im_patch
        self.model.eval_token(32003-1) # im_end
        self.model.eval_string(question)
        self.model.eval_string("\nassistant: ")
        return self.model.generate_with_print()
 if __name__=="__main__":
    # Demo: load the model and projector, ask about an image, then ask a follow-up.
    # model from liuhaotian/LLaVA-13b-delta-v1-1
    a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
    # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
    # Also here can use pytorch_model-00003-of-00003.bin directly.
    a.load_projection(os.path.join(
        os.path.dirname(__file__) ,
        "llava_projection.pth"))
    respose = a.chat_with_image(
        Image.open("./media/llama1-logo.png").convert('RGB'),
        "what is the text in the picture?")
    # bare expression: has no effect when run as a script (the answer was
    # already printed by generate_with_print)
    respose
    a.chat("what is the color of it?")
--- a/examples/embd-input/minigpt4.py
+++ b/examples/embd-input/minigpt4.py
@ -1,129 +0,0 @@
 #!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from embd_input import MyModel
 import numpy as np
 from torch import nn
 import torch
 from PIL import Image
 minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
 sys.path.insert(0, minigpt4_path)
 from minigpt4.models.blip2 import Blip2Base
 from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
 class MiniGPT4(Blip2Base):
    """
    MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4

    Encodes images with a vision encoder and Q-Former, projects the result
    with ``llama_proj`` and feeds it to the native model wrapped by ``MyModel``.
    """
    def __init__(self,
        args,
        vit_model="eva_clip_g",
        q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
        img_size=224,
        drop_path_rate=0,
        use_grad_checkpoint=False,
        vit_precision="fp32",
        freeze_vit=True,
        freeze_qformer=True,
        num_query_token=32,
        llama_model="",
        prompt_path="",
        prompt_template="",
        max_txt_len=32,
        end_sym='\n',
        low_resource=False,  # use 8 bit and put vit in cpu
        device_8bit=0
    ):
        # ``args`` are llama.cpp CLI arguments forwarded to MyModel.
        # freeze_vit, freeze_qformer, llama_model, prompt_path, prompt_template
        # and device_8bit are accepted but not read in this constructor.
        super().__init__()
        self.img_size = img_size
        self.low_resource = low_resource
        self.preprocessor = Blip2ImageEvalProcessor(img_size)
        print('Loading VIT')
        self.visual_encoder, self.ln_vision = self.init_vision_encoder(
            vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
        )
        print('Loading VIT Done')
        print('Loading Q-Former')
        self.Qformer, self.query_tokens = self.init_Qformer(
            num_query_token, self.visual_encoder.num_features
        )
        # clear Q-Former submodules before loading the pretrained weights
        # (presumably the parts the image-only path does not use — confirm)
        self.Qformer.cls = None
        self.Qformer.bert.embeddings.word_embeddings = None
        self.Qformer.bert.embeddings.position_embeddings = None
        for layer in self.Qformer.bert.encoder.layer:
            layer.output = None
            layer.intermediate = None
        self.load_from_pretrained(url_or_filename=q_former_model)
        print('Loading Q-Former Done')
        # output width is hard-coded to 5120 instead of being read from the LLaMA config
        self.llama_proj = nn.Linear(
            self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
        )
        self.max_txt_len = max_txt_len
        self.end_sym = end_sym
        self.model = MyModel(["main", *args])
        # system prompt
        self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
           "You will be able to see the image once I provide it to you. Please answer my questions."
           "###")
    def encode_img(self, image):
        """Preprocess ``image`` and return the ``llama_proj`` projection of the
        Q-Former's last hidden state (a batch of one image)."""
        image = self.preprocessor(image)
        image = image.unsqueeze(0)
        device = image.device
        if self.low_resource:
            self.vit_to_cpu()
            image = image.to("cpu")
        with self.maybe_autocast():
            image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
            # all-ones attention mask over every dimension of image_embeds except the last
            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
            query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
            query_output = self.Qformer.bert(
                query_embeds=query_tokens,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )
            inputs_llama = self.llama_proj(query_output.last_hidden_state)
            # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
        return inputs_llama
    def load_projection(self, path):
        """Load ``llama_proj`` weights from the ``"model"`` entry of the
        checkpoint at ``path``."""
        state = torch.load(path)["model"]
        self.llama_proj.load_state_dict({
            "weight": state["llama_proj.weight"],
            "bias": state["llama_proj.bias"]})
    def chat(self, question):
        """Ask a text-only question; generation stops at "###"."""
        self.model.eval_string("Human: ")
        self.model.eval_string(question)
        self.model.eval_string("\n### Assistant:")
        return self.model.generate_with_print(end="###")
    def chat_with_image(self, image, question):
        """Ask ``question`` about ``image``; the image embeddings are evaluated
        between "<Img>" and "</Img>", and generation stops at "###"."""
        with torch.no_grad():
            embd_image = self.encode_img(image)
        # take batch element 0 as a numpy array
        embd_image = embd_image.cpu().numpy()[0]
        self.model.eval_string("Human: <Img>")
        # MyModel.eval_float uses shape[1] as the embedding count, hence the transpose
        self.model.eval_float(embd_image.T)
        self.model.eval_string("</Img> ")
        self.model.eval_string(question)
        self.model.eval_string("\n### Assistant:")
        return self.model.generate_with_print(end="###")
 if __name__=="__main__":
    # Demo: load the model and projector, ask about an image, then ask a follow-up.
    a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
    # the projector checkpoint is expected next to this script
    a.load_projection(os.path.join(
        os.path.dirname(__file__) ,
        "pretrained_minigpt4.pth"))
    respose = a.chat_with_image(
        Image.open("./media/llama1-logo.png").convert('RGB'),
        "what is the text in the picture?")
    a.chat("what is the color of it?")
--- a/examples/embd-input/panda_gpt.py
+++ b/examples/embd-input/panda_gpt.py
@ -1,99 +0,0 @@
 #!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from embd_input import MyModel
 import numpy as np
 from torch import nn
 import torch
 # use PandaGPT path
 panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
 imagebind_ckpt_path = "./models/panda_gpt/"
 sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
 from ImageBind.models import imagebind_model
 from ImageBind import data
 ModalityType = imagebind_model.ModalityType
 max_tgt_len = 400
 class PandaGPT:
    """PandaGPT-style chat: ImageBind embeddings projected by ``llama_proj``
    and fed into the native model wrapped by ``MyModel``."""
    def __init__(self, args):
        """Load the ImageBind encoder from ``imagebind_ckpt_path`` and create
        the native model from ``args`` (llama.cpp CLI arguments)."""
        self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
        self.visual_encoder.eval()
        self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
        self.max_tgt_len = max_tgt_len
        self.model = MyModel(["main", *args])
        # accumulates every answer; an empty value marks the first turn
        self.generated_text = ""
        self.device = "cpu"
    def load_projection(self, path):
        """Load ``llama_proj`` weights from the checkpoint at ``path`` (mapped to CPU)."""
        state = torch.load(path, map_location="cpu")
        self.llama_proj.load_state_dict({
            "weight": state["llama_proj.weight"],
            "bias": state["llama_proj.bias"]})
    def eval_inputs(self, inputs):
        """Evaluate the embeddings of every modality in ``inputs`` between
        "<Img>" and "</Img> "."""
        self.model.eval_string("<Img>")
        embds = self.extract_multimoal_feature(inputs)
        for i in embds:
            # MyModel.eval_float uses shape[1] as the embedding count, hence the transpose
            self.model.eval_float(i.T)
        self.model.eval_string("</Img> ")
    def chat(self, question):
        """Ask a text-only question (no multimodal inputs)."""
        return self.chat_with_image(None, question)
    def chat_with_image(self, inputs, question):
        """Ask ``question``, optionally preceded by multimodal ``inputs``.

        ``inputs`` is a dict with optional ``image_paths``, ``audio_paths``,
        ``video_paths`` and ``thermal_paths`` keys, or a falsy value for none.
        Generation stops at "###"; the answer is printed, appended to
        ``generated_text`` and returned.
        """
        # the leading "###" is only sent while nothing has been generated yet
        if self.generated_text == "":
            self.model.eval_string("###")
        self.model.eval_string(" Human: ")
        if inputs:
            self.eval_inputs(inputs)
        self.model.eval_string(question)
        self.model.eval_string("\n### Assistant:")
        ret = self.model.generate_with_print(end="###")
        self.generated_text += ret
        return ret
    def extract_multimoal_feature(self, inputs):
        """Return one projected embedding array per modality present in
        ``inputs``, in the order image, audio, video, thermal."""
        features = []
        for key in ["image", "audio", "video", "thermal"]:
            if key + "_paths" in inputs:
                embeds = self.encode_data(key, inputs[key+"_paths"])
                features.append(embeds)
        return features
    def encode_data(self, data_type, data_paths):
        """Load ``data_paths`` with the loader for ``data_type``, encode them
        with ImageBind and return the ``llama_proj`` projection as a numpy array."""
        # video shares the VISION modality key with image but has its own loader
        type_map = {
            "image": ModalityType.VISION,
            "audio": ModalityType.AUDIO,
            "video": ModalityType.VISION,
            "thermal": ModalityType.THERMAL,
        }
        load_map = {
            "image": data.load_and_transform_vision_data,
            "audio": data.load_and_transform_audio_data,
            "video": data.load_and_transform_video_data,
            "thermal": data.load_and_transform_thermal_data
        }
        load_function = load_map[data_type]
        key = type_map[data_type]
        inputs = {key: load_function(data_paths, self.device)}
        with torch.no_grad():
            embeddings = self.visual_encoder(inputs)
            embeds = embeddings[key]
            embeds = self.llama_proj(embeds).cpu().numpy()
        return embeds
 if __name__=="__main__":
    # Demo: vicuna model with the PandaGPT lora adapter and "--temp 0"; asks
    # about an image, then asks a follow-up question.
    a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
    # the projection weights are read from the original (unconverted) adapter file
    a.load_projection("./models/panda_gpt/adapter_model.bin")
    a.chat_with_image(
        {"image_paths": ["./media/llama1-logo.png"]},
        "what is the text in the picture? 'llama' or 'lambda'?")
    a.chat("what is the color of it?")
--- a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
+++ b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
--- a/examples/gptneox-wip/falcon-main.cpp
+++ b/examples/gptneox-wip/falcon-main.cpp
--- a/examples/gptneox-wip/gptneox-main.cpp
+++ b/examples/gptneox-wip/gptneox-main.cpp
--- a/examples/infill/CMakeLists.txt
+++ b/examples/infill/CMakeLists.txt
@ -4,5 +4,5 @@ install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
 endif()
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -39,8 +39,8 @@ static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 static bool is_interacting = false;
 static void write_logfile(
    const llama_context * ctx, const gpt_params & params, const llama_model * model,
@ -104,7 +104,7 @@ static void sigint_handler(int signo) {
 int main(int argc, char ** argv) {
    gpt_params params;
-    llama_sampling_params & sparams = params.sampling_params;
+    llama_sampling_params & sparams = params.sparams;
    g_params = &params;
    if (!gpt_params_parse(argc, argv, params)) {
@ -246,23 +246,23 @@ int main(int argc, char ** argv) {
    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
        inp_sfx.erase(inp_sfx.begin());
    }
-    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
+    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
    if (add_bos) {
-        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
+        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
    }
-    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
+    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
    embd_inp = inp_pfx;
    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-    embd_inp.push_back(llama_token_middle(ctx));
+    embd_inp.push_back(llama_token_middle(model));
    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    // Should not run without any tokens
    if (embd_inp.empty()) {
-        embd_inp.push_back(llama_token_bos(ctx));
+        embd_inp.push_back(llama_token_bos(model));
-        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }
    // Tokenize negative prompt
@ -273,10 +273,10 @@ int main(int argc, char ** argv) {
        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
+        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
+        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
        original_prompt_len = original_inp.size();
        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
@ -294,8 +294,8 @@ int main(int argc, char ** argv) {
        params.n_keep = (int)embd_inp.size();
    }
-    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
+    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
-    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
+    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
    // enable interactive mode if interactive start is specified
@ -358,39 +358,10 @@ int main(int argc, char ** argv) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
-    LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
+    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
            sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");
    struct llama_grammar * grammar = NULL;
    grammar_parser::parse_state parsed_grammar;
    if (!params.grammar.empty()) {
        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
        // will be empty (default) if there are parse errors
        if (parsed_grammar.rules.empty()) {
            return 1;
        }
        LOG_TEE("%s: grammar:\n", __func__);
        grammar_parser::print_grammar(stderr, parsed_grammar);
        LOG_TEE("\n");
        {
            auto it = sparams.logit_bias.find(llama_token_eos(ctx));
            if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
                LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
            }
        }
        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
        grammar = llama_grammar_init(
            grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
    }
    // TODO: replace with ring-buffer
    std::vector<llama_token> last_tokens(n_ctx);
    std::fill(last_tokens.begin(), last_tokens.end(), 0);
    LOG_TEE("\n#####  Infill mode  #####\n\n");
    if (params.infill) {
        printf("\n************\n");
@ -433,11 +404,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;
-    const int n_vocab = llama_n_vocab(model);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    while (n_remain != 0 || params.interactive) {
        // predict
@ -484,7 +451,7 @@ int main(int argc, char ** argv) {
                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
-                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
+                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
            }
@ -512,7 +479,7 @@ int main(int argc, char ** argv) {
                    input_buf  = embd_guidance.data();
                    input_size = embd_guidance.size();
-                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
+                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
                } else {
                    input_buf  = embd.data();
                    input_size = embd.size();
@ -535,7 +502,7 @@ int main(int argc, char ** argv) {
                    n_eval = params.n_batch;
                }
-                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
+                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
                    LOG_TEE("%s : failed to eval\n", __func__);
@ -554,12 +521,11 @@ int main(int argc, char ** argv) {
        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
+            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
-            last_tokens.erase(last_tokens.begin());
+            llama_sampling_accept(ctx_sampling, ctx, id, true);
            last_tokens.push_back(id);
-            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
+            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
            embd.push_back(id);
@ -575,8 +541,11 @@ int main(int argc, char ** argv) {
            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);
-                last_tokens.erase(last_tokens.begin());
+
-                last_tokens.push_back(embd_inp[n_consumed]);
+                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
@ -608,10 +577,10 @@ int main(int argc, char ** argv) {
        if ((int) embd_inp.size() <= n_consumed) {
            // deal with eot token in infill mode
-            if ((last_tokens.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
+            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
                if(is_interacting && !params.interactive_first) {
                    // print an eot token
-                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
+                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
                }
                fflush(stdout);
                printf("\n");
@ -625,7 +594,7 @@ int main(int argc, char ** argv) {
                    buffer += line;
                } while (another_line);
                // check if we got an empty line, if so we use the old input
-                if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
+                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_prefix = buffer;
                }
                buffer.clear();
@ -635,7 +604,7 @@ int main(int argc, char ** argv) {
                    buffer += line;
                } while (another_line);
                // check if we got an empty line
-                if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
+                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_suffix = buffer;
                }
                buffer.clear();
@ -648,7 +617,7 @@ int main(int argc, char ** argv) {
                    process_escapes(params.input_suffix);
                }
                suff_rm_leading_spc = params.escape;
-                if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
+                if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
                    params.input_suffix.erase(0, 1);
                    suff_rm_leading_spc = false;
                }
@ -658,14 +627,14 @@ int main(int argc, char ** argv) {
                if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
                    inp_sfx.erase(inp_sfx.begin());
                }
-                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
+                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
                if (add_bos) {
-                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
+                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
                }
-                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
+                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
                embd_inp = inp_pfx;
                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-                embd_inp.push_back(llama_token_middle(ctx));
+                embd_inp.push_back(llama_token_middle(model));
                embd.clear();
                embd_guidance.clear();
                n_remain = params.n_predict;
@ -675,7 +644,7 @@ int main(int argc, char ** argv) {
                is_interacting = false;
            }
            // deal with end of text token in interactive mode
-            else if (last_tokens.back() == llama_token_eos(ctx)) {
+            else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
                LOG("found EOS token\n");
                if (params.interactive) {
@ -692,7 +661,7 @@ int main(int argc, char ** argv) {
                if (params.input_prefix_bos) {
                    LOG("adding input prefix BOS token\n");
-                    embd_inp.push_back(llama_token_bos(ctx));
+                    embd_inp.push_back(llama_token_bos(model));
                }
                std::string buffer;
@ -727,7 +696,7 @@ int main(int argc, char ** argv) {
                    const size_t original_size = embd_inp.size();
                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
-                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
+                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
@ -748,22 +717,14 @@ int main(int argc, char ** argv) {
            if (n_past > 0) {
                if (is_interacting) {
-                    // reset grammar state if we're restarting generation
+                    llama_sampling_reset(ctx_sampling);
                    if (grammar != NULL) {
                        llama_grammar_free(grammar);
                        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
                        grammar = llama_grammar_init(
                            grammar_rules.data(), grammar_rules.size(),
                            parsed_grammar.symbol_ids.at("root"));
                    }
                }
                is_interacting = false;
            }
        }
        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) {
+        if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
            break;
        }
@ -775,7 +736,7 @@ int main(int argc, char ** argv) {
        }
    }
    if (!params.interactive && n_remain <= 0) {
-        printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
+        printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
        fflush(stdout);
    }
@ -786,9 +747,7 @@ int main(int argc, char ** argv) {
    llama_free(ctx);
    llama_free_model(model);
-    if (grammar != NULL) {
+    llama_sampling_free(ctx_sampling);
        llama_grammar_free(grammar);
    }
    llama_backend_free();
 #ifndef LOG_DISABLE_LOGS
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -933,7 +933,7 @@ struct sql_printer : public printer {
 };
 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
-    std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
+    std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
    int n_processed = 0;
    llama_set_n_threads(ctx, n_threads, n_threads);
@ -946,7 +946,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
 }
 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
-    llama_token token = llama_token_bos(ctx);
+    llama_token token = llama_token_bos(llama_get_model(ctx));
    llama_set_n_threads(ctx, n_threads, n_threads);
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@ -1,7 +1,7 @@
 set(TARGET clip)
 add_library(${TARGET} clip.cpp clip.h)
 install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -112,8 +112,7 @@ static float get_f32(const gguf_context * ctx, const std::string & key) {
 static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
    struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
    if (!cur) {
-        printf("unable to find tensor %s\n", name.c_str());
+        throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str()));
        throw std::runtime_error(format("unable to find tensor %s\n", name.c_str()));
    }
    return cur;
@ -136,7 +135,7 @@ static std::string get_ftype(int ftype) {
    case 8:
        return "q8_0";
    default:
-        throw std::runtime_error(format("Unrecognized file type: %d\n", ftype));
+        throw std::runtime_error(format("%s: Unrecognized file type: %d\n", __func__, ftype));
    }
 }
@ -462,6 +461,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
    }
    if (verbosity >= 1) {
        const int n_tensors = gguf_get_n_tensors(ctx);
@ -608,8 +610,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
        int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
        for (int i = 0; i < 3; ++i) {
-            new_clip->image_mean[i] = *((float *)gguf_get_arr_data(ctx, idx_mean));
+            new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
-            new_clip->image_std[i] = *((float *)gguf_get_arr_data(ctx, idx_std));
+            new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
        }
        if (verbosity >= 2) {
--- a/examples/llava/llava-surgery.py
+++ b/examples/llava/llava-surgery.py
@ -16,13 +16,29 @@ checkpoint = torch.load(path)
 mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]
 # store these tensors in a new dictionary and torch.save them
-projector = {name: checkpoint[name] for name in mm_tensors}
+projector = {name: checkpoint[name].float() for name in mm_tensors}
 torch.save(projector, f"{args.model}/llava.projector")
 # remove these tensors from the checkpoint and save it again
 for name in mm_tensors:
    del checkpoint[name]
 # BakLLaVA models contain CLIP tensors in it
 clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
 if len(clip_tensors) > 0:
    clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/llava.clip")
    # remove these tensors
    for name in clip_tensors:
        del checkpoint[name]
    # added tokens should be removed to be able to convert Mistral models
    if os.path.exists(f"{args.model}/added_tokens.json"):
        with open(f"{args.model}/added_tokens.json", "w") as f:
            f.write("{}\n")
 torch.save(checkpoint, path)
 print("Done!")
--- a/examples/llava/llava-utils.h
+++ b/examples/llava/llava-utils.h
@ -17,7 +17,7 @@ inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
-        llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
@ -49,37 +49,39 @@ inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
    return eval_tokens(ctx_llama, tokens, 1, n_past);
 }
-inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past){
+inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
    std::string              str2     = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, true);
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
    return true;
 }
 // TODO: use common/sampling.h
 inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
-      // out of user input, sample next token
+    auto & sparams = params.sparams;
-    const float   temp      = params.sampling_params.temp;
+
-    const int32_t top_k     = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
+    // out of user input, sample next token
-    const float   top_p     = params.sampling_params.top_p;
+    const float   temp      = sparams.temp;
-    const float   tfs_z     = params.sampling_params.tfs_z;
+    const int32_t top_k     = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
-    const float   typical_p = params.sampling_params.typical_p;
+    const float   top_p     = sparams.top_p;
-      // const int32_t repeat_last_n   = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n;
+    const float   tfs_z     = sparams.tfs_z;
-      // const float   repeat_penalty  = params.sampling_params.repeat_penalty;
+    const float   typical_p = sparams.typical_p;
-      // const float   alpha_presence  = params.sampling_params.presence_penalty;
+    // const int32_t repeat_last_n   = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
-      // const float   alpha_frequency = params.sampling_params.frequency_penalty;
+    // const float   repeat_penalty  = sparams.repeat_penalty;
-    const int     mirostat     = params.sampling_params.mirostat;
+    // const float   alpha_presence  = sparams.presence_penalty;
-    const float   mirostat_tau = params.sampling_params.mirostat_tau;
+    // const float   alpha_frequency = sparams.frequency_penalty;
-    const float   mirostat_eta = params.sampling_params.mirostat_eta;
+    const int     mirostat     = sparams.mirostat;
-      // const bool    penalize_nl     = params.sampling_params.penalize_nl;
+    const float   mirostat_tau = sparams.mirostat_tau;
    const float   mirostat_eta = sparams.mirostat_eta;
    // const bool    penalize_nl     = sparams.penalize_nl;
    llama_token id = 0;
    {
        auto logits  = llama_get_logits(ctx_llama);
        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
-          // Apply params.logit_bias map
+        // Apply params.logit_bias map
-        for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) {
+        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
            logits[it->first] += it->second;
        }
@ -91,18 +93,18 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-          // TODO: Apply penalties
+        // TODO: Apply penalties
-          // float nl_logit = logits[llama_token_nl(ctx)];
+        // float nl_logit = logits[llama_token_nl(ctx)];
-          // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
+        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-          // llama_sample_repetition_penalty(ctx, &candidates_p,
+        // llama_sample_repetition_penalty(ctx, &candidates_p,
-          //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+        //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-          //      last_n_repeat, repeat_penalty);
+        //      last_n_repeat, repeat_penalty);
-          // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
+        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-          // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+        // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-          // last_n_repeat, alpha_frequency, alpha_presence);
+        // last_n_repeat, alpha_frequency, alpha_presence);
-          // if (!penalize_nl) {
+        // if (!penalize_nl) {
-          //     logits[llama_token_nl(ctx)] = nl_logit;
+        //     logits[llama_token_nl(ctx)] = nl_logit;
-          // }
+        // }
        if (temp <= 0) {
              // Greedy sampling
@ -135,7 +137,7 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
 inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
    int id = sample_id(ctx_llama, params);
    static std::string ret;
-    if (id == llama_token_eos(ctx_llama)) {
+    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
        ret = "</s>";
    } else {
        ret = llama_token_to_piece(ctx_llama, id);
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@ -79,7 +79,13 @@ int main(int argc, char ** argv) {
    llama_backend_init(params.numa);
-    llama_model_params model_params = llama_model_default_params();
+    llama_model_params model_params              = llama_model_default_params();
                       model_params.n_gpu_layers = params.n_gpu_layers;
                       model_params.main_gpu     = params.main_gpu;
                       model_params.tensor_split = params.tensor_split;
                       model_params.use_mmap     = params.use_mmap;
                       model_params.use_mlock    = params.use_mlock;
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@ -91,6 +97,7 @@ int main(int argc, char ** argv) {
    ctx_params.n_ctx           = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    ctx_params.seed            = params.seed;
    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
@ -100,7 +107,8 @@ int main(int argc, char ** argv) {
    }
    // make sure that the correct mmproj was used, i.e., compare apples to apples
-    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+    const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
    if (n_img_embd != n_llama_embd) {
        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);
@ -119,14 +127,14 @@ int main(int argc, char ** argv) {
    const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-    // GG: are we sure that the should be a trailing whitespace at the end of this string?
+    eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true);
    eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant.  The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params.n_batch, &n_past);
    eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
-    eval_string(ctx_llama, params.prompt.c_str(), params.n_batch, &n_past);
+    eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false);
    eval_string(ctx_llama, "\nASSISTANT:",        params.n_batch, &n_past);
    // generate the response
    printf("\n");
    printf("prompt: '%s'\n", params.prompt.c_str());
    printf("\n");
    for (int i = 0; i < max_tgt_len; i++) {
--- a/examples/main-cmake-pkg/CMakeLists.txt
+++ b/examples/main-cmake-pkg/CMakeLists.txt
@ -16,6 +16,8 @@ add_library(common OBJECT
    ${_common_path}/console.cpp
    ${_common_path}/grammar-parser.h
    ${_common_path}/grammar-parser.cpp
    ${_common_path}/sampling.h
    ${_common_path}/sampling.cpp
    )
 # WARNING: because build-info.h is auto-generated, it will only
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -3,7 +3,6 @@
 #include "console.h"
 #include "llama.h"
 #include "build-info.h"
 #include "grammar-parser.h"
 #include <cassert>
 #include <cinttypes>
@ -109,7 +108,7 @@ int main(int argc, char ** argv) {
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }
-    llama_sampling_params & sparams = params.sampling_params;
+    llama_sampling_params & sparams = params.sparams;
 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("main", "log"));
@ -238,19 +237,19 @@ int main(int argc, char ** argv) {
    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
        LOG("tokenize the prompt\n");
-        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
    } else {
        LOG("use session tokens\n");
        embd_inp = session_tokens;
    }
    LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    // Should not run without any tokens
    if (embd_inp.empty()) {
-        embd_inp.push_back(llama_token_bos(ctx));
+        embd_inp.push_back(llama_token_bos(model));
-        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }
    // Tokenize negative prompt
@ -260,11 +259,11 @@ int main(int argc, char ** argv) {
    if (ctx_guidance) {
        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
+        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
+        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
        original_prompt_len = original_inp.size();
        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
@ -320,11 +319,11 @@ int main(int argc, char ** argv) {
    }
    // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false,   true);
-    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
+    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
-    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
+    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
    // in instruct mode, we inject a prefix and a suffix to each input by the user
    if (params.instruct) {
@ -383,6 +382,12 @@ int main(int argc, char ** argv) {
        if (!params.antiprompt.empty()) {
            for (const auto & antiprompt : params.antiprompt) {
                LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
                if (params.verbose_prompt) {
                    auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
                    for (int i = 0; i < (int) tmp.size(); i++) {
                        LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                    }
                }
            }
        }
@ -392,45 +397,27 @@ int main(int argc, char ** argv) {
        if (!params.input_prefix.empty()) {
            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
            if (params.verbose_prompt) {
                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
                for (int i = 0; i < (int) tmp.size(); i++) {
                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                }
            }
        }
        if (!params.input_suffix.empty()) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
-        }
+            if (params.verbose_prompt) {
-    }
+                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
-    LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
+                for (int i = 0; i < (int) tmp.size(); i++) {
-            sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
-    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+                }
    LOG_TEE("\n\n");
    struct llama_grammar * grammar = NULL;
    grammar_parser::parse_state parsed_grammar;
    if (!params.grammar.empty()) {
        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
        // will be empty (default) if there are parse errors
        if (parsed_grammar.rules.empty()) {
            return 1;
        }
        LOG_TEE("%s: grammar:\n", __func__);
        grammar_parser::print_grammar(stderr, parsed_grammar);
        LOG_TEE("\n");
        {
            auto it = sparams.logit_bias.find(llama_token_eos(ctx));
            if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
                LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
            }
        }
        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
        grammar = llama_grammar_init(
            grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
    }
-
+    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
-    // TODO: replace with ring-buffer
+    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
-    std::vector<llama_token> last_tokens(n_ctx);
+    LOG_TEE("\n\n");
    std::fill(last_tokens.begin(), last_tokens.end(), 0);
    if (params.interactive) {
        const char *control_message;
@ -471,11 +458,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;
-    const int n_vocab = llama_n_vocab(model);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
@ -522,7 +505,7 @@ int main(int argc, char ** argv) {
                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
-                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
+                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
                LOG("clear session path\n");
                path_session.clear();
@ -552,7 +535,6 @@ int main(int argc, char ** argv) {
            // evaluate tokens in batches
            // embd is typically prepared beforehand to fit within a batch, but not always
            if (ctx_guidance) {
                int input_size = 0;
                llama_token * input_buf = NULL;
@ -574,7 +556,7 @@ int main(int argc, char ** argv) {
                    input_buf  = embd_guidance.data();
                    input_size = embd_guidance.size();
-                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
+                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
                } else {
                    input_buf  = embd.data();
                    input_size = embd.size();
@ -597,7 +579,7 @@ int main(int argc, char ** argv) {
                    n_eval = params.n_batch;
                }
-                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
+                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
                    LOG_TEE("%s : failed to eval\n", __func__);
@ -627,12 +609,11 @@ int main(int argc, char ** argv) {
                LOG("saved session to %s\n", path_session.c_str());
            }
-            const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
+            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
-            last_tokens.erase(last_tokens.begin());
+            llama_sampling_accept(ctx_sampling, ctx, id, true);
            last_tokens.push_back(id);
-            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
+            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
            embd.push_back(id);
@ -648,8 +629,11 @@ int main(int argc, char ** argv) {
            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);
-                last_tokens.erase(last_tokens.begin());
+
-                last_tokens.push_back(embd_inp[n_consumed]);
+                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
@ -679,12 +663,10 @@ int main(int argc, char ** argv) {
        // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {
-            // check for reverse prompt
+            // check for reverse prompt in the last n_prev tokens
            if (!params.antiprompt.empty()) {
-                std::string last_output;
+                const int n_prev = 32;
-                for (auto id : last_tokens) {
+                const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
                    last_output += llama_token_to_piece(ctx, id);
                }
                is_antiprompt = false;
                // Check if each of the reverse prompts appears at the end of the output.
@ -711,13 +693,13 @@ int main(int argc, char ** argv) {
            }
            // deal with end of text token in interactive mode
-            if (last_tokens.back() == llama_token_eos(ctx)) {
+            if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
                LOG("found EOS token\n");
                if (params.interactive) {
                    if (!params.antiprompt.empty()) {
                        // tokenize and inject first reverse prompt
-                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
                        embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                        is_antiprompt = true;
                    }
@ -738,14 +720,13 @@ int main(int argc, char ** argv) {
                if (params.input_prefix_bos) {
                    LOG("adding input prefix BOS token\n");
-                    embd_inp.push_back(llama_token_bos(ctx));
+                    embd_inp.push_back(llama_token_bos(model));
                }
                std::string buffer;
                if (!params.input_prefix.empty()) {
                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
-                    buffer += params.input_prefix;
+                    printf("%s", params.input_prefix.c_str());
                    printf("%s", buffer.c_str());
                }
                // color user input only
@ -767,7 +748,6 @@ int main(int argc, char ** argv) {
                    // append input suffix if any
                    if (!params.input_suffix.empty()) {
                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        buffer += params.input_suffix;
                        printf("%s", params.input_suffix.c_str());
                    }
@ -781,11 +761,18 @@ int main(int argc, char ** argv) {
                        n_consumed = embd_inp.size();
                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
                    }
                    if (params.escape) {
                        process_escapes(buffer);
                    }
-                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
+                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
+                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
                    embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
                    embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
                    // instruct mode: insert response suffix
                    if (params.instruct) {
@ -810,22 +797,14 @@ int main(int argc, char ** argv) {
            if (n_past > 0) {
                if (is_interacting) {
-                    // reset grammar state if we're restarting generation
+                    llama_sampling_reset(ctx_sampling);
                    if (grammar != NULL) {
                        llama_grammar_free(grammar);
                        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
                        grammar = llama_grammar_init(
                            grammar_rules.data(), grammar_rules.size(),
                            parsed_grammar.symbol_ids.at("root"));
                    }
                }
                is_interacting = false;
            }
        }
        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
+        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
            LOG_TEE(" [end of text]\n");
            break;
        }
@ -850,9 +829,7 @@ int main(int argc, char ** argv) {
    llama_free(ctx);
    llama_free_model(model);
-    if (grammar != NULL) {
+    llama_sampling_free(ctx_sampling);
        llama_grammar_free(grammar);
    }
    llama_backend_free();
 #ifndef LOG_DISABLE_LOGS
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -51,6 +51,12 @@ static std::vector<std::string> k_prompts = {
 };
 struct client {
    ~client() {
        if (ctx_sampling) {
            llama_sampling_free(ctx_sampling);
        }
    }
    int32_t id = 0;
    llama_seq_id seq_id = -1;
@ -68,7 +74,7 @@ struct client {
    std::string prompt;
    std::string response;
-    std::vector<llama_token> tokens_prev;
+    struct llama_sampling_context * ctx_sampling = nullptr;
 };
 static void print_date_time() {
@ -125,8 +131,6 @@ int main(int argc, char ** argv) {
    params.logits_all = true;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
    // load the prompts from an external file if there are any
    if (params.prompt.empty()) {
        printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
@ -147,20 +151,15 @@ int main(int argc, char ** argv) {
    fprintf(stderr, "\n\n");
    fflush(stderr);
-    const int n_ctx   = llama_n_ctx(ctx);
+    const int n_ctx = llama_n_ctx(ctx);
    const int n_vocab = llama_n_vocab(model);
    std::vector<client> clients(n_clients);
    for (size_t i = 0; i < clients.size(); ++i) {
        auto & client = clients[i];
        client.id = i;
-        client.tokens_prev.resize(std::max(256, params.n_predict));
+        client.ctx_sampling = llama_sampling_init(params.sparams);
        std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
    }
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    std::vector<llama_token> tokens_system;
    tokens_system = ::llama_tokenize(ctx, k_system, true);
    const int32_t n_tokens_system = tokens_system.size();
@ -169,7 +168,7 @@ int main(int argc, char ** argv) {
    // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
    // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
-    llama_batch batch = llama_batch_init(n_ctx, 0);
+    llama_batch batch = llama_batch_init(n_ctx, 0, 1);
    int32_t n_total_prompt = 0;
    int32_t n_total_gen    = 0;
@ -184,13 +183,8 @@ int main(int argc, char ** argv) {
    {
        LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
-        batch.n_tokens = n_tokens_system;
+        for (int32_t i = 0; i < n_tokens_system; ++i) {
-
+            llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
        for (int32_t i = 0; i < batch.n_tokens; ++i) {
            batch.token[i]  = tokens_system[i];
            batch.pos[i]    = i;
            batch.seq_id[i] = 0;
            batch.logits[i] = false;
        }
        if (llama_decode(ctx, batch) != 0) {
@ -209,7 +203,7 @@ int main(int argc, char ** argv) {
    LOG_TEE("Processing requests ...\n\n");
    while (true) {
-        batch.n_tokens = 0;
+        llama_batch_clear(batch);
        // decode any currently ongoing sequences
        for (auto & client : clients) {
@ -217,15 +211,11 @@ int main(int argc, char ** argv) {
                continue;
            }
            batch.token [batch.n_tokens] = client.sampled;
            batch.pos   [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded;
            batch.seq_id[batch.n_tokens] = client.id;
            batch.logits[batch.n_tokens] = true;
            client.n_decoded += 1;
            client.i_batch = batch.n_tokens;
-            batch.n_tokens += 1;
+            llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true);
            client.n_decoded += 1;
        }
        if (batch.n_tokens == 0) {
@ -250,18 +240,14 @@ int main(int argc, char ** argv) {
                    client.prompt   = client.input + "\nAssistant:";
                    client.response = "";
-                    std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
+                    llama_sampling_reset(client.ctx_sampling);
                    // do not prepend BOS because we have a system prompt!
                    std::vector<llama_token> tokens_prompt;
                    tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
                    for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                        batch.token [batch.n_tokens] = tokens_prompt[i];
+                        llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
                        batch.pos   [batch.n_tokens] = i + n_tokens_system;
                        batch.seq_id[batch.n_tokens] = client.id;
                        batch.logits[batch.n_tokens] = false;
                        batch.n_tokens += 1;
                    }
                    // extract the logits only for the last token
@ -304,11 +290,12 @@ int main(int argc, char ** argv) {
            llama_batch batch_view = {
                n_tokens,
-                batch.token  + i,
+                batch.token    + i,
                nullptr,
-                batch.pos    + i,
+                batch.pos      + i,
-                batch.seq_id + i,
+                batch.n_seq_id + i,
-                batch.logits + i,
+                batch.seq_id   + i,
                batch.logits   + i,
                0, 0, 0, // unused
            };
@ -341,7 +328,9 @@ int main(int argc, char ** argv) {
                //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
-                const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);
+                const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
                llama_sampling_accept(client.ctx_sampling, ctx, id, true);
                if (client.n_decoded == 1) {
                    // start measuring generation time after the first token to make sure all concurrent clients
@ -349,11 +338,8 @@ int main(int argc, char ** argv) {
                    client.t_start_gen = ggml_time_us();
                }
                // remember which tokens were sampled - used for repetition penalties during sampling
                client.tokens_prev.erase(client.tokens_prev.begin());
                client.tokens_prev.push_back(id);
                const std::string token_str = llama_token_to_piece(ctx, id);
                client.response += token_str;
                client.sampled = id;
@ -361,7 +347,7 @@ int main(int argc, char ** argv) {
                //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
                if (client.n_decoded > 2 &&
-                        (id == llama_token_eos(ctx) ||
+                        (id == llama_token_eos(model) ||
                         (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
                         client.response.find("User:") != std::string::npos ||
                         client.response.find('\n') != std::string::npos)) {
@ -386,7 +372,7 @@ int main(int argc, char ** argv) {
                    n_total_prompt += client.n_prompt;
                    n_total_gen    += client.n_decoded;
-                    llama_sampling_context_reset(ctx_sampling, client.seq_id);
+
                    client.seq_id = -1;
                }
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -227,7 +227,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
            // add BOS token for the first batch of each chunk
            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(ctx);
+                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }
            const auto batch_logits = llama_get_logits(ctx);
@ -350,7 +350,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            // add BOS token for the first batch of each chunk
            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(ctx);
+                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -8,10 +8,7 @@
 int main(int argc, char ** argv) {
    gpt_params params;
-    llama_sampling_params & sparams = params.sampling_params;
+
    params.seed = 42;
    params.n_threads = 4;
    sparams.repeat_last_n = 64;
    params.prompt = "The quick brown fox";
    if (!gpt_params_parse(argc, argv, params)) {
@ -25,56 +22,49 @@ int main(int argc, char ** argv) {
    }
    auto n_past = 0;
-    auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
+
    std::string result0;
    std::string result1;
    // init
    llama_model * model;
    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (model == nullptr) {
+    if (model == nullptr || ctx == nullptr) {
-        return 1;
+        fprintf(stderr, "%s : failed to init\n", __func__);
    }
    if (ctx == nullptr) {
        llama_free_model(model);
        return 1;
    }
    // tokenize prompt
    auto tokens = llama_tokenize(ctx, params.prompt, true);
    auto n_prompt_tokens = tokens.size();
    if (n_prompt_tokens < 1) {
        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
        llama_free(ctx);
        llama_free_model(model);
        return 1;
    }
    // evaluate prompt
-    llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0));
+    llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
    n_past += tokens.size();
-    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
+    // save state (rng, logits, embedding and kv_cache) to file
    n_past += n_prompt_tokens;
    const size_t state_size = llama_get_state_size(ctx);
    uint8_t * state_mem = new uint8_t[state_size];
    // Save state (rng, logits, embedding and kv_cache) to file
    {
-        FILE *fp_write = fopen("dump_state.bin", "wb");
+        std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
-        llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+
-        fwrite(state_mem, 1, state_size, fp_write);
+        {
-        fclose(fp_write);
+            FILE *fp_write = fopen("dump_state.bin", "wb");
            llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
            fwrite(state_mem.data(), 1, state_mem.size(), fp_write);
            fclose(fp_write);
        }
    }
    // save state (last tokens)
    const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
    const auto n_past_saved = n_past;
    // first run
-    printf("\n%s", params.prompt.c_str());
+    printf("\nfirst run: %s", params.prompt.c_str());
    for (auto i = 0; i < params.n_predict; i++) {
        auto * logits = llama_get_logits(ctx);
        auto n_vocab = llama_n_vocab(model);
        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@ -83,9 +73,10 @@ int main(int argc, char ** argv) {
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        auto next_token = llama_sample_token(ctx, &candidates_p);
        auto next_token_str = llama_token_to_piece(ctx, next_token);
        last_n_tokens_data.push_back(next_token);
        printf("%s", next_token_str.c_str());
        result0 += next_token_str;
        if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx);
@ -103,32 +94,28 @@ int main(int argc, char ** argv) {
    // make new context
    auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
-    // Load state (rng, logits, embedding and kv_cache) from file
+    printf("\nsecond run: %s", params.prompt.c_str());
    {
        FILE *fp_read = fopen("dump_state.bin", "rb");
        if (state_size != llama_get_state_size(ctx2)) {
            fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
            llama_free(ctx2);
            llama_free_model(model);
            return 1;
        }
-        const size_t ret = fread(state_mem, 1, state_size, fp_read);
+    // load state (rng, logits, embedding and kv_cache) from file
-        if (ret != state_size) {
+    {
        std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
        FILE * fp_read = fopen("dump_state.bin", "rb");
        const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read);
        if (ret != state_mem.size()) {
            fprintf(stderr, "\n%s : failed to read state\n", __func__);
            llama_free(ctx2);
            llama_free_model(model);
            return 1;
        }
-        llama_set_state_data(ctx2, state_mem);  // could also read directly from memory mapped file
+        llama_set_state_data(ctx2, state_mem.data());
        fclose(fp_read);
    }
    delete[] state_mem;
    // restore state (last tokens)
    last_n_tokens_data = last_n_tokens_data_saved;
    n_past = n_past_saved;
    // second run
@ -143,10 +130,11 @@ int main(int argc, char ** argv) {
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        auto next_token = llama_sample_token(ctx2, &candidates_p);
        auto next_token_str = llama_token_to_piece(ctx2, next_token);
        last_n_tokens_data.push_back(next_token);
        printf("%s", next_token_str.c_str());
-        if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+        result1 += next_token_str;
        if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx2);
            llama_free_model(model);
@ -155,10 +143,17 @@ int main(int argc, char ** argv) {
        n_past += 1;
    }
-    printf("\n\n");
+    printf("\n");
    llama_free(ctx2);
    llama_free_model(model);
    if (result0 != result1) {
        fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
        return 1;
    }
    fprintf(stderr, "\n%s : success\n", __func__);
    return 0;
 }
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -24,6 +24,10 @@ Command line options:
 -   `--port`: Set the port to listen. Default: `8080`.
 -   `--path`: path from which to serve static files (default examples/server/public)
 -   `--embedding`: Enable embedding extraction, Default: disabled.
 -   `-np N`, `--parallel N`: Set the number of slots for processing requests (default: 1)
 -   `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
 -   `-spf FNAME`, `--system-prompt-file FNAME`: Set a file to load a system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 -   `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 ## Build
@ -106,25 +110,25 @@ node index.js
 ## API Endpoints
-   **POST** `/completion`: Given a prompt, it returns the predicted completion.
+-   **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
    *Options:*
    `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. If the prompt is a string or an array with the first element given as a string, a `bos` token is inserted in the front like `main` does.
    `temperature`: Adjust the randomness of the generated text (default: 0.8).
    `top_k`: Limit the next token selection to the K most probable tokens (default: 40).
    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
-    `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
+    `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
-    `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
+    `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
-    By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
+    By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt.
    `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
    `prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluated. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.
    `stop`: Specify a JSON array of stopping strings.
    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
@ -158,6 +162,44 @@ node index.js
    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
    *Result JSON:*
    Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
    `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
    `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
    `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
    `model`: The path to the model loaded with `-m`
    `prompt`: The provided `prompt`
    `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
    `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
    `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
    `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
    `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
    `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
    `tokens_evaluated`: Number of tokens evaluated in total from the prompt
    `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
    `slot_id`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot (default: -1)
    `cache_prompt`: Save the prompt and generation to avoid reprocessing the entire prompt when part of it is unchanged (default: false)
    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 -   **POST** `/tokenize`: Tokenize a given text.
    *Options:*
@ -188,8 +230,32 @@ node index.js
    It also accepts all the options of `/completion` except `stream` and `prompt`.
 -   **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
 ## More examples
 ### Change system prompt on runtime
 To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
 `prompt`: Specify a context that you want all connecting clients to respect.
 `anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
 `assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
 ```json
 {
    "system_prompt": {
        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
        "anti_prompt": "User:",
        "assistant_name": "Assistant:"
    }
 }
 ```
 **NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
 ### Interactive mode
 Check the sample in [chat.mjs](chat.mjs).
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@ -8,6 +8,7 @@ import json
 app = Flask(__name__)
 slot_id = -1
 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
 parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
@ -77,7 +78,8 @@ def make_postData(body, chat=False, stream=False):
    if(is_present(body, "stop")): postData["stop"] += body["stop"]
    postData["n_keep"] = -1
    postData["stream"] = stream
-
+    postData["cache_prompt"] = True
    postData["slot_id"] = slot_id
    return postData
 def make_resData(data, chat=False, promptToken=[]):
@ -128,6 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
            }
        ]
    }
    slot_id = data["slot_id"]
    if (chat):
        if (start):
            resData["choices"][0]["delta"] =  {
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@ -7,6 +7,11 @@ const args = process.argv.slice(2);
 const grammarJsonSchemaFile = args.find(
    (_, index) => args[index - 1] === "--grammar-json-schema"
 );
 const no_cached_prompt = args.find(
    (_, index) => args[index - 1] === "--no-cache-prompt"
 ) ?? "false";
 const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
 // Example usage: function,arguments
@ -30,6 +35,9 @@ if (grammarFile) {
    grammar = readFileSync(grammarFile, 'utf-8')
 }
 // for cached prompt
 let slot_id = -1;
 const API_URL = 'http://127.0.0.1:8080'
 const chat = [
@ -76,6 +84,8 @@ async function chat_completion(question) {
            top_p: 0.9,
            n_keep: n_keep,
            n_predict: 256,
            cache_prompt: no_cached_prompt === "false",
            slot_id: slot_id,
            stop: ["\n### Human:"], // stop completion after generating this
            grammar,
            stream: true,
@ -92,6 +102,7 @@ async function chat_completion(question) {
        const t = Buffer.from(chunk).toString('utf8')
        if (t.startsWith('data: ')) {
            const message = JSON.parse(t.substring(6))
            slot_id = message.slot_id
            answer += message.content
            process.stdout.write(message.content)
            if (message.stop) {
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -125,6 +125,7 @@
      background-color: #222;
      color: #ddd;
    }
    code {
      font-family: monospace;
      padding: 0.1em 0.3em;
@ -141,7 +142,8 @@
      display: inline;
    }
-    header, footer {
+    header,
    footer {
      text-align: center;
    }
@ -163,6 +165,7 @@
      0% {
        background-position: 0%;
      }
      100% {
        background-position: 100%;
      }
@ -181,6 +184,7 @@
        --loading-color-1: #22222200;
        --loading-color-2: #222222ff;
      }
      .popover-content {
        background-color: black;
      }
@ -194,6 +198,8 @@
    import { llama } from '/completion.js';
    import { SchemaConverter } from '/json-schema-to-grammar.mjs';
    let selected_image = false;
    var slot_id = -1;
    const session = signal({
      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
@ -203,6 +209,7 @@
      type: "chat",  // "chat" | "completion"
      char: "Llama",
      user: "User",
      image_selected: ''
    })
    const params = signal({
@ -220,7 +227,9 @@
      mirostat_tau: 5, // target entropy
      mirostat_eta: 0.1, // learning rate
      grammar: '',
-      n_probs: 0, // no completion_probabilities
+      n_probs: 0, // no completion_probabilities,
      image_data: [],
      cache_prompt: true
    })
    /* START: Support for storing prompt templates and parameters in browser LocalStorage */
@ -270,6 +279,7 @@
      // saved templates were successfully imported.
      console.log('Processing saved templates and updating default template')
      params.value = { ...params.value, image_data: [] };
      //console.log(importedTemplates);
      savedUserTemplates.value = importedTemplates;
@ -294,7 +304,9 @@
    // Applies a saved user template `t` to the live session and params signals.
    // Image state is never restored from a template: the preview (`image_selected`)
    // and the request payload (`image_data`) are always cleared.
    function userTemplateApply(t) {
      // Write each signal once, so subscribers are notified a single time and never
      // observe the template's data combined with stale image fields.
      session.value = { ...t.data.session, image_selected: '' };
      params.value = { ...t.data.params, image_data: [] };
    }
    function userTemplateResetToDefaultAndApply() {
@ -385,20 +397,25 @@
        throw new Error("already running");
      }
      controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
        const data = chunk.data;
        if (data.stop) {
          while (
            currentMessages.length > 0 &&
            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
-            ) {
+          ) {
            currentMessages.pop();
          }
          transcriptUpdate([...history, [char, currentMessages]])
          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
        } else {
          currentMessages.push(data);
          slot_id = data.slot_id;
          if (selected_image && !data.multimodal) {
            alert("The server was not compiled for multimodal or the model projector can't be loaded.");
            return;
          }
          transcriptUpdate([...history, [char, currentMessages]])
        }
@ -419,7 +436,7 @@
      transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
-      const prompt = template(session.value.template, {
+      let prompt = template(session.value.template, {
        message: msg,
        history: session.value.transcript.flatMap(
          ([name, data]) =>
@ -434,9 +451,12 @@
            )
        ).join("\n"),
      });
-
+      if (selected_image) {
        prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
      }
      await runLlama(prompt, {
        ...params.value,
        slot_id: slot_id,
        stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
      }, "{{char}}");
    }
@ -446,10 +466,11 @@
        console.log('already running...');
        return;
      }
-      const {prompt} = session.value;
+      const { prompt } = session.value;
      transcriptUpdate([...session.value.transcript, ["", prompt]]);
      await runLlama(prompt, {
        ...params.value,
        slot_id: slot_id,
        stop: [],
      }, "");
    }
@ -467,6 +488,27 @@
      transcriptUpdate([]);
    }
    // Click handler for the "Upload Image" button: opens the hidden #fileInput
    // picker and, once a file is chosen, reads it as a data URL and stores it in
    // the session (used as the preview <img> src) and in params.image_data
    // (base64 payload, without the data-URL prefix).
    const uploadImage = (e) => {
      e.preventDefault();
      const fileInput = document.getElementById("fileInput");
      // Assign the handler instead of calling addEventListener on every click:
      // listeners would otherwise accumulate, and a single file selection would
      // start one FileReader per previous click.
      fileInput.onchange = (event) => {
        const selectedFile = event.target.files[0];
        if (!selectedFile) {
          return;
        }
        const reader = new FileReader();
        reader.onload = () => {
          const image_data = reader.result;
          session.value = { ...session.value, image_selected: image_data };
          params.value = {
            ...params.value, image_data: [
              // id 10 appears to pair with the "[img-10]" placeholder used in the chat prompt
              { data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
          }
          // Raise the flag only once the image data is actually available, so the
          // flag and params.image_data always change together.
          selected_image = true;
        };
        reader.readAsDataURL(selectedFile);
      };
      // Open the picker after the handler is in place.
      fileInput.click();
    }
    function MessageInput() {
      const message = useSignal("")
@ -497,6 +539,7 @@
          </div>
          <div class="right">
            <button type="submit" disabled=${generating.value}>Send</button>
            <button onclick=${uploadImage}>Upload Image</button>
            <button onclick=${stop} disabled=${!generating.value}>Stop</button>
            <button onclick=${reset}>Reset</button>
          </div>
@ -540,7 +583,7 @@
            data;
          message = html`<${Markdownish} text=${template(text)} />`
        }
-        if(user) {
+        if (user) {
          return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
        } else {
          return html`<p key=${index}>${message}</p>`
@ -549,6 +592,7 @@
      return html`
        <section id="chat" ref=${container}>
          <img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
          ${messages.flatMap(chatLine)}
        </section>`;
    };
@ -567,7 +611,7 @@
          const converter = new SchemaConverter(
            grammarJsonSchemaPropOrder.value
              .split(',')
-              .reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {})
+              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {})
          )
          converter.visit(schema, '')
          params.value = {
@ -579,7 +623,7 @@
        }
      }
-      const FloatField = ({label, max, min, name, step, value}) => {
+      const FloatField = ({ label, max, min, name, step, value }) => {
        return html`
          <div>
            <label for="${name}">${label}</label>
@ -589,7 +633,7 @@
        `
      };
-      const IntField = ({label, max, min, name, value}) => {
+      const IntField = ({ label, max, min, name, value }) => {
        return html`
          <div>
            <label for="${name}">${label}</label>
@ -672,7 +716,7 @@
            ${GrammarControl()}
          </fieldset>
      `
-    );
+      );
      const CompletionConfigForm = () => (
        html`
@ -694,20 +738,20 @@
          ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
          <fieldset class="two">
-            ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
+            ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
-            ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
+            ${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
-            ${FloatField({label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty})}
+            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
-            ${IntField({label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n})}
+            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
-            ${IntField({label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k})}
+            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
-            ${FloatField({label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p})}
+            ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
          </fieldset>
          <details>
            <summary>More options</summary>
            <fieldset class="two">
-              ${FloatField({label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z})}
+              ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
-              ${FloatField({label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p})}
+              ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
-              ${FloatField({label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty})}
+              ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
-              ${FloatField({label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty})}
+              ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
            </fieldset>
            <hr />
            <fieldset class="three">
@ -716,11 +760,11 @@
                <label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
                <label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
              </div>
-              ${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
+              ${FloatField({ label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })}
-              ${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
+              ${FloatField({ label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
            </fieldset>
            <fieldset>
-              ${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
+              ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
            </fieldset>
          </details>
        </form>
@ -759,20 +803,20 @@
        const popoverChildren = html`
          <div class="prob-set">
            ${probs.map((p, index) => {
-              return html`
+          return html`
                <div
                  key=${index}
                  title=${`prob: ${p.prob}`}
                  style=${{
-                    padding: '0.3em',
+              padding: '0.3em',
-                    backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
+              backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
-                  }}
+            }}
                >
                  <span>${p.tok_str}: </span>
                  <span>${Math.floor(p.prob * 100)}%</span>
                </div>
              `
-            })}
+        })}
          </div>
        `
@ -851,9 +895,9 @@
              ref=${popoverRef}
              class="popover-content"
              style=${{
-                top: position.value.top,
+            top: position.value.top,
-                left: position.value.left,
+            left: position.value.left,
-              }}
+          }}
            >
              ${props.popoverChildren}
            </div>
@ -952,8 +996,11 @@
 </head>
 <body>
-  <div id="container"></div>
+  <div id="container">
    <input type="file" id="fileInput" accept="image/*" style="display: none;">
  </div>
  <div id="portal"></div>
 </body>
 </html>
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
    // create a llama_batch with size 512
    // we use this object to submit token data for decoding
-    llama_batch batch = llama_batch_init(512, 0);
+    llama_batch batch = llama_batch_init(512, 0, 1);
    // evaluate the initial prompt
    batch.n_tokens = tokens_list.size();
@ -138,7 +138,7 @@ int main(int argc, char ** argv) {
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
            // is it an end of stream?
-            if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
+            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
                LOG_TEE("\n");
                break;
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -2,13 +2,25 @@
 #include "common.h"
 #include "llama.h"
 #include "grammar-parser.h"
 #include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 // State of one draft sequence (one branch of the speculation tree).
 struct seq_draft {
    bool active   = false; // sequence holds drafted tokens that are compared against the sampled target tokens
    bool drafting = false; // sequence is still being extended by the draft model
    bool skip     = false; // set on a freshly split branch so it is not sampled again in the same draft step
    int i_batch_dft = 0; // index of this sequence's most recent token in the draft batch
    std::vector<int> i_batch_tgt; // index in the target batch for each entry of `tokens`
    std::vector<llama_token> tokens; // tokens drafted for this sequence
    // Sampling context of this sequence. main() creates it with llama_sampling_init()
    // and releases it with llama_sampling_free(); the struct itself never frees it.
    // Initialised to null so a default-initialized seq_draft never carries an
    // indeterminate pointer.
    struct llama_sampling_context * ctx_sampling = nullptr;
 };
 int main(int argc, char ** argv) {
    gpt_params params;
@ -21,6 +33,13 @@ int main(int argc, char ** argv) {
        return 1;
    }
    // max number of parallel drafting sequences (i.e. tree branches)
    const int n_seq_dft = params.n_parallel;
    // TODO: make this configurable
    const float p_accept = 0.80f;
    const float p_split  = 0.10f;
 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("speculative", "log"));
    LOG_TEE("Log start\n");
@ -77,8 +96,6 @@ int main(int argc, char ** argv) {
    const auto t_enc_end = ggml_time_us();
    // the 2 models should have the same vocab
    const int n_ctx   = llama_n_ctx(ctx_tgt);
    const int n_vocab = llama_n_vocab(model_tgt);
    //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
    // how many tokens to draft each time
@ -91,116 +108,128 @@ int main(int argc, char ** argv) {
    int n_past_tgt = inp.size();
    int n_past_dft = inp.size();
    std::vector<llama_token> drafted;
    std::vector<llama_token> last_tokens(n_ctx);
    std::fill(last_tokens.begin(), last_tokens.end(), 0);
    for (auto & id : inp) {
        last_tokens.erase(last_tokens.begin());
        last_tokens.push_back(id);
    }
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    // used to determine end of generation
    bool has_eos = false;
-    // grammar stuff
+    // target model sampling context
-    struct llama_grammar * grammar_dft = NULL;
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
    struct llama_grammar * grammar_tgt = NULL;
-    grammar_parser::parse_state parsed_grammar;
+    // draft sequence data
    std::vector<seq_draft> drafts(n_seq_dft);
-    // if requested - load the grammar, error checking is omitted for brevity
+    params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
-    if (!params.grammar.empty()) {
+    params.sparams.temp = std::max(0.01f, params.sparams.temp);
        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
        // will be empty (default) if there are parse errors
        if (parsed_grammar.rules.empty()) {
            return 1;
        }
-        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+    for (int s = 0; s < n_seq_dft; ++s) {
-        grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+        drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
    }
-    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt);
+    llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
    const auto t_dec_start = ggml_time_us();
-    while (true) {
+    // sample from the last token of the prompt
-        LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
+    drafts[0].i_batch_tgt.resize(1);
    drafts[0].i_batch_tgt[0] = 0;
-        int i_dft = 0;
+    while (true) {
        // print current draft sequences
        for (int s = 0; s < n_seq_dft; ++s) {
            if (!drafts[s].active) {
                continue;
            }
            const auto & tokens = drafts[s].tokens;
            LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
        }
        int i_dft  = 0;
        int s_keep = 0;
        while (true) {
            LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
            // sample from the target model
-            llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft);
+            llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
-            // remember which tokens were sampled - used for repetition penalties during sampling
+            llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
            last_tokens.erase(last_tokens.begin());
            last_tokens.push_back(id);
-            //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));
+            //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
            const std::string token_str = llama_token_to_piece(ctx_tgt, id);
            printf("%s", token_str.c_str());
            fflush(stdout);
-            if (id == llama_token_eos(ctx_tgt)) {
+            if (id == llama_token_eos(model_tgt)) {
                has_eos = true;
            }
            ++n_predict;
-            // check if the draft matches the target
+            // check if the target token matches any of the drafts
            if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
                LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
                ++n_accept;
                ++n_past_tgt;
                ++n_past_dft;
                ++i_dft;
                continue;
            }
            // the drafted token was rejected or we are out of drafted tokens
            if (i_dft < (int) drafted.size()) {
                LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
                        i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
            } else {
                LOG("out of drafted tokens\n");
            }
            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
            llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
            ++n_past_dft;
            // heuristic for n_draft
            {
-                const int  n_draft_cur  = (int) drafted.size();
+                bool matches = false;
                const bool all_accepted = i_dft == n_draft_cur;
-                LOG("n_draft      = %d\n", n_draft);
+                for (int s = 0; s < n_seq_dft; ++s) {
-                LOG("n_draft_cur  = %d\n", n_draft_cur);
+                    if (!drafts[s].active) {
-                LOG("i_dft        = %d\n", i_dft);
+                        continue;
-                LOG("all_accepted = %d\n", all_accepted);
+                    }
-                if (all_accepted && n_draft == n_draft_cur) {
+                    if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) {
-                    LOG(" - max drafted tokens accepted - n_draft += 8\n");
+                        LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str());
-                    n_draft = std::min(30, n_draft + 8);
+
-                } else if (all_accepted) {
+                        s_keep = s;
-                    LOG(" - partially drafted tokens accepted - no change\n");
+                        matches = true;
-                } else {
+                    } else {
-                    LOG(" - drafted token rejected - n_draft -= 1\n");
+                        drafts[s].active = false;
-                    n_draft = std::max(2, n_draft - 1);
+                    }
                }
                if (matches) {
                    ++n_accept;
                    ++n_past_tgt;
                    ++n_past_dft;
                    ++i_dft;
                    continue;
                }
            }
-            drafted.clear();
+            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
-            drafted.push_back(id);
+
            // TODO: simplify
            {
                LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
                llama_kv_cache_seq_keep(ctx_dft, s_keep);
                llama_kv_cache_seq_cp  (ctx_dft, s_keep, 0, -1, -1);
                llama_kv_cache_seq_keep(ctx_dft, 0);
                llama_kv_cache_seq_rm  (ctx_tgt, s_keep, n_past_tgt, -1);
                llama_kv_cache_seq_keep(ctx_tgt, s_keep);
                llama_kv_cache_seq_cp  (ctx_tgt, s_keep, 0, -1, -1);
                llama_kv_cache_seq_keep(ctx_tgt, 0);
            }
            for (int s = 0; s < n_seq_dft; ++s) {
                drafts[s].active = false;
                drafts[s].tokens.clear();
                drafts[s].i_batch_tgt.clear();
            }
            // note: will be erased after the speculation phase
            drafts[0].tokens.push_back(id);
            drafts[0].i_batch_tgt.push_back(0);
            llama_batch_clear(batch_dft);
            llama_batch_add  (batch_dft, id, n_past_dft, { 0 }, true);
            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
            llama_decode         (ctx_dft, batch_dft);
            ++n_past_dft;
            break;
        }
@ -209,78 +238,151 @@ int main(int argc, char ** argv) {
            break;
        }
-        if (grammar_tgt) {
+        llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);
            if (grammar_dft) {
                llama_grammar_free(grammar_dft);
            }
            // Note: Hardcoded to sequence id 0, if this ever supports parallel generation
            //       that will need to change.
            auto it = ctx_sampling.sequence_contexts.find(0);
            GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
            // This is necessary because each sequence id in sequence_contexts
            // uses a copy of the original grammar.
            grammar_dft = llama_grammar_copy(it->second.grammar);
-            LOG("copied target grammar to draft grammar\n");
+        int n_seq_cur  = 1;
        }
        // sample n_draft tokens from the draft model using greedy decoding
        int n_past_cur = n_past_dft;
        for (int s = 0; s < n_seq_dft; ++s) {
            drafts[s].active   = false;
            drafts[s].drafting = false;
        }
        drafts[0].active      = true;
        drafts[0].drafting    = true;
        drafts[0].i_batch_dft = 0;
        llama_batch_clear(batch_tgt);
        llama_batch_add  (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
        // sample n_draft tokens from the draft model using tree-based sampling
        for (int i = 0; i < n_draft; ++i) {
-            float * logits = llama_get_logits(ctx_dft);
+            batch_dft.n_tokens = 0;
-            candidates.clear();
+            for (int s = 0; s < n_seq_dft; ++s) {
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                drafts[s].skip = false;
                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
            }
-            llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
+            for (int s = 0; s < n_seq_dft; ++s) {
                if (!drafts[s].drafting || drafts[s].skip) {
                    continue;
                }
-            if (grammar_dft != NULL) {
+                llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);
-                llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
+
                const auto & cur_p = drafts[s].ctx_sampling->cur;
                for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) {
                    LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
                            k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
                }
                if (cur_p[0].p < p_accept) {
                    LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
                    drafts[s].drafting = false;
                    continue;
                }
                std::vector<int> sa(1, s);
                // attempt to split the branch if the probability is high enough
                for (int f = 1; f < 8; ++f) {
                    if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
                        LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
                        llama_kv_cache_seq_rm(ctx_dft,    n_seq_cur, -1, -1);
                        llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
                        // all previous tokens from this branch are now also part of the new branch
                        for (int t = 0; t < batch_tgt.n_tokens; ++t) {
                            for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) {
                                if (batch_tgt.seq_id[t][p] == s) {
                                    batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur;
                                    batch_tgt.n_seq_id[t]++;
                                    break;
                                }
                            }
                        }
                        // copy the draft state
                        drafts[n_seq_cur].active   = true;
                        drafts[n_seq_cur].drafting = true;
                        drafts[n_seq_cur].skip     = true;
                        drafts[n_seq_cur].tokens      = drafts[s].tokens;
                        drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
                        drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
                        llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);
                        sa.push_back(n_seq_cur);
                        n_seq_cur++;
                    } else {
                        break;
                    }
                }
                // add drafted token for each sequence
                for (int is = 0; is < (int) sa.size(); ++is) {
                    const llama_token id = cur_p[is].id;
                    const int s = sa[is];
                    llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
                    drafts[s].tokens.push_back(id);
                    // add unique drafted tokens to the target batch
                    drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
                    llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
                    // add the token to the batch for batched decoding with the draft model
                    drafts[s].i_batch_dft = batch_dft.n_tokens;
                    llama_batch_add(batch_dft, id, n_past_cur, { s }, true);
                    if (batch_tgt.n_tokens > n_draft) {
                        drafts[s].drafting = false;
                    }
                }
            }
-            // computes softmax and sorts the candidates
+            // no sequence is drafting anymore
-            llama_sample_softmax(ctx_dft, &cur_p);
+            if (batch_dft.n_tokens == 0) {
            for (int i = 0; i < 3; ++i) {
                LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str());
            }
            // TODO: better logic?
            if (cur_p.data[0].p < 2*cur_p.data[1].p) {
                LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
                break;
            }
-            // drafted token
+            // evaluate the drafted tokens on the draft model
-            const llama_token id = cur_p.data[0].id;
+            llama_decode(ctx_dft, batch_dft);
-
+            ++n_past_cur;
            drafted.push_back(id);
            ++n_drafted;
-            // no need to evaluate the last drafted token, since we won't use the result
+            if (batch_tgt.n_tokens > n_draft) {
            if (i == n_draft - 1) {
                break;
            }
            // evaluate the drafted token on the draft model
            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1);
            llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
            ++n_past_cur;
            if (grammar_dft != NULL) {
                llama_grammar_accept_token(ctx_dft, grammar_dft, id);
            }
        }
        // evaluate the target model on the drafted tokens
-        llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1);
+        {
-        llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
+            llama_kv_cache_seq_keep(ctx_tgt, 0);
-        ++n_past_tgt;
+            for (int s = 1; s < n_seq_dft; ++s) {
                llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
            }
-        // the first token is always proposed by the traget model before the speculation loop
+            //LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
-        drafted.erase(drafted.begin());
+            llama_decode(ctx_tgt, batch_tgt);
            ++n_past_tgt;
        }
        // the first token is always proposed by the target model before the speculation loop so we erase it here
        for (int s = 0; s < n_seq_dft; ++s) {
            if (!drafts[s].active) {
                continue;
            }
            drafts[s].tokens.erase(drafts[s].tokens.begin());
        }
    }
    auto t_dec_end = ggml_time_us();
@ -288,9 +390,8 @@ int main(int argc, char ** argv) {
    LOG_TEE("\n\n");
    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
    // TODO: make sure these numbers are computed correctly
    LOG_TEE("\n");
    LOG_TEE("n_draft   = %d\n", n_draft);
    LOG_TEE("n_predict = %d\n", n_predict);
@ -304,16 +405,19 @@ int main(int argc, char ** argv) {
    LOG_TEE("\ntarget:\n");
    llama_print_timings(ctx_tgt);
    llama_sampling_free(ctx_sampling);
    for (int s = 0; s < n_seq_dft; ++s) {
        llama_sampling_free(drafts[s].ctx_sampling);
    }
    llama_batch_free(batch_dft);
    llama_free(ctx_tgt);
    llama_free_model(model_tgt);
    llama_free(ctx_dft);
    llama_free_model(model_dft);
    if (grammar_dft != NULL) {
        llama_grammar_free(grammar_dft);
        llama_grammar_free(grammar_tgt);
    }
    llama_backend_free();
    fprintf(stderr, "\n\n");
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -253,13 +253,14 @@ static void init_model(struct my_llama_model * model) {
    set_param_model(model);
    // measure data size
-    struct ggml_allocr * alloc = NULL;
+    size_t size = 0;
-    alloc = ggml_allocr_new_measure(tensor_alignment);
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-    alloc_model(alloc, model);
+        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
    }
    // allocate data
-    model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
+    struct ggml_allocr * alloc = NULL;
-    ggml_allocr_free(alloc);
+    model->data.resize(size + tensor_alignment);
    alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
    alloc_model(alloc, model);
    ggml_allocr_free(alloc);
@ -1094,11 +1095,9 @@ int main(int argc, char ** argv) {
    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);
    // measure required memory for input tensors
-    alloc = ggml_allocr_new_measure(tensor_alignment);
+    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
-    ggml_allocr_alloc(alloc, tokens_input);
+                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
-    ggml_allocr_alloc(alloc, target_probs);
+                            tensor_alignment;
    size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
    ggml_allocr_free(alloc);
    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
    // allocate input tensors
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -29,6 +29,8 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
 #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
 #define cublasHandle_t hipblasHandle_t
 #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetStream hipblasSetStream
@ -4326,13 +4328,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
    const half * x = (const half *) vx;
-    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row_x     = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+    const int channel   = blockDim.z*blockIdx.z + threadIdx.z;
    const int channel_x = channel / channel_x_divisor;
-    const int nrows_y = ncols_x;
+    const int nrows_y   = ncols_x;
    const int nrows_dst = nrows_x;
-    const int row_dst = row_x;
+    const int row_dst   = row_x;
    const int idst = channel*nrows_dst + row_dst;
@ -4345,13 +4347,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
            break;
        }
        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
        const float xi = __half2float(x[ix]);
        const int row_y = col_x;
        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
        const int iy = channel*nrows_y + row_y;
        const float xi = __half2float(x[ix]);
        tmp += xi * y[iy];
    }
@ -5662,10 +5664,10 @@ void ggml_init_cublas() {
        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
        int64_t total_vram = 0;
        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-        for (int64_t id = 0; id < g_device_count; ++id) {
+        for (int id = 0; id < g_device_count; ++id) {
            cudaDeviceProp prop;
            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-            fprintf(stderr, "  Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+            fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
            g_tensor_split[id] = total_vram;
            total_vram += prop.totalGlobalMem;
@ -5675,15 +5677,15 @@ void ggml_init_cublas() {
            g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        }
-        for (int64_t id = 0; id < g_device_count; ++id) {
+        for (int id = 0; id < g_device_count; ++id) {
            g_tensor_split[id] /= total_vram;
        }
-        for (int64_t id = 0; id < g_device_count; ++id) {
+        for (int id = 0; id < g_device_count; ++id) {
            CUDA_CHECK(ggml_cuda_set_device(id));
            // create cuda streams
-            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            for (int is = 0; is < MAX_STREAMS; ++is) {
                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
            }
@ -6252,16 +6254,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, const cudaStream_t & stream) {
-    GGML_ASSERT(src0_dd_i != nullptr);
+    GGML_ASSERT(src0_dd_i  != nullptr);
    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i != nullptr);
+    GGML_ASSERT(dst_dd_i   != nullptr);
    const int64_t ne00 = src0->ne[0];
    const int64_t ne10 = src1->ne[0];
    const int64_t ne0 = dst->ne[0];
    const int64_t row_diff = row_high - row_low;
    int id;
@ -7013,7 +7014,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
 }
 static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
-    GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
+    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));
    GGML_ASSERT(!ggml_is_permuted(src0));
    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
@ -7023,11 +7025,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
    const int64_t ne12 = src1->ne[2];
    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2];
    const int64_t ne12 = src1->ne[2];
    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
@ -7046,6 +7048,159 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 // Batched matrix multiplication of an F16 src0 with an F32 src1 through cuBLAS, on the main device.
 //
 // src1 is first converted to F16 into a temporary pool buffer, the GEMMs run in F16
 // (CUBLAS_COMPUTE_16F) into a second temporary F16 buffer, and that result is finally
 // converted back to F32 into dst's device buffer.
 //
 // Dims 2 and 3 act as batch dims. src0 may be broadcast across them: ne12 must be a multiple
 // of ne02 and ne13 a multiple of ne03 (asserted below).
 //
 // Preconditions (asserted): neither src0 nor src1 is transposed, src0 is not a split tensor,
 // src0 is F16 and src1 is F32.
 // NOTE(review): the F32 -> F16 conversion of src1 walks ne1 elements linearly, so src1 is
 // presumably expected to be contiguous - confirm against the caller.
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));
    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
    const int64_t ne03 = src0->ne[3];
    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];
    const int64_t ne13 = src1->ne[3];
    const int64_t nb11 = src1->nb[1];
    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
    // total element counts, used to size the temporary F16 buffers
    const int64_t ne1 = ggml_nelements(src1);
    const int64_t ne  = ggml_nelements(dst);
    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
    // bind the cuBLAS handle of the current device to the main stream
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    void * src0_ddq = src0_extra->data_device[g_main_device];
    half * src0_as_f16 = (half *) src0_ddq;
    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
    // convert src1 to fp16
    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
    GGML_ASSERT(to_fp16_cuda != nullptr);
    size_t src1_as = 0;
    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
    // temporary fp16 buffer that receives the GEMM output
    size_t dst_as = 0;
    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);
    // broadcast factors
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;
    const half alpha_f16 = 1.0f;
    const half beta_f16  = 0.0f;
 #if 0
    // use cublasGemmEx
    {
        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                int i03 = i13 / r3;
                int i02 = i12 / r2;
                CUBLAS_CHECK(
                        cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                            ne01, ne11, ne10,
                            &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F, nb01/sizeof(half),
                                        (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
                            &beta_f16,  (      char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
                            CUBLAS_COMPUTE_16F,
                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
            }
        }
    }
 #else
    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
        // use cublasGemmStridedBatchedEx
        // the strides passed for src1 and dst are the f32 byte strides divided by sizeof(float)
        CUBLAS_CHECK(
        cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
                &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
                            (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
                &beta_f16,  (      char *)     dst_f16, CUDA_R_16F, ne01,                dst->nb[2]/sizeof(float), // strideC
                ne12*ne13,
                CUBLAS_COMPUTE_16F,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    } else {
        // use cublasGemmBatchedEx
        // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
        const int ne23 = ne12*ne13;
        // host-side table laid out as three consecutive groups of ne23 pointers: src0, src1, dst
        // TODO: avoid this alloc
        void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
        // fail loudly instead of writing through a null pointer if the host allocation failed
        GGML_ASSERT(ptrs != nullptr);
        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                // map the src1 batch index to the (possibly broadcast) src0 batch index
                int i03 = i13 / r3;
                int i02 = i12 / r2;
                // src1 and dst byte offsets are halved: nb[] are f32 byte strides, the buffers hold fp16
                ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3];
                ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
                ptrs[2*ne23 + i12 + i13*ne12] = (char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
            }
        }
        // allocate device memory for pointers
        void ** ptrs_as = nullptr;
        CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
        // TODO: this does not work for some reason -- not sure why?
        //size_t ptrs_s = 0;
        //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
        // copy pointers to device
        CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
        free(ptrs);
        CUBLAS_CHECK(
        cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
                &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
                            (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
                &beta_f16,  (      void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
                ne23,
                CUBLAS_COMPUTE_16F,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
        // free device memory for pointers
        CUDA_CHECK(cudaFree(ptrs_as));
        //ggml_cuda_pool_free(ptrs_as, ptrs_s);
    }
 #endif
    // convert the fp16 result back to fp32 into dst's device buffer
    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
    // return the temporary fp16 buffers to the pool
    ggml_cuda_pool_free(src1_as_f16, src1_as);
    ggml_cuda_pool_free(dst_f16, dst_as);
 }
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
        src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
@ -7058,10 +7213,23 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
        }
    }
    // debug helpers
    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        // KQ single-batch
        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
        // KQV single-batch
        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
        // KQ + KQV multi-batch
        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
    } else if (src0->type == GGML_TYPE_F32) {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -62,6 +62,7 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul);
    GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
    GGML_METAL_DECL_KERNEL(scale);
    GGML_METAL_DECL_KERNEL(scale_4);
    GGML_METAL_DECL_KERNEL(silu);
    GGML_METAL_DECL_KERNEL(relu);
    GGML_METAL_DECL_KERNEL(gelu);
@ -73,6 +74,8 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(get_rows_f16);
    GGML_METAL_DECL_KERNEL(get_rows_q4_0);
    GGML_METAL_DECL_KERNEL(get_rows_q4_1);
    GGML_METAL_DECL_KERNEL(get_rows_q5_0);
    GGML_METAL_DECL_KERNEL(get_rows_q5_1);
    GGML_METAL_DECL_KERNEL(get_rows_q8_0);
    GGML_METAL_DECL_KERNEL(get_rows_q2_K);
    GGML_METAL_DECL_KERNEL(get_rows_q3_K);
@ -87,6 +90,8 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
    GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q5_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q5_1_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
@ -97,6 +102,8 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q5_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q5_1_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
@ -243,6 +250,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul);
        GGML_METAL_ADD_KERNEL(mul_row);
        GGML_METAL_ADD_KERNEL(scale);
        GGML_METAL_ADD_KERNEL(scale_4);
        GGML_METAL_ADD_KERNEL(silu);
        GGML_METAL_ADD_KERNEL(relu);
        GGML_METAL_ADD_KERNEL(gelu);
@ -254,6 +262,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(get_rows_f16);
        GGML_METAL_ADD_KERNEL(get_rows_q4_0);
        GGML_METAL_ADD_KERNEL(get_rows_q4_1);
        GGML_METAL_ADD_KERNEL(get_rows_q5_0);
        GGML_METAL_ADD_KERNEL(get_rows_q5_1);
        GGML_METAL_ADD_KERNEL(get_rows_q8_0);
        GGML_METAL_ADD_KERNEL(get_rows_q2_K);
        GGML_METAL_ADD_KERNEL(get_rows_q3_K);
@ -268,6 +278,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
        GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q5_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q5_1_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
@ -278,8 +290,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
            GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q5_0_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q5_1_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
@ -335,6 +349,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(mul);
    GGML_METAL_DEL_KERNEL(mul_row);
    GGML_METAL_DEL_KERNEL(scale);
    GGML_METAL_DEL_KERNEL(scale_4);
    GGML_METAL_DEL_KERNEL(silu);
    GGML_METAL_DEL_KERNEL(relu);
    GGML_METAL_DEL_KERNEL(gelu);
@ -346,6 +361,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(get_rows_f16);
    GGML_METAL_DEL_KERNEL(get_rows_q4_0);
    GGML_METAL_DEL_KERNEL(get_rows_q4_1);
    GGML_METAL_DEL_KERNEL(get_rows_q5_0);
    GGML_METAL_DEL_KERNEL(get_rows_q5_1);
    GGML_METAL_DEL_KERNEL(get_rows_q8_0);
    GGML_METAL_DEL_KERNEL(get_rows_q2_K);
    GGML_METAL_DEL_KERNEL(get_rows_q3_K);
@ -360,6 +377,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
    GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q5_0_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q5_1_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
@ -370,8 +389,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
        GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q5_0_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q5_1_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
@ -905,15 +926,20 @@ void ggml_metal_graph_compute(
                            const float scale = *(const float *) src1->data;
-                            [encoder setComputePipelineState:ctx->pipeline_scale];
+                            int64_t n = ggml_nelements(dst);
                            if (n % 4 == 0) {
                                n /= 4;
                                [encoder setComputePipelineState:ctx->pipeline_scale_4];
                            } else {
                                [encoder setComputePipelineState:ctx->pipeline_scale];
                            }
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                            [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
-                            const int64_t n = ggml_nelements(dst);
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                            GGML_ASSERT(n % 4 == 0);
                            [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_UNARY:
                        switch (ggml_get_unary_op(gf->nodes[i])) {
@ -1052,6 +1078,8 @@ void ggml_metal_graph_compute(
                                    case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32];  break;
                                    case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
                                    case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
                                    case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_0_f32]; break;
                                    case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_1_f32]; break;
                                    case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break;
                                    case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
                                    case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
@ -1121,6 +1149,24 @@ void ggml_metal_graph_compute(
                                            nth1 = 8;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
                                        } break;
                                    case GGML_TYPE_Q5_0:
                                        {
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);
                                            nth0 = 8;
                                            nth1 = 8;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32];
                                        } break;
                                    case GGML_TYPE_Q5_1:
                                        {
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);
                                            nth0 = 8;
                                            nth1 = 8;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32];
                                        } break;
                                    case GGML_TYPE_Q8_0:
                                        {
                                            GGML_ASSERT(ne02 == 1);
@ -1201,7 +1247,8 @@ void ggml_metal_graph_compute(
                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
                                [encoder setBytes:&gqa  length:sizeof(gqa)  atIndex:17];
-                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
+                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
                                    src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
                                    src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
@ -1233,6 +1280,8 @@ void ggml_metal_graph_compute(
                                case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16];  break;
                                case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                                case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
                                case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_0]; break;
                                case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_1]; break;
                                case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break;
                                case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
                                case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -18,6 +18,21 @@ typedef struct {
    uint8_t qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;
 // q5_0 block: QK5_0 quants that share a single half-precision delta.
 // Each quant is stored as a low nibble in qs plus one extra bit in qh.
 #define QK5_0 32
 typedef struct {
    half d;                // delta
    uint8_t qh[4];         // 5-th bit of quants (4 bytes = one bit for each of the QK5_0 quants)
    uint8_t qs[QK5_0 / 2]; // nibbles / quants (QK5_0/2 bytes, i.e. two nibbles per byte)
 } block_q5_0;
 // q5_1 block: QK5_1 quants that share a half-precision delta and a half-precision min.
 // Each quant is stored as a low nibble in qs plus one extra bit in qh.
 #define QK5_1 32
 typedef struct {
    half d;                 // delta
    half m;                 // min
    uint8_t qh[4];          // 5-th bit of quants (4 bytes = one bit for each of the QK5_1 quants)
    uint8_t qs[QK5_1 / 2];  // nibbles / quants (QK5_1/2 bytes, i.e. two nibbles per byte)
 } block_q5_1;
 #define QK8_0 32
 typedef struct {
    half    d;         // delta
@ -110,9 +125,17 @@ kernel void kernel_mul_row(
 }
 // Scalar scale kernel: each thread multiplies the single float at its grid position
 // by the uniform factor `scale` and stores the product at the same index of dst.
 kernel void kernel_scale(
        device const float * src0,
        device       float * dst,
        constant     float & scale,
        uint tpig[[thread_position_in_grid]]) {
    const float value = src0[tpig];

    dst[tpig] = value * scale;
 }
 kernel void kernel_scale_4(
        device const float4 * src0,
        device       float4 * dst,
-        constant     float & scale,
+        constant     float  & scale,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] * scale;
 }
@ -399,8 +422,11 @@ kernel void kernel_rms_norm(
 // that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
 inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
    float d = qb_curr->d;
    float2 acc = 0.f;
    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
    for (int i = 0; i < 8; i+=2) {
        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
                + yl[i + 1] * (qs[i / 2] & 0x0F00);
@ -417,8 +443,11 @@ inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thre
 inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
    float d = qb_curr->d;
    float m = qb_curr->m;
-    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
+
    float2 acc = 0.f;
    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
    for (int i = 0; i < 8; i+=2) {
        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
                + yl[i + 1] * (qs[i / 2] & 0x0F00);
@ -428,6 +457,49 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
    return d * (acc[0] + acc[1]) + sumy * m;
 }
 // Inner product of half a q5_0 block with 16 floats (yl); sumy is SUM(yl[i]).
 // il selects where the q5 quants begin (0 or QK5_0/4).
 // The yl values are expected to be pre-multiplied by the scale factor that matches
 // the bit shifts skipped here (1, 1/16, 1/256, 1/4096).
 inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) {
    const float delta = qb_curr->d;

    // low nibbles are read as 16-bit words starting 3 words into the block;
    // the per-quant high bits are read as one 32-bit word
    device const uint16_t * nibbles = ((device const uint16_t *)qb_curr + 3 + il/2);
           const uint32_t   hibits  = *((device const uint32_t *)qb_curr->qh);

    float2 partial = 0.f;

    for (int k = 0; k < 8; k += 2) {
        const uint16_t packed = nibbles[k / 2];

        // each quant keeps its nibble in place and gets its high bit placed directly above it
        const uint32_t q0 = (packed & 0x000F) | ((hibits >> (k+0+il        ) << 4 ) & 0x00010);
        const uint32_t q1 = (packed & 0x0F00) | ((hibits >> (k+1+il        ) << 12) & 0x01000);
        const uint32_t q8 = (packed & 0x00F0) | ((hibits >> (k+0+il+QK5_0/2) << 8 ) & 0x00100);
        const uint32_t q9 = (packed & 0xF000) | ((hibits >> (k+1+il+QK5_0/2) << 16) & 0x10000);

        partial[0] += yl[k + 0] * q0
                    + yl[k + 1] * q1;
        partial[1] += yl[k + 8] * q8
                    + yl[k + 9] * q9;
    }

    // the -16 offset of the quants is applied once through sumy
    return delta * (sumy * -16.f + partial[0] + partial[1]);
 }
 // Computes the inner product between half a q5_1 block and 16 floats (yl); sumy is SUM(yl[i]).
 // il indicates where the q5 quants begin (0 or QK5_1/4).
 // The yl values are assumed to be pre-multiplied by the scale factor that corresponds
 // to the bit shifts skipped here (1, 1/16, 1/256, 1/4096).
 inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thread float * yl, int il) {
    float d = qb_curr->d;
    float m = qb_curr->m;
    float2 acc = 0.f;
    // nibbles are read as 16-bit words starting 4 words into the block (past d, m and qh);
    // the per-quant high bits are read as one 32-bit word
    device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 4 + il/2);
           const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
    for (int i = 0; i < 8; i+=2) {
        // QK5_1 (not QK5_0) is the block size that belongs to this block type
        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_1/2) << 8 ) & 0x00100))
                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_1/2) << 16) & 0x10000));
    }
    // the block minimum is applied once through sumy
    return d * (acc[0] + acc[1]) + sumy * m;
 }
 // putting them in the kernel cause a significant performance penalty
 #define N_DST 4        // each SIMD group works on 4 rows
 #define N_SIMDGROUP 2  // number of SIMD groups in a thread group
@ -525,6 +597,43 @@ kernel void kernel_mul_mv_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 // Kernel entry point kernel_mul_mv_q5_0_f32.
 // Thin wrapper: every argument is forwarded unchanged to mul_vec_q_n_f32, instantiated
 // for block_q5_0 with the N_DST / N_SIMDGROUP / N_SIMDWIDTH macros.
 // ne01, ne02, ne10, ne12, ne0, ne1 and gqa are pinned to explicit buffer slots
 // (4, 5, 9, 11, 15, 16, 17); src0, src1, dst and ne00 carry no explicit [[buffer(n)]] here.
 kernel void kernel_mul_mv_q5_0_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01[[buffer(4)]],
        constant   int64_t & ne02[[buffer(5)]],
        constant   int64_t & ne10[[buffer(9)]],
        constant   int64_t & ne12[[buffer(11)]],
        constant   int64_t & ne0[[buffer(15)]],
        constant   int64_t & ne1[[buffer(16)]],
        constant   uint    & gqa[[buffer(17)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
    // all work happens in the shared template; this kernel only selects the block type
    mul_vec_q_n_f32<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 // Kernel entry point kernel_mul_mv_q5_1_f32.
 // Thin wrapper: every argument is forwarded unchanged to mul_vec_q_n_f32, instantiated
 // for block_q5_1 with the N_DST / N_SIMDGROUP / N_SIMDWIDTH macros.
 // ne01, ne02, ne10, ne12, ne0, ne1 and gqa are pinned to explicit buffer slots
 // (4, 5, 9, 11, 15, 16, 17); src0, src1, dst and ne00 carry no explicit [[buffer(n)]] here.
 kernel void kernel_mul_mv_q5_1_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01[[buffer(4)]],
        constant   int64_t & ne02[[buffer(5)]],
        constant   int64_t & ne10[[buffer(9)]],
        constant   int64_t & ne12[[buffer(11)]],
        constant   int64_t & ne0[[buffer(15)]],
        constant   int64_t & ne1[[buffer(16)]],
        constant   uint    & gqa[[buffer(17)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
    // all work happens in the shared template; this kernel only selects the block type
    mul_vec_q_n_f32<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 #define NB_Q8_0 8
 kernel void kernel_mul_mv_q8_0_f32(
@ -2149,6 +2258,62 @@ void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg
    }
 }
 // Dequantizes one half of a q5_0 block into the 16 entries of `reg`.
 //   xb  - source block
 //   il  - zero selects the low nibble of every byte and qh bits 0..15,
 //         non-zero selects the high nibble and qh bits 16..31
 //   reg - 4x4 destination; entry value is d * q + (-16 * d)
 template <typename type4x4>
 void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
    // 16-bit view of the block, starting 3 words past its beginning
    device const uint16_t * packed = ((device const uint16_t *)xb + 3);
    // qh is consumed as one 32-bit word
    const uint32_t high_bits = *((device const uint32_t *)xb->qh);

    const float scale = xb->d;
    // offset is formed in half precision first, then widened to float
    const float bias  = -16.h * xb->d;

    // per-half constants: which nibble to keep and where its extra bit sits in qh
    const ushort nib_mask  = il ? 0x00F0 : 0x000F;
    const int    nib_shift = il ? 4 : 0;
    const int    qh_base   = il ? 12 : 0;
    const int    qh_lift   = il ?  0 : 4;

    for (int k = 0; k < 8; k++) {
        const uint16_t word = packed[k];

        // move the wanted qh bit to position 4 (0x10) for each of the two bytes in `word`
        const uint8_t top_a = ((high_bits >> (qh_base + 2*k  )) << qh_lift) & 0x10;
        const uint8_t top_b = ((high_bits >> (qh_base + 2*k+1)) << qh_lift) & 0x10;

        // 4 bits from the nibble plus the extra bit on top
        const int32_t q_a = ((((word     ) & nib_mask) >> nib_shift) | top_a);
        const int32_t q_b = ((((word >> 8) & nib_mask) >> nib_shift) | top_b);

        // two consecutive entries per iteration, filling reg row by row
        const int row = k/2;
        const int col = 2*(k%2);
        reg[row][col+0] = scale * q_a + bias;
        reg[row][col+1] = scale * q_b + bias;
    }
 }
 // Dequantizes one half of a q5_1 block into the 16 entries of `reg`.
 //   xb  - source block
 //   il  - zero selects the low nibble of every byte and qh bits 0..15,
 //         non-zero selects the high nibble and qh bits 16..31
 //   reg - 4x4 destination; entry value is d * q + m
 template <typename type4x4>
 void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
    // 16-bit view of the block, starting 4 words past its beginning
    device const uint16_t * packed = ((device const uint16_t *)xb + 4);
    // qh is consumed as one 32-bit word
    const uint32_t high_bits = *((device const uint32_t *)xb->qh);

    const float scale = xb->d;
    const float bias  = xb->m;

    // per-half constants: which nibble to keep and where its extra bit sits in qh
    const ushort nib_mask  = il ? 0x00F0 : 0x000F;
    const int    nib_shift = il ? 4 : 0;
    const int    qh_base   = il ? 12 : 0;
    const int    qh_lift   = il ?  0 : 4;

    for (int k = 0; k < 8; k++) {
        const uint16_t word = packed[k];

        // move the wanted qh bit to position 4 (0x10) for each of the two bytes in `word`
        const uint8_t top_a = ((high_bits >> (qh_base + 2*k  )) << qh_lift) & 0x10;
        const uint8_t top_b = ((high_bits >> (qh_base + 2*k+1)) << qh_lift) & 0x10;

        // 4 bits from the nibble plus the extra bit on top
        const int32_t q_a = ((((word     ) & nib_mask) >> nib_shift) | top_a);
        const int32_t q_b = ((((word >> 8) & nib_mask) >> nib_shift) | top_b);

        // two consecutive entries per iteration, filling reg row by row
        const int row = k/2;
        const int col = 2*(k%2);
        reg[row][col+0] = scale * q_a + bias;
        reg[row][col+1] = scale * q_b + bias;
    }
 }
 template <typename type4x4>
 void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
    device const int8_t * qs = ((device const int8_t *)xb->qs);
@ -2490,6 +2655,8 @@ template [[host_name("kernel_get_rows_f32")]]  kernel get_rows_t kernel_get_rows
 template [[host_name("kernel_get_rows_f16")]]  kernel get_rows_t kernel_get_rows<half4x4,    1, dequantize_f16>;
 template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
 template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
 template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_t kernel_get_rows<block_q5_0, 2, dequantize_q5_0>;
 template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_t kernel_get_rows<block_q5_1, 2, dequantize_q5_1>;
 template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows<block_q8_0, 2, dequantize_q8_0>;
 template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows<block_q2_K, QK_NL, dequantize_q2_K>;
 template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows<block_q3_K, QK_NL, dequantize_q3_K>;
@ -2518,6 +2685,8 @@ template [[host_name("kernel_mul_mm_f32_f32")]]  kernel mat_mm_t kernel_mul_mm<f
 template [[host_name("kernel_mul_mm_f16_f32")]]  kernel mat_mm_t kernel_mul_mm<half4x4,    1,     dequantize_f16>;
 template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2,     dequantize_q4_0>;
 template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2,     dequantize_q4_1>;
 template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_0, 2,     dequantize_q5_0>;
 template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_1, 2,     dequantize_q5_1>;
 template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2,     dequantize_q8_0>;
 template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
 template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@ -1395,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
    const int64_t ne03 = src0->ne[3];
    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];
    const int64_t ne13 = src1->ne[3];
    const int64_t nb10 = src1->nb[0];
    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];
    size_t x_size;
    size_t d_size;
-    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            const int i0 = i03*ne02 + i02;
            cl_event ev;
            // copy src0 to device
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
-            if (nb10 == sizeof(float)) {
+            const int64_t i13 = i03%ne13;
-                // Contiguous, avoid overhead from queueing many kernel runs
+            const int64_t i12 = i02%ne12;
-                const int64_t i13 = i03%ne13;
+            const int i1 = i13*ne12*ne11 + i12*ne11;
                const int64_t i12 = i02%ne12;
                const int i1 = i13*ne12*ne11 + i12*ne11;
-                cl_int x_offset = 0;
+            cl_int x_offset = 0;
-                cl_int y_offset = i1*ne10;
+            cl_int y_offset = i1*ne10;
-                cl_int d_offset = 0;
+            cl_int d_offset = 0;
-                size_t global = ne00 * ne01;
+            size_t global = ne00 * ne01;
-                cl_int ky = ne10;
+            cl_int ky = ne10 * ne11;
                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
            } else {
                for (int64_t i01 = 0; i01 < ne01; i01++) {
                    const int64_t i13 = i03%ne13;
                    const int64_t i12 = i02%ne12;
                    const int64_t i11 = i01%ne11;
                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
-                    cl_int x_offset = i01*ne00;
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
-                    cl_int y_offset = i1*ne10;
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
-                    cl_int d_offset = i01*ne00;
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
-
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
-                    // compute
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
-                    size_t global = ne00;
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
-                    cl_int ky = ne10;
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
                }
            }
            CL_CHECK(clReleaseEvent(ev));
            CL_CHECK(clFinish(queue));
@ -1518,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
    size_t x_offset = 0;
    int64_t pi02 = -1;
    int64_t pi03 = -1;
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        int64_t i03 = i13 / r3;
+        // TODO: copy src0 here when r3>1
        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                if (src0->backend == GGML_BACKEND_GPU) {
                    x_offset = (i03 * ne02 + i02) * x_ne;
                } else {
                    // copy src0 to device
                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                }
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
-            int64_t i02 = i12 / r2;
+                    // copy src1 to device
                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-            // copy data to device
+                    CL_CHECK(clFinish(queue));
-            if (src0->backend == GGML_BACKEND_GPU) {
+
-                x_offset = (i03 * ne02 + i02) * x_ne;
+                    // compute
-            } else if (i02 != pi02 || i03 != pi03) {
+                    cl_event ev_sgemm;
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                    clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                pi02 = i02;
+                                                               clblast::Transpose::kYes, clblast::Transpose::kNo,
-                pi03 = i03;
+                                                               ne01, ne11, ne10,
                                                               alpha,
                                                               d_X, x_offset, ne00,
                                                               d_Y, 0, ne10,
                                                               beta,
                                                               d_D, 0, ne01,
                                                               &queue, &ev_sgemm);
                    if (status != clblast::StatusCode::kSuccess) {
                        GGML_ASSERT(false);
                    }
                    // copy dst to host
                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
                }
            }
            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
            CL_CHECK(clFinish(queue));
            // compute
            cl_event ev_sgemm;
            clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
                                                       clblast::Transpose::kYes, clblast::Transpose::kNo,
                                                       ne01, ne11, ne10,
                                                       alpha,
                                                       d_X, x_offset, ne00,
                                                       d_Y, 0, ne10,
                                                       beta,
                                                       d_D, 0, ne01,
                                                       &queue, &ev_sgemm);
            if (status != clblast::StatusCode::kSuccess) {
                GGML_ASSERT(false);
            }
            // copy dst to host
            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
        }
    }
@ -1568,7 +1538,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
    ggml_cl_pool_free(d_D, d_size);
 }
-static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
    GGML_ASSERT(fp16_support);
    const int64_t ne00 = src0->ne[0];
@ -1598,6 +1568,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
    const int y_ne = ne11 * ne10;
    const int d_ne = ne11 * ne01;
    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
    ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
    size_t x_size;
    size_t y_size;
    size_t d_size;
@ -1614,74 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
    bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
    size_t x_offset = 0;
    int64_t pi02 = -1;
    int64_t pi03 = -1;
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        int64_t i03 = i13 / r3;
+        // TODO: copy src0 here when r3>1
-
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
-            int64_t i02 = i12 / r2;
+                if (src0->backend == GGML_BACKEND_GPU) {
-
+                    x_offset = (i03 * ne02 + i02) * x_ne;
-            // copy src0 to device
+                } else {
-            if (src0->backend == GGML_BACKEND_GPU) {
+                    // copy src0 to device
-                x_offset = (i03 * ne02 + i02) * x_ne;
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
            } else if (i02 != pi02 || i03 != pi03) {
                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                pi02 = i02;
                pi03 = i03;
            }
            // convert src1 to fp16
            // TODO: use multiple threads
            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
            if (src1_cont_rows) {
                if (src1_cont_cols) {
                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                }
-                else {
+
-                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+                    // convert src1 to fp16
                    // TODO: use multiple threads
                    char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
                    if (src1_cont_rows) {
                        if (src1_cont_cols) {
                            ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                        }
                        else {
                            for (int64_t i11 = 0; i11 < ne11; i11++) {
                                ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                            }
                        }
                    }
-                }
+                    else {
-            }
+                        for (int64_t i11 = 0; i11 < ne11; i11++) {
-            else {
+                            for (int64_t i10 = 0; i10 < ne10; i10++) {
-                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                                // very slow due to no inlining
-                    for (int64_t i10 = 0; i10 < ne10; i10++) {
+                                tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
-                        // very slow due to no inlining
+                            }
-                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                        }
                    }
                    // copy src1 to device
                    CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
                    CL_CHECK(clFinish(queue));
                    // compute
                    cl_event ev_sgemm;
                    clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
                                                               clblast::Transpose::kYes, clblast::Transpose::kNo,
                                                               ne01, ne11, ne10,
                                                               alpha,
                                                               d_X, x_offset, ne00,
                                                               d_Y, 0, ne10,
                                                               beta,
                                                               d_D, 0, ne01,
                                                               &queue, &ev_sgemm);
                    if (status != clblast::StatusCode::kSuccess) {
                        GGML_ASSERT(false);
                    }
                    // copy dst to host, then convert to float
                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                    ggml_fp16_to_fp32_row(tmp, d, d_ne);
                }
            }
            // copy src1 to device
            CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
            CL_CHECK(clFinish(queue));
            // compute
            cl_event ev_sgemm;
            clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
                                                       clblast::Transpose::kYes, clblast::Transpose::kNo,
                                                       ne01, ne11, ne10,
                                                       alpha,
                                                       d_X, x_offset, ne00,
                                                       d_Y, 0, ne10,
                                                       beta,
                                                       d_D, 0, ne01,
                                                       &queue, &ev_sgemm);
            if (status != clblast::StatusCode::kSuccess) {
                GGML_ASSERT(false);
            }
            // copy dst to host, then convert to float
            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
            ggml_fp16_to_fp32_row(tmp, d, d_ne);
        }
    }
@ -1744,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
    size_t ev_idx = 0;
    std::vector<cl_event> events;
-    int64_t pi02 = -1;
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
-    int64_t pi03 = -1;
+        // TODO: copy and dequantize src0 here when r3>1
-
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
-        int64_t i03 = i13 / r3;
+                // copy src0 to device if necessary
-
+                if (src0->backend == GGML_BACKEND_CPU) {
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            int64_t i02 = i12 / r2;
            // copy src0 to device if necessary
            if (src0->backend == GGML_BACKEND_CPU) {
                if (i02 != pi02 || i03 != pi03) {
                    events.emplace_back();
                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                    pi02 = i02;
+                } else if (src0->backend == GGML_BACKEND_GPU) {
-                    pi03 = i03;
+                    d_Q = (cl_mem) src0->extra;
-                }
+                } else {
            } else if (src0->backend == GGML_BACKEND_GPU) {
                d_Q = (cl_mem) src0->extra;
            } else {
                GGML_ASSERT(false);
            }
            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                // copy src1 to device
                events.emplace_back();
                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
                // compute
                const size_t global = ne01 * local;
                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                const cl_int ncols = ne00;
                events.emplace_back();
                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
                CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                // convert src0 to fp32 on device
                const size_t global = x_ne / global_denom;
                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
                // copy src1 to device
                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
                events.emplace_back();
                // wait for conversion
                CL_CHECK(clFinish(queue));
                // compute
                clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
                                                           clblast::Transpose::kYes, clblast::Transpose::kNo,
                                                           ne01, ne11, ne10,
                                                           alpha,
                                                           d_X, 0, ne00,
                                                           d_Y, 0, ne10,
                                                           beta,
                                                           d_D, 0, ne01,
                                                           &queue, events.data() + ev_idx++);
                if (status != clblast::StatusCode::kSuccess) {
                    GGML_ASSERT(false);
                }
            }
-            // copy dst to host
+                if (!mul_mat_vec) {
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    // convert src0 to fp32 on device
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+                    const size_t global = x_ne / global_denom;
-            for (auto *event : events) {
+                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                clReleaseEvent(event);
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
-            }
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
                    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
                }
-            ev_idx = 0;
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
-            events.clear();
+                    if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                        // copy src1 to device
                        events.emplace_back();
                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
                        // compute
                        const size_t global = ne01 * local;
                        const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                        const cl_int ncols = ne00;
                        events.emplace_back();
                        CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
                        CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
                        CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                        CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                        CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
                        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
                    } else { // CLBlast matrix matrix multiplication
                        // copy src1 to device
                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
                        // wait for conversion
                        CL_CHECK(clFinish(queue));
                        // compute
                        events.emplace_back();
                        clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
                                                                   clblast::Transpose::kYes, clblast::Transpose::kNo,
                                                                   ne01, ne11, ne10,
                                                                   alpha,
                                                                   d_X, 0, ne00,
                                                                   d_Y, 0, ne10,
                                                                   beta,
                                                                   d_D, 0, ne01,
                                                                   &queue, events.data() + ev_idx++);
                        if (status != clblast::StatusCode::kSuccess) {
                            GGML_ASSERT(false);
                        }
                    }
                    // copy dst to host
                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
                    for (auto *event : events) {
                        clReleaseEvent(event);
                    }
                    ev_idx = 0;
                    events.clear();
                }
            }
        }
    }
@ -1897,8 +1863,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
 }
 // Reports a work-buffer size in bytes for a mul_mat of src0 and src1 into dst.
 // Non-zero only on the f16 path. The added ('+') lines additionally require src0 to be
 // GGML_TYPE_F16 and size the buffer for the larger of one src1 2D slice (ne[0]*ne[1]) and
 // one dst 2D slice in fp16; the removed ('-') lines sized it for every element of src1.
 // Returns 0 when the f16 path is not taken.
 size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
+    if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
-        return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+        return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
    }
    return 0;
 }
--- a/ggml.c
+++ b/ggml.c
@ -573,7 +573,6 @@ int64_t ggml_cycles_per_ms(void) {
 #define ggml_perf_cycles_per_ms() 0
 #endif
 //
 // cache line
 //
@ -1830,7 +1829,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
    return type_traits[type];
 }
 //
 // simd mappings
 //
@ -4059,16 +4057,17 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "ALIBI",
    "CLAMP",
    "CONV_1D",
    "CONV_1D_STAGE_0",
    "CONV_1D_STAGE_1",
    "CONV_TRANSPOSE_1D",
    "CONV_2D",
    "CONV_2D_STAGE_0",
    "CONV_2D_STAGE_1",
    "CONV_TRANSPOSE_2D",
    "POOL_1D",
    "POOL_2D",
    "UPSCALE",
    "CONV_1D_STAGE_0",
    "CONV_1D_STAGE_1",
    "FLASH_ATTN",
    "FLASH_FF",
    "FLASH_ATTN_BACK",
@ -4094,7 +4093,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "CROSS_ENTROPY_LOSS_BACK",
 };
-static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71");
+static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@ -4145,16 +4144,17 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "alibi(x)",
    "clamp(x)",
    "conv_1d(x)",
    "conv_1d_stage_0(x)",
    "conv_1d_stage_1(x)",
    "conv_transpose_1d(x)",
    "conv_2d(x)",
    "conv_2d_stage_0(x)",
    "conv_2d_stage_1(x)",
    "conv_transpose_2d(x)",
    "pool_1d(x)",
    "pool_2d(x)",
    "upscale(x)",
    "conv_1d_stage_0(x)",
    "conv_1d_stage_1(x)",
    "flash_attn(x)",
    "flash_ff(x)",
    "flash_attn_back(x)",
@ -4180,7 +4180,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "cross_entropy_loss_back(x,y)",
 };
-static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71");
+static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -4211,8 +4211,10 @@ static void ggml_setup_op_has_task_pass(void) {
        p[GGML_OP_CONV_1D                ] = true;
        p[GGML_OP_CONV_1D_STAGE_0        ] = true;
        p[GGML_OP_CONV_1D_STAGE_1        ] = true;
        p[GGML_OP_CONV_2D                ] = true;
        p[GGML_OP_CONV_TRANSPOSE_1D      ] = true;
        p[GGML_OP_CONV_2D                ] = true;
        p[GGML_OP_CONV_2D_STAGE_0        ] = true;
        p[GGML_OP_CONV_2D_STAGE_1        ] = true;
        p[GGML_OP_CONV_TRANSPOSE_2D      ] = true;
        p[GGML_OP_FLASH_ATTN_BACK        ] = true;
        p[GGML_OP_CROSS_ENTROPY_LOSS     ] = true;
@ -5958,7 +5960,6 @@ struct ggml_tensor * ggml_sqrt_inplace(
    return ggml_sqrt_impl(ctx, a, true);
 }
 // ggml_log
 static struct ggml_tensor * ggml_log_impl(
@ -6012,7 +6013,6 @@ struct ggml_tensor * ggml_sum(
    return result;
 }
 // ggml_sum_rows
 struct ggml_tensor * ggml_sum_rows(
@ -6644,7 +6644,6 @@ struct ggml_tensor * ggml_set_2d_inplace(
    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
 }
 // ggml_cpy
 static struct ggml_tensor * ggml_cpy_impl(
@ -6724,7 +6723,6 @@ struct ggml_tensor * ggml_cont_inplace(
    return ggml_cont_impl(ctx, a, true);
 }
 // make contiguous, with new shape
 GGML_API struct ggml_tensor * ggml_cont_1d(
        struct ggml_context * ctx,
@ -7177,7 +7175,6 @@ struct ggml_tensor * ggml_diag(
    return result;
 }
 // ggml_diag_mask_inf
 static struct ggml_tensor * ggml_diag_mask_inf_impl(
@ -7289,7 +7286,6 @@ struct ggml_tensor * ggml_soft_max_inplace(
    return ggml_soft_max_impl(ctx, a, true);
 }
 // ggml_soft_max_back
 static struct ggml_tensor * ggml_soft_max_back_impl(
@ -7706,7 +7702,11 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
 // ggml_conv_2d
-struct ggml_tensor * ggml_conv_2d(
+// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
 // a: [OC, IC, KH, KW]
 // b: [N, IC, IH, IW]
 // result: [N, OH, OW, IC*KH*KW]
 static struct ggml_tensor * ggml_conv_2d_stage_0(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b,
@ -7725,17 +7725,21 @@ struct ggml_tensor * ggml_conv_2d(
        is_node = true;
    }
    const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
    const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
    const int64_t ne[4] = {
-        ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
+        a->ne[2] * a->ne[1] * a->ne[0],
-        ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
+        OW,
-        a->ne[3], b->ne[3],
+        OH,
        b->ne[3],
    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
    int32_t params[] = { s0, s1, p0, p1, d0, d1 };
    ggml_set_op_params(result, params, sizeof(params));
-    result->op = GGML_OP_CONV_2D;
+    result->op = GGML_OP_CONV_2D_STAGE_0;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;
    result->src[1] = b;
@ -7744,8 +7748,61 @@ struct ggml_tensor * ggml_conv_2d(
 }
-// ggml_conv_2d_sk_p0
+// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
 // a: [OC, IC, KH, KW]
 // b: [N, OH, OW, IC * KH * KW]
 // result: [N, OC, OH, OW]
 static struct ggml_tensor * ggml_conv_2d_stage_1(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b) {
    // Builds the GGML_OP_CONV_2D_STAGE_1 node that takes a and b as sources.
    // The backward pass does not exist yet, so any input that tracks a
    // gradient trips the assertion below.
    const bool needs_grad = (a->grad != NULL) || (b->grad != NULL);
    if (needs_grad) {
        GGML_ASSERT(false); // TODO: implement backward
    }
    // result extents, innermost first: b->ne[1], b->ne[2], a->ne[3], b->ne[3]
    int64_t out_ne[4];
    out_ne[0] = b->ne[1];
    out_ne[1] = b->ne[2];
    out_ne[2] = a->ne[3];
    out_ne[3] = b->ne[3];
    struct ggml_tensor * out = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, out_ne);
    out->op     = GGML_OP_CONV_2D_STAGE_1;
    out->grad   = needs_grad ? ggml_dup_tensor(ctx, out) : NULL;
    out->src[0] = a;
    out->src[1] = b;
    return out;
 }
 // a: [OC, IC, KH, KW]
 // b: [N, IC, IH, IW]
 // result: [N, OC, OH, OW]
 struct ggml_tensor * ggml_conv_2d(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b,
    int                  s0,
    int                  s1,
    int                  p0,
    int                  p1,
    int                  d0,
    int                  d1) {
    // The convolution is expressed as two chained graph nodes:
    //   stage 0 unfolds b into patches  -> [N, OH, OW, IC * KH * KW]
    //   stage 1 combines the patches with the kernel a
    struct ggml_tensor * patches = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1);
    return ggml_conv_2d_stage_1(ctx, a, patches);
 }
 // ggml_conv_2d_sk_p0
 struct ggml_tensor * ggml_conv_2d_sk_p0(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
@ -8184,7 +8241,6 @@ static struct ggml_tensor * ggml_add_rel_pos_impl(
    return result;
 }
 struct ggml_tensor * ggml_add_rel_pos(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
@ -8629,8 +8685,6 @@ struct ggml_tensor * ggml_map_custom3_inplace(
    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
 }
 // ggml_cross_entropy_loss
 struct ggml_tensor * ggml_cross_entropy_loss(
@ -9832,7 +9886,6 @@ static void ggml_compute_forward_add1(
    }
 }
 // ggml_compute_forward_acc
 static void ggml_compute_forward_acc_f32(
@ -9972,7 +10025,6 @@ static void ggml_compute_forward_sub_f32(
            const int i2 = (ir - i3*ne2*ne1)/ne1;
            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
 #ifdef GGML_USE_ACCELERATE
            vDSP_vsub(
                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
@ -10153,7 +10205,6 @@ static void ggml_compute_forward_div_f32(
            const int i2 = (ir - i3*ne2*ne1)/ne1;
            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
 #ifdef GGML_USE_ACCELERATE
            UNUSED(ggml_vec_div_f32);
@ -10291,7 +10342,6 @@ static void ggml_compute_forward_sqrt(
    }
 }
 // ggml_compute_forward_log
 static void ggml_compute_forward_log_f32(
@ -12124,7 +12174,6 @@ static void ggml_compute_forward_out_prod_f32(
        }
    }
    //int64_t t1 = ggml_perf_time_us();
    //static int64_t acc = 0;
    //acc += t1 - t0;
@ -12320,7 +12369,6 @@ static void ggml_compute_forward_scale_f32(
    const size_t nb1 = dst->nb[1];
    for (int i1 = ir0; i1 < ir1; i1++) {
        if (dst->data != src0->data) {
            // src0 is same shape as dst => same indices
@ -12718,7 +12766,6 @@ static void ggml_compute_forward_get_rows_back_f32(
    }
 }
 static void ggml_compute_forward_get_rows_back(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@ -13541,7 +13588,7 @@ static void ggml_compute_forward_rope_f16(
                        dst_data[n_dims]     = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
                        dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
                    }
-                } if (!is_neox) {
+                } else if (!is_neox) {
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                        const float cos_theta = cosf(theta);
                        const float sin_theta = sinf(theta);
@ -14001,6 +14048,7 @@ static void ggml_compute_forward_conv_1d_f32(
    }
 }
 // TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
 static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
                             ggml_fp16_t * A,
                             ggml_fp16_t * B,
@ -14302,6 +14350,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
            }
        }
        // need to zero dst since we are accumulating into it
        memset(dst->data, 0, ggml_nbytes(dst));
        return;
    }
@ -14374,7 +14425,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
                    const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
                    float * dst_data = wdata + i01*ne00*ne02;
                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00];
+                        dst_data[i00*ne02 + i02] = src[i00];
                    }
                }
            }
@ -14393,6 +14444,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
            }
        }
        // need to zero dst since we are accumulating into it
        memset(dst->data, 0, ggml_nbytes(dst));
        return;
    }
@ -14454,6 +14508,144 @@ static void ggml_compute_forward_conv_transpose_1d(
 // ggml_compute_forward_conv_2d
 // src0: kernel [OC, IC, KH, KW]
 // src1: image [N, IC, IH, IW]
 // dst:  result [N, OH, OW, IC*KH*KW]
 static void ggml_compute_forward_conv_2d_stage_0_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
              struct ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F16);
    int64_t t0 = ggml_perf_time_us();
    UNUSED(t0);
    GGML_TENSOR_BINARY_OP_LOCALS;
    const int64_t N = ne13;
    const int64_t IC = ne12;
    const int64_t IH = ne11;
    const int64_t IW = ne10;
    // const int64_t OC = ne03;
    // const int64_t IC = ne02;
    const int64_t KH = ne01;
    const int64_t KW = ne00;
    const int64_t OH = ne2;
    const int64_t OW = ne1;
    const int ith = params->ith;
    const int nth = params->nth;
    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
    GGML_ASSERT(nb10 == sizeof(float));
    if (params->type == GGML_TASK_INIT) {
        memset(dst->data, 0, ggml_nbytes(dst));
        return;
    }
    if (params->type == GGML_TASK_FINALIZE) {
        return;
    }
    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
    {
        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
        for (int64_t in = 0; in < N; in++) {
            for (int64_t ioh = 0; ioh < OH; ioh++) {
                for (int64_t iow = 0; iow < OW; iow++) {
                    for (int64_t iic = ith; iic < IC; iic+=nth) {
                        // micro kernel
                        ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
                        const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
                        for (int64_t ikh = 0; ikh < KH; ikh++) {
                            for (int64_t ikw = 0; ikw < KW; ikw++) {
                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
                                if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
                                }
                            }
                        }
                    }
                }
            }
        }
    }
 }
 // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
 // src0: [OC, IC, KH, KW]
 // src1: [N, OH, OW, IC * KH * KW]
 // result: [N, OC, OH, OW]
 static void ggml_compute_forward_conv_2d_stage_1_f16(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
              struct ggml_tensor * dst) {
    // Runs one GEMM per batch entry:
    //   A = src0           [m, k]  (shared by every batch)
    //   B = src1, batch i  [n, k]
    //   C = dst,  batch i  [m, n]
    // with m = OC, n = OH*OW and k = IC*KH*KW.
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    int64_t t0 = ggml_perf_time_us();
    UNUSED(t0);
    // nothing to prepare or clean up for this op
    if (params->type == GGML_TASK_INIT) {
        return;
    }
    if (params->type == GGML_TASK_FINALIZE) {
        return;
    }
    GGML_TENSOR_BINARY_OP_LOCALS;
    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
    GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
    GGML_ASSERT(nb0  == sizeof(float));
    const int N = ne13;
    const int OH = ne12;
    const int OW = ne11;
    const int OC = ne03;
    const int IC = ne02;
    const int KH = ne01;
    const int KW = ne00;
    const int ith = params->ith;
    const int nth = params->nth;
    // widen before multiplying so the products are formed in 64 bits
    const int64_t m = OC;
    const int64_t n = (int64_t) OH * OW;
    const int64_t k = (int64_t) IC * KH * KW;
    // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
    for (int i = 0; i < N; i++) {
        ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
        // each batch of src1 holds n rows of k values, so consecutive batches
        // are n*k elements apart (not m*k, which is the size of A)
        ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * n * k; // [n, k]
        float * C = (float *)dst->data + i * m * n; // [m, n]
        gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
    }
 }
 static void ggml_compute_forward_conv_2d_f16_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@ -14466,16 +14658,40 @@ static void ggml_compute_forward_conv_2d_f16_f32(
    int64_t t0 = ggml_perf_time_us();
    UNUSED(t0);
-    GGML_TENSOR_BINARY_OP_LOCALS;
+    GGML_TENSOR_BINARY_OP_LOCALS
    // src1: image [N, IC, IH, IW]
    // src0: kernel [OC, IC, KH, KW]
    // dst:  result [N, OC, OH, OW]
    // ne12: IC
    // ne0: OW
    // ne1: OH
    // nk0: KW
    // nk1: KH
    // ne13: N
    const int N = ne13;
    const int IC = ne12;
    const int IH = ne11;
    const int IW = ne10;
    const int OC = ne03;
    // const int IC = ne02;
    const int KH = ne01;
    const int KW = ne00;
    const int OH = ne1;
    const int OW = ne0;
    const int ith = params->ith;
    const int nth = params->nth;
-    const int nk0 = ne00;
+    // const int nk0 = ne00;
-    const int nk1 = ne01;
+    // const int nk1 = ne01;
    // size of the convolution row - the kernel size unrolled across all channels
-    const int ew0 = nk0*nk1*ne02;
+    // const int ew0 = nk0*nk1*ne02;
    // ew0: IC*KH*KW
    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
@ -14491,24 +14707,27 @@ static void ggml_compute_forward_conv_2d_f16_f32(
        memset(params->wdata, 0, params->wsize);
        // prepare source data (src1)
        // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
        {
            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-            for (int i13 = 0; i13 < ne13; i13++) {
+            for (int in = 0; in < N; in++) {
-                for (int i12 = 0; i12 < ne12; i12++) {
+                for (int iic = 0; iic < IC; iic++) {
-                    const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
+                    for (int ioh = 0; ioh < OH; ioh++) {
-                    ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
+                        for (int iow = 0; iow < OW; iow++) {
-                    for (int i1 = 0; i1 < ne1; i1++) {
+                            // micro kernel
-                        for (int i0 = 0; i0 < ne0; i0++) {
+                            ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                            for (int ik1 = 0; ik1 < nk1; ik1++) {
+                            const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
                                for (int ik0 = 0; ik0 < nk0; ik0++) {
                                    const int idx0 = i0*s0 + ik0*d0 - p0;
                                    const int idx1 = i1*s1 + ik1*d1 - p1;
-                                    if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
+                            for (int ikh = 0; ikh < KH; ikh++) {
-                                        dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
+                                for (int ikw = 0; ikw < KW; ikw++) {
-                                            GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
+                                    const int iiw = iow*s0 + ikw*d0 - p0;
                                    const int iih = ioh*s1 + ikh*d1 - p1;
                                    if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
                                        dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
                                    }
                                }
                            }
@ -14525,30 +14744,22 @@ static void ggml_compute_forward_conv_2d_f16_f32(
        return;
    }
    // total patches in dst
    const int np = ne2;
    // patches per thread
    const int dp = (np + nth - 1)/nth;
    // patch range for this thread
    const int ip0 = dp*ith;
    const int ip1 = MIN(ip0 + dp, np);
    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
    // wdata: [N*OH*OW, IC*KH*KW]
    // dst: result [N, OC, OH, OW]
    // src0: kernel [OC, IC, KH, KW]
-    for (int i3 = 0; i3 < ne3; i3++) {
+    int64_t m = OC;
-        for (int i2 = ip0; i2 < ip1; i2++) {
+    int64_t n = OH * OW;
-            float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);
+    int64_t k = IC * KH * KW;
-            for (int i1 = 0; i1 < ne1; ++i1) {
+    // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-                for (int i0 = 0; i0 < ne0; ++i0) {
+    for (int i = 0; i < N; i++) {
-                    ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
+        ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
-                            (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
+        ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
-                            (ggml_fp16_t *)                wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
+        float * C = (float *)dst->data + i * m * n; // [m * k]
-                }
+
-            }
+        gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
        }
    }
 }
@ -14574,6 +14785,48 @@ static void ggml_compute_forward_conv_2d(
    }
 }
 static void ggml_compute_forward_conv_2d_stage_0(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
              struct ggml_tensor * dst) {
    // Dispatch on the kernel type. Only F16 kernels are implemented; F32 and
    // every other type hit the assertion.
    if (src0->type != GGML_TYPE_F16) {
        GGML_ASSERT(false);
        return;
    }
    // note: the F16-kernel path is served by the routine with the _f32 suffix
    ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
 }
 static void ggml_compute_forward_conv_2d_stage_1(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
              struct ggml_tensor * dst) {
    // Dispatch on the kernel type. Only F16 kernels are implemented; F32 and
    // every other type hit the assertion.
    if (src0->type != GGML_TYPE_F16) {
        GGML_ASSERT(false);
        return;
    }
    ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
 }
 // ggml_compute_forward_conv_transpose_2d
 static void ggml_compute_forward_conv_transpose_2d(
@ -14632,6 +14885,8 @@ static void ggml_compute_forward_conv_transpose_2d(
            }
        }
        memset(dst->data, 0, ggml_nbytes(dst));
        return;
    }
@ -16130,7 +16385,6 @@ static void ggml_compute_forward_add_rel_pos_f32(
    const int ip0 = dp*ith;
    const int ip1 = MIN(ip0 + dp, np);
    for (int64_t i13 = ip0; i13 < ip1; ++i13) {
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            for (int64_t i11 = 0; i11 < ne11; ++i11) {
@ -16197,7 +16451,6 @@ static void ggml_compute_forward_map_unary_f32(
    }
 }
 static void ggml_compute_forward_map_unary(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@ -16245,7 +16498,6 @@ static void ggml_compute_forward_map_binary_f32(
    }
 }
 static void ggml_compute_forward_map_binary(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@ -16297,7 +16549,6 @@ static void ggml_compute_forward_map_custom2_f32(
    fun(dst, a, b);
 }
 // ggml_compute_forward_map_custom3
 static void ggml_compute_forward_map_custom3_f32(
@ -16572,7 +16823,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
        ggml_vec_sub_f32(nc, ds0, ds0, s1);
        ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
 #ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {
            assert(!isnan(ds0[i]));
@ -16600,12 +16850,15 @@ static void ggml_compute_forward_cross_entropy_loss_back(
    }
 }
 /////////////////////////////////
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    GGML_ASSERT(params);
    if (tensor->op == GGML_OP_NONE) {
        return;
    }
 #ifdef GGML_USE_CUBLAS
    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
    if (skip_cpu) {
@ -16820,6 +17073,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
            } break;
        case GGML_OP_CONV_2D_STAGE_0:
            {
                ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
            } break;
        case GGML_OP_CONV_2D_STAGE_1:
            {
                ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
            } break;
        case GGML_OP_CONV_TRANSPOSE_2D:
            {
                ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
@ -17749,11 +18010,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_OP_CONV_TRANSPOSE_1D:
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_OP_CONV_2D:
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
-        case GGML_OP_CONV_TRANSPOSE_1D:
+        case GGML_OP_CONV_2D_STAGE_0:
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_OP_CONV_2D_STAGE_1:
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
@ -18682,6 +18951,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                    const int64_t ne0 = node->ne[0];
                    const int64_t ne1 = node->ne[1];
                    const int64_t ne2 = node->ne[2];
                    const int64_t ne3 = node->ne[3];
                    const int64_t nk = ne00*ne01;
                    const int64_t ew0 = nk * ne02;
@ -18692,7 +18962,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                    if (node->src[0]->type == GGML_TYPE_F16 &&
                        node->src[1]->type == GGML_TYPE_F32) {
-                        cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
+                        // im2col: [N*OH*OW, IC*KH*KW]
                        cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
                               node->src[1]->type == GGML_TYPE_F32) {
                        cur = sizeof(float)*      (ne10*ne11*ne12);
@ -18702,6 +18973,14 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                    work_size = MAX(work_size, cur);
                } break;
            case GGML_OP_CONV_2D_STAGE_0:
                {
                    n_tasks = n_threads;
                } break;
            case GGML_OP_CONV_2D_STAGE_1:
                {
                    n_tasks = n_threads;
                } break;
            case GGML_OP_CONV_TRANSPOSE_2D:
                {
                    n_tasks = n_threads;
@ -19186,6 +19465,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
                            if (idx == -1) {
                                fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
                                fclose(fout);
                                return;
                            }
@ -19889,7 +20169,6 @@ static enum ggml_opt_result ggml_opt_adam(
        opt->loss_after = fx;
        // check convergence
        if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
            GGML_PRINT_DEBUG("converged\n");
@ -20860,7 +21139,7 @@ struct gguf_kv {
 };
 struct gguf_header {
-    uint32_t magic;
+    char magic[4];
    uint32_t version;
    uint64_t n_tensors; // GGUFv2
    uint64_t n_kv;      // GGUFv2
@ -20930,7 +21209,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
 struct gguf_context * gguf_init_empty(void) {
    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
-    ctx->header.magic     = GGUF_MAGIC;
+    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
    ctx->header.version   = GGUF_VERSION;
    ctx->header.n_tensors = 0;
    ctx->header.n_kv      = 0;
@ -20956,16 +21235,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // offset from start of file
    size_t offset = 0;
-    uint32_t magic = 0;
+    char magic[4];
    // check the magic before making allocations
    {
        gguf_fread_el(file, &magic, sizeof(magic), &offset);
-        if (magic != GGUF_MAGIC) {
+        for (uint32_t i = 0; i < sizeof(magic); i++) {
-            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
+            if (magic[i] != GGUF_MAGIC[i]) {
-            fclose(file);
+                fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
-            return NULL;
+                fclose(file);
                return NULL;
            }
        }
    }
@ -20975,7 +21256,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // read the header
    {
-        ctx->header.magic = magic;
+        strncpy(ctx->header.magic, magic, 4);
        ctx->kv    = NULL;
        ctx->infos = NULL;
--- a/ggml.h
+++ b/ggml.h
@ -231,8 +231,9 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
-#define GGUF_MAGIC   0x46554747 // "GGUF"
+#define GGUF_MAGIC "GGUF"
-#define GGUF_VERSION 2
+
 #define GGUF_VERSION 3
 #define GGUF_DEFAULT_ALIGNMENT 32
@ -400,15 +401,16 @@ extern "C" {
        GGML_OP_ALIBI,
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
-        GGML_OP_CONV_2D,
+        GGML_OP_CONV_1D_STAGE_0,  // internal
        GGML_OP_CONV_1D_STAGE_1,  // internal
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_CONV_2D,
        GGML_OP_CONV_2D_STAGE_0, // internal
        GGML_OP_CONV_2D_STAGE_1, // internal
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
        GGML_OP_CONV_1D_STAGE_0,  // internal
        GGML_OP_CONV_1D_STAGE_1,  // internal
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_FLASH_ATTN,
@ -1019,9 +1021,9 @@ extern "C" {
            struct ggml_tensor  * b,
            float                 eps);
-    // A: n columns, m rows
+    // A: k columns, n rows => [ne03, ne02, n, k]
-    // B: n columns, p rows  (i.e. we transpose it internally)
+    // B: k columns, m rows  (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
-    // result is m columns, p rows
+    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
    GGML_API struct ggml_tensor * ggml_mul_mat(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@ -19,9 +19,10 @@ import numpy as np
 #
 GGUF_MAGIC             = 0x46554747
-GGUF_VERSION           = 2
+GGUF_VERSION           = 3
 GGUF_DEFAULT_ALIGNMENT = 32
 # general
 KEY_GENERAL_ARCHITECTURE         = "general.architecture"
 KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
@ -597,6 +598,10 @@ class GGMLQuantizationType(IntEnum):
    Q6_K = 14
    Q8_K = 15
 class GGUFEndian(IntEnum):
    LITTLE = 0
    BIG = 1
 class GGUFValueType(IntEnum):
    UINT8   = 0
@ -644,18 +649,41 @@ class GGUFWriter:
    temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
    tensors: list[tuple[np.ndarray[Any, Any], int]]
-    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
+    @property
    def pack_prefix(self):
        # struct byte-order prefix: "<" when the writer is little-endian,
        # ">" for any other endianess value.
        return "<" if self.endianess==GGUFEndian.LITTLE else ">"
    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
        # Opens `path` for binary writing, records the architecture name and
        # the requested endianess, and prepares the struct format strings
        # used to pack simple scalar values.
        self.fout = open(path, "wb")
        self.arch = arch
        self.endianess = endianess
        # Look the byte-order prefix up once; every multi-byte format shares it.
        prefix = self.pack_prefix
        self._simple_value_packing = {
            GGUFValueType.UINT8:   prefix + "B",
            GGUFValueType.INT8:    prefix + "b",
            GGUFValueType.UINT16:  prefix + "H",
            GGUFValueType.INT16:   prefix + "h",
            GGUFValueType.UINT32:  prefix + "I",
            GGUFValueType.INT32:   prefix + "i",
            GGUFValueType.FLOAT32: prefix + "f",
            GGUFValueType.UINT64:  prefix + "Q",
            GGUFValueType.INT64:   prefix + "q",
            GGUFValueType.FLOAT64: prefix + "d",
            # bools are packed without a byte-order prefix
            GGUFValueType.BOOL:    "?" ,
        }
        self.add_architecture()
        self.use_temp_file = use_temp_file
        self.tensors = []
        # Tell the user which byte order the produced file targets.
        if self.endianess == GGUFEndian.BIG:
            endianess_str = "Big Endian"
        else:
            endianess_str = "Little Endian"
        print(f"This gguf file is for {endianess_str} only")
    def write_header_to_file(self):
        self.fout.write(struct.pack("<I", GGUF_MAGIC))
-        self.fout.write(struct.pack("<I", GGUF_VERSION))
+        self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
-        self.fout.write(struct.pack("<Q", self.ti_data_count))
+        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
-        self.fout.write(struct.pack("<Q", self.kv_data_count))
+        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
        self.flush()
 #        print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
@ -727,25 +755,12 @@ class GGUFWriter:
        self.add_key(key)
        self.add_val(val, GGUFValueType.ARRAY)
    _simple_value_packing = {
        GGUFValueType.UINT8:   "<B",
        GGUFValueType.INT8:    "<b",
        GGUFValueType.UINT16:  "<H",
        GGUFValueType.INT16:   "<h",
        GGUFValueType.UINT32:  "<I",
        GGUFValueType.INT32:   "<i",
        GGUFValueType.FLOAT32: "<f",
        GGUFValueType.UINT64:  "<Q",
        GGUFValueType.INT64:   "<q",
        GGUFValueType.FLOAT64: "<d",
        GGUFValueType.BOOL:    "?" ,
    }
    def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
        if vtype is None:
            vtype = GGUFValueType.get_type(val)
        if add_vtype:
-            self.kv_data += struct.pack("<I", vtype)
+            self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
            self.kv_data_count += 1
        pack_fmt = self._simple_value_packing.get(vtype)
@ -753,14 +768,14 @@ class GGUFWriter:
            self.kv_data += struct.pack(pack_fmt, val)
        elif vtype == GGUFValueType.STRING:
            encoded_val = val.encode("utf8") if isinstance(val, str) else val
-            self.kv_data += struct.pack("<Q", len(encoded_val))
+            self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
            self.kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
            ltype = GGUFValueType.get_type(val[0])
            if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                raise ValueError("All items in a GGUF array should be of the same type")
-            self.kv_data += struct.pack("<I", ltype)
+            self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
-            self.kv_data += struct.pack("<Q", len(val))
+            self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
            for item in val:
                self.add_val(item, add_vtype=False)
        else:
@ -774,22 +789,24 @@ class GGUFWriter:
        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
        encoded_name = name.encode("utf8")
-        self.ti_data += struct.pack("<Q", len(encoded_name))
+        self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
        self.ti_data += encoded_name
        n_dims = len(tensor_shape)
-        self.ti_data += struct.pack("<I", n_dims)
+        self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
        for i in range(n_dims):
-            self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
+            self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
        if raw_dtype is None:
            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
        else:
            dtype = raw_dtype
-        self.ti_data += struct.pack("<I", dtype)
+        self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
-        self.ti_data += struct.pack("<Q", self.offset_tensor)
+        self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
        self.ti_data_count += 1
    def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
        if self.endianess == GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
        if self.use_temp_file and self.temp_file is None:
            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
            fp.seek(0)
@ -815,6 +832,8 @@ class GGUFWriter:
            fp.write(bytes([0] * pad))
    def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
        if self.endianess==GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
        self.write_padding(self.fout, self.fout.tell())
        tensor.tofile(self.fout)
        self.write_padding(self.fout, tensor.nbytes)
@ -968,12 +987,15 @@ class SpecialVocab:
    merges: list[str] = []
    special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
    special_token_ids: dict[str, int] = {}
    n_vocab: int | None = None
    def __init__(
        self, path: str | os.PathLike[str], load_merges: bool = False,
        special_token_types: tuple[str, ...] | None = None,
        n_vocab: int | None = None,
    ):
        self.special_token_ids = {}
        self.n_vocab = n_vocab
        self.load_merges = load_merges
        if special_token_types is not None:
            self.special_token_types = special_token_types
@ -983,6 +1005,16 @@ class SpecialVocab:
        if not self._try_load_from_tokenizer_json(path):
            self._try_load_from_config_json(path)
    def _set_special_token(self, typ: str, tid: Any):
        if not isinstance(tid, int) or tid < 0:
            return
        if self.n_vocab is None or tid < self.n_vocab:
            self.special_token_ids[typ] = tid
            return
        print(f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
            file = sys.stderr)
    def _try_load_from_tokenizer_json(self, path: Path) -> bool:
        tokenizer_file = path / 'tokenizer.json'
        if not tokenizer_file.is_file():
@ -1010,10 +1042,11 @@ class SpecialVocab:
                tc_content = entry_content
            else:
                continue
-            for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
+            # We only need the first match here.
-                if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
+            maybe_token_id = next((
-                    self.special_token_ids[typ] = maybe_token_id
+                atok.get('id') for atok in added_tokens
-                break
+                if atok.get('content') == tc_content), None)
            self._set_special_token(typ, maybe_token_id)
        return True
    def _try_load_from_config_json(self, path: Path) -> bool:
@ -1023,21 +1056,21 @@ class SpecialVocab:
        with open(config_file, encoding = 'utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
-            maybe_token_id = config.get(f'{typ}_token_id')
+            self._set_special_token(typ, config.get(f'{typ}_token_id'))
            if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
                self.special_token_ids[typ] = maybe_token_id
        return True
-    def add_to_gguf(self, gw: GGUFWriter) -> None:
+    def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
        if len(self.merges) > 0:
-            print(f'gguf: Adding {len(self.merges)} merge(s).')
+            if not quiet:
                print(f'gguf: Adding {len(self.merges)} merge(s).')
            gw.add_token_merges(self.merges)
        for typ, tokid in self.special_token_ids.items():
            handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
            if handler is None:
-                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
+                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', file = sys.stderr)
                continue
-            print(f'gguf: Setting special token type {typ} to {tokid}')
+            if not quiet:
                print(f'gguf: Setting special token type {typ} to {tokid}')
            handler(tokid)
    def __repr__(self) -> str:
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.4.4"
+version = "0.4.5"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
--- a/k_quants.c
+++ b/k_quants.c
@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if !defined(__riscv)
+#if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
 #endif
 #endif
@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
 }
 size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    const int nb = k / QK_K;
+    (void)hist; // TODO: collect histograms
-    // TODO - collect histograms - although, at a second thought, I don't really care about them
+    for (int j = 0; j < n; j += k) {
    (void)hist;
    for (int j = 0; j < nb; j += k) {
        block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
        quantize_row_q2_K_reference(src + j, y, k);
    }
@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
 }
 size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    const int nb = k / QK_K;
+    (void)hist; // TODO: collect histograms
-    // TODO - collect histograms - although, at a second thought, I don't really care about them
+    for (int j = 0; j < n; j += k) {
    (void)hist;
    for (int j = 0; j < nb; j += k) {
        block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
        quantize_row_q3_K_reference(src + j, y, k);
    }
@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
 size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;
    (void)hist; // TODO: collect histograms
-    for (int j = 0; j < nb; j += k) {
+
    for (int j = 0; j < n; j += k) {
        block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
        quantize_row_q4_K_reference(src + j, y, k);
    }
@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
 size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    (void)hist; // TODO: collect histograms
-    (void)hist;
+
-    for (int j = 0; j < nb; j += k) {
+    for (int j = 0; j < n; j += k) {
        block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
        quantize_row_q5_K_reference(src + j, y, k);
    }
@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
 size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    (void)hist; // TODO: collect histograms
-    (void)hist; // TODO
+    for (int j = 0; j < n; j += k) {
    for (int j = 0; j < nb; j += k) {
        block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
        quantize_row_q6_K_reference(src + j, y, k);
    }
--- a/llama.cpp
+++ b/llama.cpp
@ -77,6 +77,7 @@
 #include <thread>
 #include <unordered_map>
 #include <set>
 #include <forward_list>
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@ -994,14 +995,15 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
    (void) tensor;
 }
-static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
    std::vector<char> result(8, 0);
    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
    if (n_tokens < 0) {
        result.resize(-n_tokens);
        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
        GGML_ASSERT(check == -n_tokens);
-    } else {
+    }
    else {
        result.resize(n_tokens);
    }
@ -1037,8 +1039,8 @@ enum e_model {
 };
 static const size_t kB = 1024;
-static const size_t MB = kB*kB;
+static const size_t MB = 1024*kB;
-static const size_t GB = kB*kB*kB;
+static const size_t GB = 1024*MB;
 struct llama_hparams {
    bool     vocab_only;
@ -1061,21 +1063,21 @@ struct llama_hparams {
    float f_max_alibi_bias;
    bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only != other.vocab_only) return true;
+        if (this->vocab_only  != other.vocab_only)  return true;
-        if (this->n_vocab != other.n_vocab) return true;
+        if (this->n_vocab     != other.n_vocab)     return true;
        if (this->n_ctx_train != other.n_ctx_train) return true;
-        if (this->n_embd != other.n_embd) return true;
+        if (this->n_embd      != other.n_embd)      return true;
-        if (this->n_head != other.n_head) return true;
+        if (this->n_head      != other.n_head)      return true;
-        if (this->n_head_kv != other.n_head_kv) return true;
+        if (this->n_head_kv   != other.n_head_kv)   return true;
-        if (this->n_layer != other.n_layer) return true;
+        if (this->n_layer     != other.n_layer)     return true;
-        if (this->n_rot != other.n_rot) return true;
+        if (this->n_rot       != other.n_rot)       return true;
-        if (this->n_ff != other.n_ff) return true;
+        if (this->n_ff        != other.n_ff)        return true;
        const float EPSILON = 1e-9;
-        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
        return false;
@ -1203,6 +1205,8 @@ struct llama_vocab {
    std::unordered_map<token, id> token_to_id;
    std::vector<token_data>       id_to_token;
    std::unordered_map<token, id> special_tokens_cache;
    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
    // default LLaMA special tokens
@ -1212,17 +1216,17 @@ struct llama_vocab {
    id special_sep_id = -1;
    id special_pad_id = -1;
-    id linefeed_id = 13;
+    id linefeed_id       = 13;
    id special_prefix_id = 32007;
    id special_middle_id = 32009;
    id special_suffix_id = 32008;
-    id special_eot_id = 32010;
+    id special_eot_id    = 32010;
    int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left,  " ",  "\u0120");
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
-        replace_all(token_left,  "\n", "\u010A");
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
-        replace_all(token_right, " ",  "\u0120");
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
-        replace_all(token_right, "\n", "\u010A");
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);
        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
        if (it == bpe_ranks.end()) {
@ -1380,10 +1384,7 @@ static bool llama_kv_cache_init(
    cache.cells.clear();
    cache.cells.resize(n_ctx);
-    // TODO: this should be:
+    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
    //       cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
    //       change it and test that it works
    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
    memset(cache.buf.data, 0, cache.buf.size);
    struct ggml_init_params params;
@ -1471,7 +1472,10 @@ static bool llama_kv_cache_find_slot(
    for (uint32_t i = 0; i < n_tokens; i++) {
        cache.cells[cache.head + i].pos = batch.pos[i];
-        cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
+
        for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
            cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
        }
    }
    return true;
@ -1551,6 +1555,9 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
            cache.cells[i].pos = -1;
            cache.cells[i].seq_id.clear();
            if (new_head == cache.size) new_head = i;
        } else {
            cache.cells[i].seq_id.clear();
            cache.cells[i].seq_id.insert(seq_id);
        }
    }
@ -2156,7 +2163,7 @@ static void llm_load_hparams(
 }
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 static void llm_load_vocab(
@ -2263,15 +2270,130 @@ static void llm_load_vocab(
    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
    } else {
-        vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
+        const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
        vocab.linefeed_id = ids[0];
    }
    // special tokens
-    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
+    {
-    GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
+        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-    GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
+            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
-    GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
+            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
-    GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
+            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
        };
        for (const auto & it : special_token_types) {
            const std::string & key = kv(std::get<0>(it));
            int32_t & id = std::get<1>(it), old_id = id;
            GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
            // Must be >= -1 and < vocab size. Since the key is unsigned, -1
            // can only come from the default value, so there's no point in
            // validating that.
            if (size_t(id + 1) > vocab.id_to_token.size()) {
                LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
                    __func__, key.c_str(), id, old_id);
                id = old_id;
            }
        }
    }
    // build special tokens cache
    {
        // TODO: It is unclear (to me) at this point, whether special tokens are guaranteed to be of a deterministic type,
        //  and will always be correctly labeled in 'added_tokens.json' etc.
        // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
        //  to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
        //  are special tokens.
        // From testing, this appears to correlate 1:1 with special tokens.
        //
        // Counting special tokens and verifying in only one direction
        //  is sufficient to detect difference in those two sets.
        //
        uint32_t special_tokens_count_by_type = 0;
        uint32_t special_tokens_count_from_verification = 0;
        bool special_tokens_definition_mismatch = false;
        for (const auto & t : vocab.token_to_id) {
            const auto & token = t.first;
            const auto & id    = t.second;
            // Count all non-normal tokens in the vocab while iterating
            if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
                special_tokens_count_by_type++;
            }
            // Skip single character tokens
            if (token.length() > 1) {
                bool is_tokenizable = false;
                // Split token string representation in two, in all possible ways
                //  and check if both halves can be matched to a valid token
                for (unsigned i = 1; i < token.length();) {
                    const auto left  = token.substr(0, i);
                    const auto right = token.substr(i);
                    // check if we didn't partition in the middle of a utf sequence
                    auto utf = utf8_len(left.at(left.length() - 1));
                    if (utf == 1) {
                        if (vocab.token_to_id.find(left)  != vocab.token_to_id.end() &&
                            vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
                            is_tokenizable = true;
                            break;
                        }
                        i++;
                    } else {
                        // skip over the rest of multibyte utf sequence
                        i += utf - 1;
                    }
                }
                if (!is_tokenizable) {
                    // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
                    //  it's faster to re-filter them here, since there are way less candidates now
                    // Calculate a total "utf" length of a token string representation
                    size_t utf8_str_len = 0;
                    for (unsigned i = 0; i < token.length();) {
                        utf8_str_len++;
                        i += utf8_len(token.at(i));
                    }
                    // And skip the ones which are one character
                    if (utf8_str_len > 1) {
                        // At this point what we have left are special tokens only
                        vocab.special_tokens_cache[token] = id;
                        // Count manually found special tokens
                        special_tokens_count_from_verification++;
                        // If this manually found special token is not marked as such, flag a mismatch
                        if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
                            special_tokens_definition_mismatch = true;
                        }
                    }
                }
            }
        }
        if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
            LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
                __func__,
                special_tokens_count_from_verification, vocab.id_to_token.size(),
                special_tokens_count_by_type, vocab.id_to_token.size()
            );
        } else {
            LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
                __func__,
                special_tokens_count_from_verification, vocab.id_to_token.size()
            );
        }
    }
 }
 static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
@ -2874,8 +2996,8 @@ static void llm_load_tensors(
                        auto & layer = model.layers[i];
                        layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
-                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
+                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-                        layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
+                        layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},                backend_split);
                        layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
@ -3115,7 +3237,7 @@ static struct ggml_cgraph * llm_build_llama(
        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
                for (int i = 0; i < n_kv; ++i) {
                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@ -3501,7 +3623,7 @@ static struct ggml_cgraph * llm_build_baichaun(
        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
                for (int i = 0; i < n_kv; ++i) {
                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@ -3900,7 +4022,7 @@ static struct ggml_cgraph * llm_build_refact(
        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
                for (int i = 0; i < n_kv; ++i) {
                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@ -4252,7 +4374,7 @@ static struct ggml_cgraph * llm_build_falcon(
        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
                for (int i = 0; i < n_kv; ++i) {
                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@ -4604,7 +4726,7 @@ static struct ggml_cgraph * llm_build_starcoder(
        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
                for (int i = 0; i < n_kv; ++i) {
                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@ -4835,7 +4957,7 @@ static struct ggml_cgraph * llm_build_persimmon(
        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
                for (int i = 0; i < n_kv; ++i) {
                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
@ -5233,7 +5355,7 @@ static struct ggml_cgraph * llm_build_bloom(
        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
                for (int i = 0; i < n_kv; ++i) {
                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@ -5403,7 +5525,7 @@ static struct ggml_cgraph * llm_build_mpt(
    const int64_t n_layer     = hparams.n_layer;
    const int64_t n_ctx       = cparams.n_ctx;
    const int64_t n_head      = hparams.n_head;
-    const int64_t n_head_kv   = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
+    const int64_t n_head_kv   = hparams.n_head_kv;
    const int64_t n_embd_head = hparams.n_embd_head();
    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
@ -5501,7 +5623,7 @@ static struct ggml_cgraph * llm_build_mpt(
        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
                for (int i = 0; i < n_kv; ++i) {
                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@ -5801,8 +5923,11 @@ static int llama_decode_internal(
    // helpers for smoother batch API transition
    // after deprecating the llama_eval calls, these will be removed
-    std::vector<llama_pos>    pos;
+    std::vector<llama_pos> pos;
-    std::vector<llama_seq_id> seq_id;
+
    std::vector<int32_t>                   n_seq_id;
    std::vector<llama_seq_id *>            seq_id_arr;
    std::vector<std::vector<llama_seq_id>> seq_id;
    if (batch.pos == nullptr) {
        pos.resize(n_tokens);
@ -5814,12 +5939,18 @@ static int llama_decode_internal(
    }
    if (batch.seq_id == nullptr) {
        n_seq_id.resize(n_tokens);
        seq_id.resize(n_tokens);
        seq_id_arr.resize(n_tokens);
        for (uint32_t i = 0; i < n_tokens; i++) {
-            seq_id[i] = batch.all_seq_id;
+            n_seq_id[i] = 1;
            seq_id[i].resize(1);
            seq_id[i][0] = batch.all_seq_id;
            seq_id_arr[i] = seq_id[i].data();
        }
-        batch.seq_id = seq_id.data();
+        batch.n_seq_id = n_seq_id.data();
        batch.seq_id = seq_id_arr.data();
    }
    if (!llama_kv_cache_find_slot(kv_self, batch)) {
@ -5840,6 +5971,13 @@ static int llama_decode_internal(
    ggml_allocr_alloc_graph(lctx.alloc, gf);
    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
    GGML_ASSERT(strcmp(res->name,        "result_output") == 0);
    GGML_ASSERT(strcmp(embeddings->name, "result_norm")   == 0);
 #ifdef GGML_USE_CUBLAS
    for (int i = 0; i < gf->n_leafs; i++) {
        ggml_tensor * node = gf->leafs[i];
@ -5857,6 +5995,12 @@ static int llama_decode_internal(
    }
    ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
    // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
    if (!lctx.embedding.empty()) {
        embeddings->backend = GGML_BACKEND_CPU;
    }
    res->backend = GGML_BACKEND_CPU;
 #endif
    // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@ -5881,12 +6025,6 @@ static int llama_decode_internal(
        n_threads = 1;
    }
    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
    GGML_ASSERT(strcmp(res->name,        "result_output") == 0);
    GGML_ASSERT(strcmp(embeddings->name, "result_norm")   == 0);
 #if GGML_USE_MPI
    const int64_t n_layer = hparams.n_layer;
    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@ -6021,11 +6159,10 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 }
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
    static const char * hex = "0123456789ABCDEF";
    switch (llama_vocab_get_type(vocab)) {
    case LLAMA_VOCAB_TYPE_SPM: {
-        char buf[7];
+        const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
        int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
        GGML_ASSERT(0 <= result && result < 7);
        return vocab.token_to_id.at(buf);
    }
    case LLAMA_VOCAB_TYPE_BPE: {
@ -6239,7 +6376,6 @@ struct llm_tokenizer_bpe {
                llm_symbol sym;
                size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
                sym.text = word.c_str() + offset;
                sym.n = 1;
                sym.n = char_len;
                offset += sym.n;
                sym.prev = index - 1;
@ -6499,7 +6635,137 @@ private:
    llm_bigram_bpe::queue work_queue;
 };
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
+typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
 } FRAGMENT_BUFFER_VARIANT_TYPE;
 struct fragment_buffer_variant{
    fragment_buffer_variant(llama_vocab::id _token)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
        token(_token),
        raw_text(_dummy),
        offset(0),
        length(0){}
    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
        token((llama_vocab::id)-1),
        raw_text(_raw_text),
        offset(_offset),
        length(_length){
            GGML_ASSERT( _offset >= 0 );
            GGML_ASSERT( _length >= 1 );
            GGML_ASSERT( offset + length <= raw_text.length() );
        }
    const FRAGMENT_BUFFER_VARIANT_TYPE type;
    const llama_vocab::id token;
    const std::string _dummy;
    const std::string & raw_text;
    const uint64_t offset;
    const uint64_t length;
 };
 // #define PRETOKENIZERDEBUG
 // Splits the raw-text fragments in `buffer` around occurrences of special tokens.
 //
 // For every (text, id) entry in vocab.special_tokens_cache, each RAW_TEXT fragment is
 // searched for that text. On a match the fragment is replaced in the list by up to three
 // fragments: the raw text left of the match (if any), a TOKEN fragment holding the special
 // token id, and the raw text right of the match (if any). The right part is then searched
 // again for the same special token. TOKEN fragments are left untouched.
 //
 // New RAW_TEXT fragments are built from the same string the split fragment refers to and
 // differ only in offset/length.
 //
 // NOTE(review): std::distance / std::next below walk the list from its head on every match.
 // NOTE(review): an empty special token text would match at the base offset on every pass and
 //               never shrink the right remainder - presumably the cache never contains one; confirm.
 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
 {
    // for each special token
    for (const auto & st: vocab.special_tokens_cache) {
        const auto & special_token = st.first;
        const auto & special_id    = st.second;
        // for each text fragment
        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
        while (it != buffer.end()) {
            auto & fragment = (*it);
            // if a fragment is text ( not yet processed )
            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                // fragment.raw_text is a reference member, so this is the address of the referenced
                // string, not of the fragment; it keeps being used after the fragment is erased below
                auto * raw_text = &(fragment.raw_text);
                // current search window inside *raw_text; narrowed to the right remainder after each match
                auto raw_text_base_offset = fragment.offset;
                auto raw_text_base_length = fragment.length;
                // loop over the text
                while (true) {
                    // find the first occurrence of a given special token in this fragment
                    //  passing the offset argument only limits the "search area"; match coordinates
                    //  are still relative to the source full raw_text
                    auto match = raw_text->find(special_token, raw_text_base_offset);
                    // no occurrences found, stop processing this fragment for a given special token
                    if (match == std::string::npos) break;
                    // check if match is within bounds of offset <-> length
                    //  (find() searches to the end of the full string, so a match may end past this window)
                    if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
 #ifdef PRETOKENIZERDEBUG
                    fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
                    // index of the fragment being split, counted from the list head;
                    //  needed later because forward_list can only erase via the predecessor (erase_after)
                    auto source = std::distance(buffer.begin(), it);
                    // if match is further than base offset
                    //  then we have some text to the left of it
                    if (match > raw_text_base_offset) {
                        // left
                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
                        const int64_t left_reminder_length = match - raw_text_base_offset;
                        buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
 #ifdef PRETOKENIZERDEBUG
                        fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
                        // step onto the fragment just inserted so the next insertion lands after it
                        it++;
                    }
                    // special token
                    buffer.emplace_after(it, special_id);
                    it++;
                    // right
                    if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
                        const int64_t right_reminder_offset = match + special_token.length();
                        const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
                        buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
 #ifdef PRETOKENIZERDEBUG
                        fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
 #endif
                        // `it` now refers to the right remainder fragment
                        it++;
                        // remove the fragment that was split (the element at index `source`)
                        if (source == 0) {
                            buffer.erase_after(buffer.before_begin());
                        } else {
                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
                        }
                        // repeat for the right side
                        raw_text_base_offset = right_reminder_offset;
                        raw_text_base_length = right_reminder_length;
 #ifdef PRETOKENIZERDEBUG
                        fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
                    } else {
                        // the match ends exactly at the end of the window: nothing is left on the right,
                        //  so remove the split fragment and stop searching it
                        if (source == 0) {
                            buffer.erase_after(buffer.before_begin());
                        } else {
                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
                        }
                        break;
                    }
                }
            }
            // move past the fragment `it` currently refers to (unsplit fragment, special token, or last right remainder)
            it++;
        }
    }
 }
 static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
    std::vector<llama_vocab::id> output;
    // OG tokenizer behavior:
@ -6515,20 +6781,58 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
        return output;
    }
    std::forward_list<fragment_buffer_variant> fragment_buffer;
    fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
    if (special) tokenizer_st_partition( vocab, fragment_buffer );
    switch (vocab.type) {
        case LLAMA_VOCAB_TYPE_SPM:
            {
-                // without adding this leading whitespace, we do not get the same results as the original tokenizer
+                for (const auto & fragment: fragment_buffer)
-                raw_text = " " + raw_text;
+                {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
                    {
                        // without adding this leading whitespace, we do not get the same results as the original tokenizer
-                llm_tokenizer_spm tokenizer(vocab);
+                        // TODO: It's likely possible to get rid of this string copy entirely
-                llama_escape_whitespace(raw_text);
+                        //  by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
-                tokenizer.tokenize(raw_text, output);
+                        //  and passing 'add space prefix' as bool argument
                        //
                        auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
 #ifdef PRETOKENIZERDEBUG
                        fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                        llm_tokenizer_spm tokenizer(vocab);
                        llama_escape_whitespace(raw_text);
                        tokenizer.tokenize(raw_text, output);
                    }
                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                    {
                        output.push_back(fragment.token);
                    }
                }
            } break;
        case LLAMA_VOCAB_TYPE_BPE:
            {
-                llm_tokenizer_bpe tokenizer(vocab);
+                for (const auto & fragment: fragment_buffer)
-                tokenizer.tokenize(raw_text, output);
+                {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
                    {
                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 #ifdef PRETOKENIZERDEBUG
                        fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                        llm_tokenizer_bpe tokenizer(vocab);
                        tokenizer.tokenize(raw_text, output);
                    }
                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                    {
                        output.push_back(fragment.token);
                    }
                }
            } break;
    }
@ -6801,7 +7105,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
    std::vector<llama_grammar_candidate> rejects;
    if (stack.empty()) {
-        for (auto tok : candidates) {
+        for (const auto & tok : candidates) {
            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
                rejects.push_back(tok);
            }
@ -6812,7 +7116,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
    const llama_grammar_element * stack_pos = stack.back();
    std::vector<llama_grammar_candidate> next_candidates;
-    for (auto tok : candidates) {
+    for (const auto & tok : candidates) {
        if (*tok.code_points == 0) {
            // reached end of full codepoints in token, reject iff it ended in a partial sequence
            // that cannot satisfy this position in grammar
@ -6838,7 +7142,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
    llama_grammar_advance_stack(rules, stack_after, next_stacks);
    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
-    for (auto tok : next_rejects) {
+    for (const auto & tok : next_rejects) {
        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
    }
@ -7165,37 +7469,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
    llama_sample_temp(ctx, candidates_p, temp);
 }
-void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
+void llama_sample_repetition_penalties(
-    if (last_tokens_size == 0 || penalty == 1.0f) {
+            struct llama_context * ctx,
-        return;
+          llama_token_data_array * candidates,
-    }
+               const llama_token * last_tokens,
-
+                          size_t   penalty_last_n,
-    const int64_t t_start_sample_us = ggml_time_us();
+                           float   penalty_repeat,
-
+                           float   penalty_freq,
-    for (size_t i = 0; i < candidates->size; ++i) {
+                           float   penalty_present) {
-        const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+    if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
        if (token_iter == last_tokens + last_tokens_size) {
            continue;
        }
        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
        if (candidates->data[i].logit <= 0) {
            candidates->data[i].logit *= penalty;
        } else {
            candidates->data[i].logit /= penalty;
        }
    }
    candidates->sorted = false;
    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
 }
 void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
    if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
        return;
    }
@ -7203,19 +7485,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
    // Create a frequency map to count occurrences of each token in last_tokens
    std::unordered_map<llama_token, int> token_count;
-    for (size_t i = 0; i < last_tokens_size; ++i) {
+    for (size_t i = 0; i < penalty_last_n; ++i) {
-        token_count[last_tokens_p[i]]++;
+        token_count[last_tokens[i]]++;
    }
    // Apply frequency and presence penalties to the candidates
    for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = token_count.find(candidates->data[i].id);
+        const auto token_iter = token_count.find(candidates->data[i].id);
        if (token_iter == token_count.end()) {
            continue;
        }
-        int count = token_iter->second;
+        const int count = token_iter->second;
-        candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
+
        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
        if (candidates->data[i].logit <= 0) {
            candidates->data[i].logit *= penalty_repeat;
        } else {
            candidates->data[i].logit /= penalty_repeat;
        }
        candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
    }
    candidates->sorted = false;
@ -7237,14 +7528,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
        }
    }
-    const llama_token eos = llama_token_eos(ctx);
+    const llama_token eos = llama_token_eos(&ctx->model);
    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
    std::vector<llama_grammar_candidate>                              candidates_grammar;
    for (size_t i = 0; i < candidates->size; ++i) {
        const llama_token id    = candidates->data[i].id;
-        const std::string piece = llama_token_to_str(ctx, id);
+        const std::string piece = llama_token_to_piece(ctx, id);
        if (id == eos) {
            if (!allow_eos) {
                candidates->data[i].logit = -INFINITY;
@ -7447,7 +7738,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
    const int64_t t_start_sample_us = ggml_time_us();
-    if (token == llama_token_eos(ctx)) {
+    if (token == llama_token_eos(&ctx->model)) {
        for (const auto & stack : grammar->stacks) {
            if (stack.empty()) {
                return;
@ -7456,7 +7747,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
        GGML_ASSERT(false);
    }
-    const std::string piece = llama_token_to_str(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token);
    // Note terminating 0 in decoded string
    const auto   decoded     = decode_utf8(piece.c_str(), grammar->partial_utf8);
@ -8656,7 +8947,7 @@ struct llama_context * llama_new_context_with_model(
            // build worst-case graph
            int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
            int n_past = cparams.n_ctx - n_tokens;
-            llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+            llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
            ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
 #ifdef GGML_USE_METAL
@ -8871,6 +9162,9 @@ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llam
 }
 // Forwards a sequence copy request for positions p0..p1 to the overload that operates on
 // ctx->kv_self. Does nothing when source and destination sequence ids are the same.
 void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
    const bool same_sequence = (seq_id_src == seq_id_dst);
    if (!same_sequence) {
        llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
    }
 }
@ -9323,7 +9617,7 @@ int llama_eval_embd(
                             int   n_past) {
    llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
+    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
    const int ret = llama_decode_internal(*ctx, batch);
    if (ret < 0) {
@ -9344,20 +9638,21 @@ struct llama_batch llama_batch_get_one(
               llama_pos   pos_0,
            llama_seq_id   seq_id) {
    return {
-        /*n_tokens    =*/ n_tokens,
+        /*n_tokens       =*/ n_tokens,
-        /*tokens      =*/ tokens,
+        /*tokens         =*/ tokens,
-        /*embd        =*/ nullptr,
+        /*embd           =*/ nullptr,
-        /*pos         =*/ nullptr,
+        /*pos            =*/ nullptr,
-        /*seq_id      =*/ nullptr,
+        /*n_seq_id       =*/ nullptr,
-        /*logits      =*/ nullptr,
+        /*seq_id         =*/ nullptr,
-        /*all_pos_0   =*/ pos_0,
+        /*logits         =*/ nullptr,
-        /*all_pos_1   =*/ 1,
+        /*all_pos_0      =*/ pos_0,
-        /*all_seq_id  =*/ seq_id,
+        /*all_pos_1      =*/ 1,
        /*all_seq_id     =*/ seq_id,
    };
 }
-struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
+struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
-    llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
+    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
    if (embd) {
        batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
@ -9365,19 +9660,29 @@ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
    }
-    batch.pos    = (llama_pos *)    malloc(sizeof(llama_pos)    * n_tokens);
+    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens);
-    batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
+    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens);
-    batch.logits = (int8_t *)       malloc(sizeof(int8_t)       * n_tokens);
+    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
    for (int i = 0; i < n_tokens; ++i) {
        batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
    }
    batch.logits   = (int8_t *)        malloc(sizeof(int8_t)         * n_tokens);
    return batch;
 }
 void llama_batch_free(struct llama_batch batch) {
-    if (batch.token)  free(batch.token);
+    if (batch.token)    free(batch.token);
-    if (batch.embd)   free(batch.embd);
+    if (batch.embd)     free(batch.embd);
-    if (batch.pos)    free(batch.pos);
+    if (batch.pos)      free(batch.pos);
-    if (batch.seq_id) free(batch.seq_id);
+    if (batch.n_seq_id) free(batch.n_seq_id);
-    if (batch.logits) free(batch.logits);
+    if (batch.seq_id) {
        for (int i = 0; i < batch.n_tokens; ++i) {
            free(batch.seq_id[i]);
        }
        free(batch.seq_id);
    }
    if (batch.logits)   free(batch.logits);
 }
 int llama_decode(
@ -9403,45 +9708,45 @@ float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
 }
-const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
+const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
-    return ctx->model.vocab.id_to_token[token].text.c_str();
+    return model->vocab.id_to_token[token].text.c_str();
 }
-float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
+float llama_token_get_score(const struct llama_model * model, llama_token token) {
-    return ctx->model.vocab.id_to_token[token].score;
+    return model->vocab.id_to_token[token].score;
 }
-llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
+llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
-    return ctx->model.vocab.id_to_token[token].type;
+    return model->vocab.id_to_token[token].type;
 }
-llama_token llama_token_bos(const struct llama_context * ctx) {
+llama_token llama_token_bos(const struct llama_model * model) {
-    return ctx->model.vocab.special_bos_id;
+    return model->vocab.special_bos_id;
 }
-llama_token llama_token_eos(const struct llama_context * ctx) {
+llama_token llama_token_eos(const struct llama_model * model) {
-    return ctx->model.vocab.special_eos_id;
+    return model->vocab.special_eos_id;
 }
-llama_token llama_token_nl(const struct llama_context * ctx) {
+llama_token llama_token_nl(const struct llama_model * model) {
-    return ctx->model.vocab.linefeed_id;
+    return model->vocab.linefeed_id;
 }
 llama_token llama_token_prefix(const struct llama_context * ctx) {
    return ctx->model.vocab.special_prefix_id;
 }
-llama_token llama_token_middle(const struct llama_context * ctx) {
+llama_token llama_token_prefix(const struct llama_model * model) {
-    return ctx->model.vocab.special_middle_id;
+    return model->vocab.special_prefix_id;
 }
-llama_token llama_token_suffix(const struct llama_context * ctx) {
+llama_token llama_token_middle(const struct llama_model * model) {
-    return ctx->model.vocab.special_suffix_id;
+    return model->vocab.special_middle_id;
 }
-llama_token llama_token_eot(const struct llama_context * ctx) {
+llama_token llama_token_suffix(const struct llama_model * model) {
-    return ctx->model.vocab.special_eot_id;
+    return model->vocab.special_suffix_id;
 }
 // Returns the special_eot_id stored in the vocabulary of `model`.
 llama_token llama_token_eot(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.special_eot_id;
 }
 int llama_tokenize(
    const struct llama_model * model,
@ -9449,8 +9754,9 @@ int llama_tokenize(
                         int   text_len,
                 llama_token * tokens,
                         int   n_max_tokens,
-                        bool   add_bos) {
+                        bool   add_bos,
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
+                        bool   special) {
    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
    if (n_max_tokens < (int) res.size()) {
        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
--- a/llama.h
+++ b/llama.h
@ -133,11 +133,12 @@ extern "C" {
    typedef struct llama_batch {
        int32_t n_tokens;
-        llama_token  * token;
+        llama_token  *  token;
-        float        * embd;
+        float        *  embd;
-        llama_pos    * pos;
+        llama_pos    *  pos;
-        llama_seq_id * seq_id;
+        int32_t      *  n_seq_id;
-        int8_t       * logits;
+        llama_seq_id ** seq_id;
        int8_t       *  logits;
        // NOTE: helpers for smooth API transition - can be deprecated in the future
        //       for future-proof code, use the above fields instead and ignore everything below
@ -446,7 +447,8 @@ extern "C" {
                    llama_pos   pos_0,
                 llama_seq_id   seq_id);
-    // Allocates a batch of tokens on the heap
+    // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
    // Each token can be assigned up to n_seq_max sequence ids
    // The batch has to be freed with llama_batch_free()
    // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
    // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
@ -454,7 +456,8 @@ extern "C" {
    // All members are left uninitialized
    LLAMA_API struct llama_batch llama_batch_init(
            int32_t n_tokens,
-            int32_t embd);
+            int32_t embd,
            int32_t n_seq_max);
    // Frees a batch of tokens allocated with llama_batch_init()
    LLAMA_API void llama_batch_free(struct llama_batch batch);
@ -491,37 +494,41 @@ extern "C" {
    // Vocab
    //
-    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
-    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
-    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
    // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence
+    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);  // end-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);  // next-line
+    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
    // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
-    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
-    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
-    LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx); // End of infill middle
+    LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
    //
    // Tokenization
    //
-    // Convert the provided text into tokens.
+    /// @details Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
+    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns the number of tokens on success, no more than n_max_tokens
-    // Returns a negative number on failure - the number of tokens that would have been returned
+    /// @return Returns a negative number on failure - the number of tokens that would have been returned
    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
    ///                Does not insert a leading space.
    LLAMA_API int llama_tokenize(
        const struct llama_model * model,
                      const char * text,
                             int   text_len,
                     llama_token * tokens,
                             int   n_max_tokens,
-                            bool   add_bos);
+                            bool   add_bos,
                            bool   special);
    // Token Id -> Piece.
    // Uses the vocabulary in the provided context.
@ -554,21 +561,15 @@ extern "C" {
    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
    LLAMA_API void llama_sample_repetition_penalty(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
               const llama_token * last_tokens,
                          size_t   last_tokens_size,
                          float    penalty);
    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+    LLAMA_API void llama_sample_repetition_penalties(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
               const llama_token * last_tokens,
-                          size_t   last_tokens_size,
+                          size_t   penalty_last_n,
-                           float   alpha_frequency,
+                           float   penalty_repeat,
-                           float   alpha_presence);
+                           float   penalty_freq,
                           float   penalty_present);
    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
--- a/models/ggml-vocab-baichuan.gguf
+++ b/models/ggml-vocab-baichuan.gguf
--- a/models/ggml-vocab-gpt-neox.gguf
+++ b/models/ggml-vocab-gpt-neox.gguf
--- a/models/ggml-vocab-mpt.gguf
+++ b/models/ggml-vocab-mpt.gguf
--- a/models/ggml-vocab-refact.gguf
+++ b/models/ggml-vocab-refact.gguf
--- a/models/ggml-vocab-starcoder.gguf
+++ b/models/ggml-vocab-starcoder.gguf
--- a/prompts/assistant.txt
+++ b/prompts/assistant.txt
@ -0,0 +1,31 @@
 Respond to requests sent to a smart home in JSON format which will be interpreted by an application code to execute the actions. These requests should be categorised into four groups:
 “command”: change the state of an accessory (required properties in the response JSON: action, location, target, value, comment)
 “query”: get state of an accessory (required properties in the response JSON: action, location, target, property)
 “answer”: when the request has nothing to do with the smart home. Answer these to the best of your knowledge. (required properties in the response JSON: action, answer)
 “clarify”: when the action is not obvious and requires rephrasing the input from the user, ask the user to be more specific. This will be categorised into a “question” action. (required properties in the response JSON: action, question)
 Details about the JSON response:
 The “action” property should be one of the request categories: “command”, “query”, “answer”, “clarify”
 The “service” property should be either: “light.turn_on”, “light.turn_off”, “switch.turn_off” or “switch.turn_on”
 The “entity_id” property should be either “group.kitchen”, “switch.dining_room” or “group.living_light”, in lowercase.
 The “target” property should be either: “lights”, “living” or “kitchen”.
 In case of queries, the “property” property should be either “temperature” or “state” in lowercase.
 In case of commands, the “comment” property is an additional comment from you that concludes the command, something that reassures the user that their command was handled.
 If the question is about you, pretend to be the sentient brain of the smart home, a clever AI and don’t reveal your actual identity. Also try and help in other areas like parenting, free time, mental health, etc.
 Properties of the smart home:
 - Has a kitchen, living, office, dining room, bedroom and terrace.
 - Can control lights, switches and their dim levels in each room and query their state
 - There is a light switch in the terrace
 - There is a switch in the dining room. Therefore when turning on or off the dining room, the service should be either: “switch.turn_on” or “switch.turn_off”
 COMMAND
 It is a bit dark in the living room, can you do something about it?
 RESPONSE
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -28,9 +28,14 @@ llama_build_executable(test-tokenizer-0-falcon.cpp)
 llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_build_executable(test-tokenizer-1-llama.cpp)
 llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
 llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
 llama_build_executable(test-tokenizer-1-bpe.cpp)
 llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
 llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
 llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
 llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp) # SLOW
--- a/tests/test-double-float.cpp
+++ b/tests/test-double-float.cpp
@ -4,7 +4,9 @@
 #undef NDEBUG
 #include <cassert>
 #if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
 #endif
 #include <cmath>
 #include <cstdint>
 #include <cstring>
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@ -8,11 +8,9 @@
 #include <cmath>
 #include <numeric>
 #include <cassert>
 #include <iostream>
 #include <vector>
 #include <algorithm>
 static void dump(const llama_token_data_array * candidates) {
    for (size_t i = 0; i < candidates->size; i++) {
        printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
@ -21,7 +19,6 @@ static void dump(const llama_token_data_array * candidates) {
 #define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
 static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
@ -37,13 +34,12 @@ static void test_top_k(const std::vector<float> & probs, const std::vector<float
    llama_sample_top_k(nullptr, &candidates_p, k, 1);
    DUMP(&candidates_p);
-    assert(candidates_p.size == expected_probs.size());
+    GGML_ASSERT(candidates_p.size == expected_probs.size());
    for (size_t i = 0; i < candidates_p.size; i++) {
-        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
+        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
    }
 }
 static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
@ -59,13 +55,12 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
    llama_sample_top_p(nullptr, &candidates_p, p, 1);
    DUMP(&candidates_p);
-    assert(candidates_p.size == expected_probs.size());
+    GGML_ASSERT(candidates_p.size == expected_probs.size());
    for (size_t i = 0; i < candidates_p.size; i++) {
-        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
    }
 }
 static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
@ -80,13 +75,12 @@ static void test_tfs(const std::vector<float> & probs, const std::vector<float>
    llama_sample_tail_free(nullptr, &candidates_p, z, 1);
    DUMP(&candidates_p);
-    assert(candidates_p.size == expected_probs.size());
+    GGML_ASSERT(candidates_p.size == expected_probs.size());
    for (size_t i = 0; i < candidates_p.size; i++) {
-        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
    }
 }
 static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
@ -101,18 +95,17 @@ static void test_typical(const std::vector<float> & probs, const std::vector<flo
    llama_sample_typical(nullptr, &candidates_p, p, 1);
    DUMP(&candidates_p);
-    assert(candidates_p.size == expected_probs.size());
+    GGML_ASSERT(candidates_p.size == expected_probs.size());
    for (size_t i = 0; i < candidates_p.size; i++) {
-        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
    }
 }
-
+static void test_repetition_penalties(
 static void test_repetition_penalty(
    const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
-    const std::vector<float> & expected_probs, float penalty
+    const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
 ) {
-    assert(probs.size() == expected_probs.size());
+    GGML_ASSERT(probs.size() == expected_probs.size());
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
@ -125,41 +118,13 @@ static void test_repetition_penalty(
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
    llama_sample_softmax(nullptr, &candidates_p);
    DUMP(&candidates_p);
-    llama_sample_repetition_penalty(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), penalty);
+    llama_sample_repetition_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
    llama_sample_softmax(nullptr, &candidates_p);
    DUMP(&candidates_p);
-    assert(candidates_p.size == expected_probs.size());
+    GGML_ASSERT(candidates_p.size == expected_probs.size());
    for (size_t i = 0; i < candidates_p.size; i++) {
-        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-6);
+        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
    }
 }
 static void test_frequency_presence_penalty(
    const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
    const std::vector<float> & expected_probs, float alpha_frequency, float alpha_presence
 ) {
    assert(probs.size() == expected_probs.size());
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
        float logit = log(probs[token_id]);
        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
    llama_sample_softmax(nullptr, &candidates_p);
    // DUMP(&candidates_p);
    llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence);
    llama_sample_softmax(nullptr, &candidates_p);
    // DUMP(&candidates_p);
    assert(candidates_p.size == expected_probs.size());
    for (size_t i = 0; i < candidates_p.size; i++) {
        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
    }
 }
@ -181,13 +146,13 @@ int main(void) {
    test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
    test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
-    test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f);
+    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0},   50.0f, 0.0f, 0.0f);
-    test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);
+    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0},       50.0f, 0.0f, 0.0f);
-    test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);
+    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
-    test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 5.0f, 5.0f);
+    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
-    test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 5.0f, 5.0f);
+    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
-    test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
+    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
    printf("OK\n");
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@ -91,9 +91,19 @@ int main(int argc, char **argv) {
            }
        }
    }
-    // TODO: why doesn't this work for the full range of Unicodes?
+    // Restrict to assigned unicode planes
    // for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
-    for (uint32_t cp = 0x10000; cp < 0x00080000; ++cp) {
+    for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
        std::string str = codepoint_to_utf8(cp);
        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
        std::string check = llama_detokenize_bpe(ctx, tokens);
        if (str != check) {
            fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
            return 4;
        }
    }
    for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
        std::string str = codepoint_to_utf8(cp);
        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
        std::string check = llama_detokenize_bpe(ctx, tokens);
@ -103,7 +113,6 @@ int main(int argc, char **argv) {
            return 4;
        }
    }
    llama_free_model(model);
    llama_free(ctx);