Merge branch 'ggerganov:master' into iq2_s

2024-03-17 11:23:09 +05:30 · 2024-03-17 11:23:09 +05:30 · f3a3ea1ff3
commit f3a3ea1ff3
parent 87e5c86686 c47cf414ef
39 changed files with 1928 additions and 496 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -333,6 +333,7 @@ jobs:
          mkdir build
          cd build
          cmake -G Xcode .. \
+            -DLLAMA_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
@ -361,6 +362,7 @@ jobs:
          mkdir build
          cd build
          cmake -G Xcode .. \
+            -DLLAMA_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@ -0,0 +1,22 @@
+name: Close inactive issues
+on:
+  schedule:
+    - cron: "42 0 * * *"
+
+jobs:
+  close-issues:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v5
+        with:
+          days-before-issue-stale: 30
+          days-before-issue-close: 14
+          stale-issue-label: "stale"
+          stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
+          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
+          days-before-pr-stale: -1
+          days-before-pr-close: -1
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@ -25,17 +25,14 @@ jobs:
    strategy:
      matrix:
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
+        build_type: [Debug]
        include:
          - build_type: Release
            sanitizer: ""
-        exclude:
-          - build_type: Release
-            sanitizer: ADDRESS
-          - build_type: Release
+          - build_type: Debug
            sanitizer: THREAD
-          - build_type: Release
-            sanitizer: UNDEFINED
+            disabled_on_pr: true
+      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken

    container:
      image: ubuntu:latest
@ -81,13 +78,14 @@ jobs:

      - name: Tests
        id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh

      - name: Slow tests
        id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }}
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
@ -124,13 +122,14 @@ jobs:

      - name: Tests
        id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

      - name: Slow tests
        id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }}
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
          cd examples/server/tests
          behave.exe --stop --no-skipped --no-capture --tags slow
--- a/.gitignore
+++ b/.gitignore
@ -25,6 +25,8 @@
 .vscode/
 .idea/

+ggml-metal-embed.metal
+
 lcov-report/
 gcovr-report/

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -200,9 +200,6 @@ if (LLAMA_METAL)
        add_compile_definitions(GGML_METAL_NDEBUG)
    endif()

-    # get full path to the file
-    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
-
    # copy ggml-common.h and ggml-metal.metal to bin directory
    configure_file(ggml-common.h    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h    COPYONLY)
    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
@ -211,53 +208,62 @@ if (LLAMA_METAL)
        enable_language(ASM)
        add_compile_definitions(GGML_METAL_EMBED_LIBRARY)

+        set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h")
        set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
+
        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
-        set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
+
+        # merge ggml-common.h and ggml-metal.metal into a single file
+        set(METALLIB_EMBED_ASM    "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
+        set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")

        add_custom_command(
-            OUTPUT ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
-            DEPENDS ${METALLIB_SOURCE}
+            OUTPUT ${METALLIB_EMBED_ASM}
+            COMMAND echo "Embedding Metal library"
+            COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED}
+            COMMAND echo ".section __DATA,__ggml_metallib"          >  ${METALLIB_EMBED_ASM}
+            COMMAND echo ".globl _ggml_metallib_start"              >> ${METALLIB_EMBED_ASM}
+            COMMAND echo "_ggml_metallib_start:"                    >> ${METALLIB_EMBED_ASM}
+            COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
+            COMMAND echo ".globl _ggml_metallib_end"                >> ${METALLIB_EMBED_ASM}
+            COMMAND echo "_ggml_metallib_end:"                      >> ${METALLIB_EMBED_ASM}
+            DEPENDS ggml-metal.metal ggml-common.h
            COMMENT "Generate assembly for embedded Metal library"
        )

-        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
-    endif()
-
-    if (LLAMA_METAL_SHADER_DEBUG)
-        # custom command to do the following:
-        #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
-        #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
-        #
-        # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
-        #       disabling fast math is needed in order to pass tests/test-backend-ops
-        # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
-        # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
-        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
-        set(XC_FLAGS -fno-fast-math -fno-inline -g)
-        if (LLAMA_QKK_64)
-            set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
+        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
+    else()
+        if (LLAMA_METAL_SHADER_DEBUG)
+            # custom command to do the following:
+            #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
+            #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
+            #
+            # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
+            #       disabling fast math is needed in order to pass tests/test-backend-ops
+            # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
+            # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
+            #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
+            set(XC_FLAGS -fno-fast-math -fno-inline -g)
+        else()
+            set(XC_FLAGS -O3)
        endif()

        add_custom_command(
            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
            COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DEPENDS ggml-metal.metal
+            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
+            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
+            DEPENDS ggml-metal.metal ggml-common.h
            COMMENT "Compiling Metal kernels"
-        )
+            )

        add_custom_target(
            ggml-metal ALL
            DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-        )
-    endif()
+            )
+    endif() # LLAMA_METAL_EMBED_LIBRARY

    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
        ${FOUNDATION_LIBRARY}
--- a/17
+++ b/17
@ -553,19 +553,20 @@ endif
 endif # LLAMA_METAL

 ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
+ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@

 ifdef LLAMA_METAL_EMBED_LIBRARY
-ggml-metal-embed.o: ggml-metal.metal
+ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 	@echo "Embedding Metal library"
+	@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
 	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
-	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+	@echo ".section __DATA, __ggml_metallib"   >  $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start"        >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:"              >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end"          >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:"                >> $(TEMP_ASSEMBLY)
 	@$(AS) $(TEMP_ASSEMBLY) -o $@
 	@rm -f ${TEMP_ASSEMBLY}
 endif
--- a/README.md
+++ b/README.md
@ -112,6 +112,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)

 **Multimodal models:**

@ -133,6 +134,7 @@ Typically finetunes of the base models below are supported as well.
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
+- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@ -37,10 +37,13 @@ extern char const *LLAMA_COMMIT;
 extern char const *LLAMA_COMPILER;
 extern char const *LLAMA_BUILD_TARGET;

+struct llama_control_vector_load_info;
+
+int32_t get_num_physical_cores();
+
 //
 // CLI argument parsing
 //
-int32_t get_num_physical_cores();

 struct gpt_params {
    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
@ -103,6 +106,11 @@ struct gpt_params {
    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
    std::string lora_base  = "";                              // base model path for the lora adapter

+    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+
+    int32_t control_vector_layer_start = -1; // layer range for control vector
+    int32_t control_vector_layer_end   = -1; // layer range for control vector
+
    int  ppl_stride        = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                    //                                       (which is more convenient to use for plotting)
@ -268,3 +276,25 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40

 void llama_embd_normalize(const float * inp, float * out, int n);

+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+
+//
+// Control vector utils
+//
+
+struct llama_control_vector_data {
+    int n_embd;
+
+    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
+    std::vector<float> data;
+};
+
+struct llama_control_vector_load_info {
+    float strength;
+
+    std::string fname;
+};
+
+// Load control vectors, scale each by strength, and add them together.
+// On error, returns {-1, empty}
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -1965,6 +1965,23 @@ class MambaModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


+@Model.register("CohereForCausalLM")
+class CommandR2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COMMAND_R
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # max_position_embeddings = 8192 in config.json but model was actually
+        # trained on 128k context length
+        self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
 ###### CONVERSION LOGIC ######


--- a/convert.py
+++ b/convert.py
@ -332,6 +332,9 @@ class Params:
 #

 class BpeVocab:
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
        if isinstance(self.bpe_tokenizer.get('model'), dict):
@ -390,6 +393,9 @@ class BpeVocab:


 class SentencePieceVocab:
+    tokenizer_model = "llama"
+    name = "spm"
+
    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
        added_tokens: dict[str, int]
@ -453,6 +459,9 @@ class SentencePieceVocab:


 class HfVocab:
+    tokenizer_model = "llama"
+    name = "hfft"
+
    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
        try:
            from transformers import AutoTokenizer
@ -553,7 +562,15 @@ class HfVocab:
        return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
+class NoVocab:
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"


 #
@ -935,8 +952,10 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
    # Handle special case where the model's vocab size is not set
    if params.n_vocab == -1:
        raise ValueError(
-            f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
+            f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
        )
+    if isinstance(vocab, NoVocab):
+        return  # model has no vocab

    # Check for a vocab size mismatch
    if params.n_vocab == vocab.vocab_size:
@ -977,6 +996,7 @@ class OutputFile:
            name = str(params.path_model.parent).split('/')[-1]

        self.gguf.add_name                (name)
+        self.gguf.add_vocab_size          (params.n_vocab)
        self.gguf.add_context_length      (params.n_ctx)
        self.gguf.add_embedding_length    (params.n_embd)
        self.gguf.add_block_count         (params.n_layer)
@ -1013,21 +1033,9 @@ class OutputFile:
        if params.ftype is not None:
            self.gguf.add_file_type(params.ftype)

-    def handle_tokenizer_model(self, vocab: Vocab) -> str:
-        # Map the vocab types to the supported tokenizer models
-        tokenizer_model = {
-            SentencePieceVocab: "llama",
-            HfVocab: "llama",
-            BpeVocab: "gpt2",
-        }.get(type(vocab))
-
-        # Block if vocab type is not predefined
-        if tokenizer_model is None:
-            raise ValueError("Unknown vocab type: Not supported")
-
-        return tokenizer_model
-
    def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
+        assert not isinstance(vocab, NoVocab)
+
        tokens = []
        scores = []
        toktypes = []
@ -1043,11 +1051,8 @@ class OutputFile:
        return tokens, scores, toktypes

    def add_meta_vocab(self, vocab: Vocab) -> None:
-        # Handle the tokenizer model
-        tokenizer_model = self.handle_tokenizer_model(vocab)
-
        # Ensure that tokenizer_model is added to the GGUF model
-        self.gguf.add_tokenizer_model(tokenizer_model)
+        self.gguf.add_tokenizer_model(vocab.tokenizer_model)

        # Extract model vocabulary for model conversion
        tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
@ -1074,6 +1079,26 @@ class OutputFile:
    def write_tensor_info(self) -> None:
        self.gguf.write_ti_data_to_file()

+    def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
+        if ftype == GGMLFileType.MostlyQ8_0:
+            ndarrays = bounded_parallel_map(
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+                use_processpool_executor=True,
+            )
+        else:
+            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+        start = time.time()
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            elapsed = time.time() - start
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            print(
+                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
+            )
+            self.gguf.write_tensor_data(ndarray)
+
    def close(self) -> None:
        self.gguf.close()

@ -1082,7 +1107,7 @@ class OutputFile:
        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
    ) -> None:
-        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

@ -1120,8 +1145,11 @@ class OutputFile:

        # meta data
        of.add_meta_arch(params)
-        of.add_meta_vocab(vocab)
-        of.add_meta_special_vocab(svocab)
+        if isinstance(vocab, NoVocab):
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        else:
+            of.add_meta_vocab(vocab)
+            of.add_meta_special_vocab(svocab)

        # tensor info
        for name, lazy_tensor in model.items():
@ -1131,24 +1159,7 @@ class OutputFile:
        of.write_tensor_info()

        # tensor data
-        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
-        if ftype == GGMLFileType.MostlyQ8_0:
-            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
-                use_processpool_executor=True,
-            )
-        else:
-            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
-
-        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-            elapsed = time.time() - start
-            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
-            padi = len(str(len(model)))
-            print(
-                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
-            )
-            of.gguf.write_tensor_data(ndarray)
+        of.write_tensor_data(ftype, model, concurrency)

        of.close()

@ -1309,8 +1320,8 @@ class VocabFactory:
                return vtype, path
        raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")

-    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
-        load_merges = vocabtype == "bpe"
+    def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
+        load_merges = vocab.name == "bpe"
        n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
        return gguf.SpecialVocab(
            model_parent_path,
@ -1319,30 +1330,34 @@ class VocabFactory:
            n_vocab=n_vocab,
        )

-    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
        vocab_type, path = self._select_file(vocab_types)
        print(f"Loading vocab file {path!r}, type {vocab_type!r}")

        added_tokens_path = path.parent / "added_tokens.json"
-        vocab: Vocab
        if vocab_type == "bpe":
-            vocab = BpeVocab(
+            return BpeVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
            )
-        elif vocab_type == "spm":
-            vocab = SentencePieceVocab(
+        if vocab_type == "spm":
+            return SentencePieceVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
            )
-        elif vocab_type == "hfft":
-            vocab = HfVocab(
+        if vocab_type == "hfft":
+            return HfVocab(
                path.parent, added_tokens_path if added_tokens_path.exists() else None
            )
+        raise ValueError(vocab_type)
+
+    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        vocab: Vocab
+        if len(vocab_types) == 1 and "no_vocab" in vocab_types:
+            vocab = NoVocab()
        else:
-            raise ValueError(vocab_type)
+            vocab = self._create_vocab_by_path(vocab_types)
        # FIXME: Respect --vocab-dir?
        special_vocab = self._create_special_vocab(
            vocab,
-            vocab_type,
            model_parent_path,
        )
        return vocab, special_vocab
@ -1380,6 +1395,7 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--dump",         action="store_true",    help="don't convert, just show what's in the model")
    parser.add_argument("--dump-single",  action="store_true",    help="don't convert, just show what's in a single model file")
    parser.add_argument("--vocab-only",   action="store_true",    help="extract only the vocab")
+    parser.add_argument("--no-vocab",     action="store_true",    help="store model without the vocab")
    parser.add_argument("--outtype",      choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
    parser.add_argument("--vocab-dir",    type=Path,              help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--vocab-type",                           help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
@ -1392,6 +1408,10 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")

    args = parser.parse_args(args_in)
+    if args.no_vocab:
+        if args.vocab_only:
+            raise ValueError("no need to specify --vocab-only if using --no-vocab")
+        args.vocab_type = "no_vocab"

    if args.dump_single:
        model_plus = lazy_load_file(args.model)
@ -1442,7 +1462,7 @@ def main(args_in: list[str] | None = None) -> None:
        print(f"Wrote {outfile}")
        return

-    if model_plus.vocab is not None and args.vocab_dir is None:
+    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
        vocab = model_plus.vocab

    print(f"Vocab info: {vocab}")
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -112,13 +112,20 @@ int main(int argc, char ** argv) {
    // tokenize the prompts and trim
    std::vector<std::vector<int32_t>> inputs;
    for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false);
        if (inp.size() > n_batch) {
            inp.resize(n_batch);
        }
        inputs.push_back(inp);
    }

+    // add eos if not present
+    for (auto & inp : inputs) {
+        if (inp.empty() || inp.back() != llama_token_eos(model)) {
+            inp.push_back(llama_token_eos(model));
+        }
+    }
+
    // tokenization stats
    if (params.verbose_prompt) {
        for (int i = 0; i < (int) inputs.size(); i++) {
@ -167,15 +174,26 @@ int main(int argc, char ** argv) {
    float * out = emb + p * n_embd;
    batch_decode(ctx, batch, out, s, n_embd);

-    // print first 3 embeddings
-    for (int j = 0; j < std::min(3, n_prompts); j++) {
-        fprintf(stderr, "embedding %d: ", j);
-        for (int i = 0; i < n_embd; i++) {
-            fprintf(stderr, "%f ", emb[j * n_embd + i]);
+    // print the first part of the embeddings
+    fprintf(stdout, "\n");
+    for (int j = 0; j < n_prompts; j++) {
+        fprintf(stdout, "embedding %d: ", j);
+        for (int i = 0; i < std::min(16, n_embd); i++) {
+            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
        }
-        fprintf(stderr, "\n\n");
+        fprintf(stdout, "\n");
+    }
+
+    // print cosine similarity matrix
+    fprintf(stdout, "\n");
+    printf("cosine similarity matrix:\n\n");
+    for (int i = 0; i < n_prompts; i++) {
+        for (int j = 0; j < n_prompts; j++) {
+            float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+            fprintf(stdout, "%6.2f ", sim);
+        }
+        fprintf(stdout, "\n");
    }
-    fprintf(stderr, "\n");

    // clean up
    llama_print_timings(ctx);
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@ -211,6 +211,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
                for (int j = 0; j < ggml_nelements(cur); ++j) {
                    if (data[j] != 100 + i) {
                        fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                        gguf_free(ctx);
                        return false;
                    }
                }
--- a/examples/gritlm/README.md
+++ b/examples/gritlm/README.md
@ -0,0 +1,62 @@
+## Generative Representational Instruction Tuning (GRIT) Example
+[gritlm] a model which can generate embeddings as well as "normal" text
+generation depending on the instructions in the prompt.
+
+* Paper: https://arxiv.org/pdf/2402.09906.pdf
+
+### Retrieval-Augmented Generation (RAG) use case
+One use case for `gritlm` is to use it with RAG. If we recall how RAG works is
+that we take documents that we want to use as context, to ground the large
+language model (LLM), and we create token embeddings for them. We then store
+these token embeddings in a vector database.
+
+When we perform a query, prompt the LLM, we will first create token embeddings
+for the query and then search the vector database to retrieve the most
+similar vectors, and return those documents so they can be passed to the LLM as
+context. Then the query and the context will be passed to the LLM which will
+have to _again_ create token embeddings for the query. But because gritlm is used
+the first query can be cached and the second query tokenization generation does
+not have to be performed at all.
+
+### Running the example
+Download a Grit model:
+```console
+$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
+```
+
+Run the example using the downloaded model:
+```console
+$ ./gritlm -m gritlm-7b_q4_1.gguf
+
+Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
+Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
+Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
+Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
+
+Oh, brave adventurer, who dared to climb
+The lofty peak of Mt. Fuji in the night,
+When shadows lurk and ghosts do roam,
+And darkness reigns, a fearsome sight.
+
+Thou didst set out, with heart aglow,
+To conquer this mountain, so high,
+And reach the summit, where the stars do glow,
+And the moon shines bright, up in the sky.
+
+Through the mist and fog, thou didst press on,
+With steadfast courage, and a steadfast will,
+Through the darkness, thou didst not be gone,
+But didst climb on, with a steadfast skill.
+
+At last, thou didst reach the summit's crest,
+And gazed upon the world below,
+And saw the beauty of the night's best,
+And felt the peace, that only nature knows.
+
+Oh, brave adventurer, who dared to climb
+The lofty peak of Mt. Fuji in the night,
+Thou art a hero, in the eyes of all,
+For thou didst conquer this mountain, so bright.
+```
+
+[gritlm]: https://github.com/ContextualAI/gritlm
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@ -6,22 +6,6 @@

 // #define GRIT_DEBUG

-static float dot_product(const std::vector<float> & v1, const std::vector<float> & v2) {
-    float dot = 0.0f;
-    for (uint64_t i = 0; i < v1.size(); ++i) {
-        dot += v1[i] * v2[i];
-    }
-    return dot;
-}
-
-static float norm(const std::vector<float> & v) {
-    return std::sqrt(dot_product(v, v));
-}
-
-static float cosine_similarity(const std::vector<float> & v1, const std::vector<float> & v2) {
-    return dot_product(v1, v2) / (norm(v1) * norm(v2));
-}
-
 static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
    std::vector<std::vector<float>> result;

@ -203,10 +187,12 @@ int main(int argc, char * argv[]) {
        const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
        const std::vector<std::vector<float>> q_rep = encode(ctx, queries,   gritlm_instruction(instruction));

-        const float cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]);
-        const float cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]);
-        const float cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]);
-        const float cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]);
+        const int n_embd = llama_n_embd(mdl);
+
+        const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+        const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+        const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+        const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);

        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -8,6 +8,7 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <cstdlib>
 #include <iterator>
 #include <map>
 #include <numeric>
@ -103,6 +104,7 @@ static std::string get_cpu_info() {
                }
            }
        }
+        fclose(f);
    }
 #endif
    // TODO: other platforms
@ -1122,15 +1124,19 @@ struct sql_printer : public printer {
 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);

-    //std::vector<llama_token> tokens(n_prompt, llama_token_bos(llama_get_model(ctx)));
-    //llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0));
-    //GGML_UNUSED(n_batch);
+    const llama_model * model = llama_get_model(ctx);
+    const int32_t n_vocab = llama_n_vocab(model);
+
+    std::vector<llama_token> tokens(n_batch);

-    std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
    int n_processed = 0;

    while (n_processed < n_prompt) {
        int n_tokens = std::min(n_prompt - n_processed, n_batch);
+        tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
+        for (int i = 1; i < n_tokens; i++) {
+            tokens[i] = std::rand() % n_vocab;
+        }
        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
        n_processed += n_tokens;
    }
@ -1141,11 +1147,15 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);

-    llama_token token = llama_token_bos(llama_get_model(ctx));
+    const llama_model * model = llama_get_model(ctx);
+    const int32_t n_vocab = llama_n_vocab(model);
+
+    llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;

    for (int i = 0; i < n_gen; i++) {
        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
        llama_synchronize(ctx);
+        token = std::rand() % n_vocab;
    }
 }

--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@ -63,12 +63,20 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
 ```console
 git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 ```
-2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
+
+2) Install the required Python packages:
+
+```sh
+pip install -r examples/llava/requirements.txt
+```
+
+3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
 python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory
-3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
+
+4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
 ```console
 mkdir vit
 cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
@ -76,18 +84,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/
 curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
 ```

-4) Create the visual gguf model:
+5) Create the visual gguf model:
 ```console
 python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP

-5) Then convert the model to gguf format:
+6) Then convert the model to gguf format:
 ```console
 python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```

-6) And finally we can run the llava-cli using the 1.6 model version:
+7) And finally we can run the llava-cli using the 1.6 model version:
 ```console
 ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -995,6 +995,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        if (!new_clip->ctx_data) {
            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
            clip_free(new_clip);
+            gguf_free(ctx);
            return nullptr;
        }

@ -1002,6 +1003,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        if (!fin) {
            printf("cannot open model file for loading tensors\n");
            clip_free(new_clip);
+            gguf_free(ctx);
            return nullptr;
        }

@ -1023,6 +1025,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            if (!fin) {
                printf("%s: failed to seek for tensor %s\n", __func__, name);
                clip_free(new_clip);
+                gguf_free(ctx);
                return nullptr;
            }
            int num_bytes = ggml_nbytes(cur);
@ -1232,16 +1235,16 @@ struct clip_image_f32 * clip_image_f32_init() {

 void clip_image_u8_free(struct clip_image_u8  * img) { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
-void clip_image_u8_batch_free(struct clip_image_u8_batch  & batch) {
-    if (batch.size > 0) {
-        delete[] batch.data;
-        batch.size = 0;
+void clip_image_u8_batch_free(struct clip_image_u8_batch  * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
    }
 }
-void clip_image_f32_batch_free(struct clip_image_f32_batch  & batch) {
-    if (batch.size > 0) {
-        delete[] batch.data;
-        batch.size = 0;
+void clip_image_f32_batch_free(struct clip_image_f32_batch  * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
    }
 }

@ -1494,7 +1497,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im

 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
    bool pad_to_square = true;
    if (!ctx->has_vision_encoder) {
        printf("This gguf file seems to have no vision encoder\n");
@ -1506,11 +1509,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
        pad_to_square = false;
    }
    // free the previous res_imgs if any set
-    if (res_imgs.size > 0) {
+    if (res_imgs->size > 0) {
        clip_image_f32_batch_free(res_imgs);
    }
-    res_imgs.data = nullptr;
-    res_imgs.size = 0;
+    res_imgs->data = nullptr;
+    res_imgs->size = 0;

    // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
@ -1565,11 +1568,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
            bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
            patches.insert(patches.begin(), image_original_resize);
            // clip_image_f32_batch_init(patches.size());
-            res_imgs.size = patches.size();
-            res_imgs.data = new clip_image_f32[res_imgs.size];
+            res_imgs->size = patches.size();
+            res_imgs->data = new clip_image_f32[res_imgs->size];
            int num=0;
            for (auto& patch : patches) {
-                normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
+                normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
                num++;
            }

@ -1657,9 +1660,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
    // }
    // res_imgs.push_back(res);

-    res_imgs.size = 1;
-    res_imgs.data = new clip_image_f32[res_imgs.size];
-    res_imgs.data[0] = *res;
+    res_imgs->size = 1;
+    res_imgs->data = new clip_image_f32[res_imgs->size];
+    res_imgs->data[0] = *res;
    clip_image_f32_free(res);

    return true;
@ -1908,6 +1911,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                break;
            default:
                printf("Please use an input file in f32 or f16\n");
+                gguf_free(ctx_out);
                return false;
            }

--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@ -60,8 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();

 CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
-CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  & batch);
-CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
+CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
+CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);

@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

 /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
-CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs );
+CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );

 CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    clip_image_f32_batch img_res_v;
    img_res_v.size = 0;
    img_res_v.data = nullptr;
-    if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
+    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
        delete[] img_res_v.data;
        return false;
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@ -29,9 +29,9 @@ struct llava_image_embed {
 };

 /** sanity check for clip <-> llava embed size match */
-LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);

-LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
+LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);

 /** build an image embed from image file bytes */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@ -119,6 +119,10 @@ def step_server_metrics(context):
 def step_start_server(context):
    start_server_background(context)
    attempts = 0
+    max_attempts = 20
+    if 'GITHUB_ACTIONS' in os.environ:
+        max_attempts *= 2
+
    while True:
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
            result = sock.connect_ex((context.server_fqdn, context.server_port))
@ -126,7 +130,7 @@ def step_start_server(context):
                print("\x1b[33;46mserver started!\x1b[0m")
                return
            attempts += 1
-            if attempts > 20:
+            if attempts > max_attempts:
                assert False, "server not started"
            print(f"waiting for server to start, connect error code = {result}...")
            time.sleep(0.1)
@ -943,6 +947,9 @@ async def wait_for_health_status(context,
        print(f"Starting checking for health for expected_health_status={expected_health_status}\n")
    interval = 0.5
    counter = 0
+    if 'GITHUB_ACTIONS' in os.environ:
+        timeout *= 2
+
    async with aiohttp.ClientSession() as session:
        while True:
            async with await session.get(f'{base_url}/health', params=params) as health_response:
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@ -13,8 +13,11 @@ source /opt/intel/oneapi/setvars.sh
 #for FP32
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

-#build example/main only
+#build example/main
 #cmake --build . --config Release --target main

+#build example/llama-bench
+#cmake --build . --config Release --target llama-bench
+
 #build all binary
 cmake --build . --config Release -v
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@ -9,18 +9,28 @@ source /opt/intel/oneapi/setvars.sh

 if [ $# -gt 0 ]; then
    GGML_SYCL_DEVICE=$1
+    GGML_SYCL_SINGLE_GPU=1
 else
    GGML_SYCL_DEVICE=0
 fi
-echo "use $GGML_SYCL_DEVICE as main GPU"
+
 #export GGML_SYCL_DEBUG=1


 #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.

-#use all GPUs with same max compute units
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
+    echo "use $GGML_SYCL_DEVICE as main GPU"
+    #use signle GPU only
+    ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+else
+    #use multiple GPUs with same max compute units
+    ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+fi

 #use main GPU only
 #ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none

+#use multiple GPUs with same max compute units
+#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -711,6 +711,7 @@ static bool load_checkpoint_file(const char * filename, struct my_llama_model *

    load_checkpoint_gguf(fctx, f_ggml_ctx, model, train);

+    gguf_free(fctx);
    return true;
 }

--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -11541,6 +11541,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
    if (ggml_backend_is_cuda(event->backend)) {
        CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
    } else {
+#if 0
        // untested
        auto wait_fn = [](void * user_data) {
            ggml_backend_event_t event = (ggml_backend_event_t)user_data;
@ -11548,6 +11549,8 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
        };

        CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
+#endif
+        GGML_ASSERT(false);
    }
 }

--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -280,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
    id<MTLLibrary> metal_library;

    // load library
+    //
+    // - first check if the library is embedded
+    // - then check if the library is in the bundle
+    // - if not found, load the source and compile it
+    // - if that fails, return NULL
    {
        NSBundle * bundle = nil;
 #ifdef SWIFT_PACKAGE
@ -287,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #else
        bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
 #endif
+
        NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
-        if (libPath != nil) {
+
+#if GGML_METAL_EMBED_LIBRARY
+        const bool try_metallib = false;
+#else
+        const bool try_metallib = true;
+#endif
+
+        NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
+        if (try_metallib && path_lib != nil) {
            // pre-compiled library found
-            NSURL * libURL = [NSURL fileURLWithPath:libPath];
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
+            NSURL * libURL = [NSURL fileURLWithPath:path_lib];
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
+
            metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
            if (error) {
                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
@ -305,31 +319,34 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
            extern const char ggml_metallib_start[];
            extern const char ggml_metallib_end[];

-            NSString * src  = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
+            NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
 #else
            GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

-            NSString * sourcePath;
-            NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
+            NSString * path_source;
+            NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];

-            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil");
+            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");

-            if (ggmlMetalPathResources) {
-                sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"];
+            if (path_resource) {
+                path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
            } else {
-                sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+                path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
            }
-            if (sourcePath == nil) {
+
+            if (path_source == nil) {
                GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
-                sourcePath = @"ggml-metal.metal";
+                path_source = @"ggml-metal.metal";
            }
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
-            NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
+
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
+
+            NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
            if (error) {
                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                return NULL;
            }
-#endif
+#endif // GGML_METAL_EMBED_LIBRARY

            @autoreleasepool {
                // dictionary of preprocessor macros
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -4,9 +4,6 @@

 #include <metal_stdlib>

-#define GGML_COMMON_IMPL_METAL
-#include "ggml-common.h"
-
 using namespace metal;

 #define MAX(x, y) ((x) > (y) ? (x) : (y))
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@ -16,6 +16,7 @@
 #include <cinttypes>
 #include <cstddef>
 #include <cstdint>
+#include <cstdlib>
 #include <float.h>
 #include <limits>
 #include <stdint.h>
@ -24,10 +25,9 @@
 #include <cmath>
 #include <iostream>
 #include <fstream>
-
 #include <stdio.h>
 #include <stdlib.h>
-
+#include <regex>

 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp
 #define __dpct_noinline__ __attribute__((noinline))
 #endif

+
+std::string get_device_type_name(const sycl::device &Device) {
+    auto DeviceType = Device.get_info<sycl::info::device::device_type>();
+    switch (DeviceType) {
+    case sycl::info::device_type::cpu:
+        return "cpu";
+    case sycl::info::device_type::gpu:
+        return "gpu";
+    case sycl::info::device_type::host:
+        return "host";
+    case sycl::info::device_type::accelerator:
+        return "acc";
+    default:
+        return "unknown";
+    }
+}
+
+std::string get_device_backend_and_type(const sycl::device &device) {
+    std::stringstream device_type;
+    sycl::backend backend = device.get_backend();
+    device_type <<  backend << ":" << get_device_type_name(device);
+    return device_type.str();
+}
+
 namespace dpct
 {
    typedef sycl::queue *queue_ptr;
@ -942,17 +966,65 @@ namespace dpct

    private:
        mutable std::recursive_mutex m_mutex;
+        static bool compare_dev(sycl::device &device1, sycl::device &device2)
+        {
+            dpct::device_info prop1;
+            dpct::get_device_info(prop1, device1);
+            dpct::device_info prop2;
+            dpct::get_device_info(prop2, device2);
+            return prop1.get_max_compute_units() > prop2.get_max_compute_units();
+        }
+        static int convert_backend_index(std::string & backend) {
+            if (backend == "ext_oneapi_level_zero:gpu") return 0;
+            if (backend == "opencl:gpu") return 1;
+            if (backend == "opencl:cpu") return 2;
+            if (backend == "opencl:acc") return 3;
+            printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
+            GGML_ASSERT(false);
+        }
+        static bool compare_backend(std::string &backend1, std::string &backend2) {
+            return convert_backend_index(backend1) < convert_backend_index(backend2);
+        }
        dev_mgr()
        {
            sycl::device default_device =
                sycl::device(sycl::default_selector_v);
            _devs.push_back(std::make_shared<device_ext>(default_device));

-            std::vector<sycl::device> sycl_all_devs =
-                sycl::device::get_devices(sycl::info::device_type::all);
+            std::vector<sycl::device> sycl_all_devs;
            // Collect other devices except for the default device.
            if (default_device.is_cpu())
                _cpu_device = 0;
+
+            auto Platforms = sycl::platform::get_platforms();
+            // Keep track of the number of devices per backend
+            std::map<sycl::backend, size_t> DeviceNums;
+            std::map<std::string, std::vector<sycl::device>> backend_devices;
+
+            while (!Platforms.empty()) {
+                auto Platform = Platforms.back();
+                Platforms.pop_back();
+                auto devices = Platform.get_devices();
+                std::string backend_type = get_device_backend_and_type(devices[0]);
+                for (const auto &device : devices) {
+                    backend_devices[backend_type].push_back(device);
+                }
+            }
+
+            std::vector<std::string> keys;
+            for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
+                keys.push_back(it->first);
+            }
+            std::sort(keys.begin(), keys.end(), compare_backend);
+
+            for (auto &key : keys) {
+                std::vector<sycl::device> devs = backend_devices[key];
+                std::sort(devs.begin(), devs.end(), compare_dev);
+                for (const auto &dev : devs) {
+                    sycl_all_devs.push_back(dev);
+                }
+            }
+
            for (auto &dev : sycl_all_devs)
            {
                if (dev == default_device)
@ -3202,6 +3274,11 @@ static int g_work_group_size = 0;
 #define GGML_SYCL_MMV_Y 1
 #endif

+enum ggml_sycl_backend_gpu_mode {
+    SYCL_UNSET_GPU_MODE = -1,
+    SYCL_SINGLE_GPU_MODE = 0,
+    SYCL_MUL_GPU_MODE
+};

 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");

@ -3401,12 +3478,31 @@ class sycl_gpu_mgr {
        int work_group_size = 0;
        std::string gpus_list = "";

+        /*
+        Use all GPUs with same top max compute units
+        */
        sycl_gpu_mgr() {
            detect_sycl_gpu_list_with_max_cu();
            get_allow_gpus();
            create_context_with_gpus();
        }

+        /*
+        Only use the assigned GPU
+        */
+        sycl_gpu_mgr(int main_gpu_id) {
+            sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
+            dpct::device_info prop;
+            dpct::get_device_info(prop, device);
+            gpus.push_back(main_gpu_id);
+            devices.push_back(device);
+            work_group_size = prop.get_max_work_group_size();
+            max_compute_units = prop.get_max_compute_units();
+
+            get_allow_gpus();
+            create_context_with_gpus();
+        }
+
        void create_context_with_gpus() {
            sycl::context ctx = sycl::context(devices);
            assert(gpus.size() > 0);
@ -3422,7 +3518,7 @@ class sycl_gpu_mgr {
                gpus_list += std::to_string(gpus[i]);
                gpus_list += ",";
            }
-            if (gpus_list.length() > 2) {
+            if (gpus_list.length() > 1) {
                gpus_list.pop_back();
            }
        }
@ -3451,7 +3547,7 @@ class sycl_gpu_mgr {
                dpct::device_info prop;
                dpct::get_device_info(prop, device);
                if (max_compute_units == prop.get_max_compute_units() &&
-                    prop.get_major_version() == 1) {
+                    is_ext_oneapi_device(device)) {
                    gpus.push_back(id);
                    devices.push_back(device);
                    work_group_size = prop.get_max_work_group_size();
@ -3471,8 +3567,8 @@ class sycl_gpu_mgr {
                if (gpus[i] == id)
                    return i;
            }
-            assert(false);
-            return -1;
+            printf("miss to get device index by id=%d\n", id);
+            GGML_ASSERT(false);
        }

        int get_next_index(int id) {
@ -3481,8 +3577,16 @@ class sycl_gpu_mgr {
                if (gpus[i] == id)
                    return i;
            }
-            assert(false);
-            return -1;
+            GGML_ASSERT(false);
+        }
+
+        bool is_ext_oneapi_device(const sycl::device &dev) {
+            sycl::backend dev_backend = dev.get_backend();
+            if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
+                dev_backend == sycl::backend::ext_oneapi_cuda ||
+                dev_backend == sycl::backend::ext_oneapi_hip)
+                return true;
+            return false;
        }
 };

@ -3491,11 +3595,14 @@ static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
 static int g_main_device_id = -1;
+static bool g_ggml_backend_sycl_buffer_type_initialized = false;

 static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};

 static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};

+static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;
+
 struct sycl_device_capabilities {
    int     cc;                 // compute capability
    bool    vmm;                // virtual memory support
@ -13190,17 +13297,20 @@ bool ggml_sycl_loaded(void) {
    return g_sycl_loaded;
 }

-void print_device_detail(int id) {
+void print_device_detail(int id, sycl::device &device, std::string device_type) {
+
    dpct::device_info prop;
    SYCL_CHECK(CHECK_TRY_ERROR(
-        dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id))));
-    sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
+        dpct::get_device_info(prop, device)));
+
    std::string version;
    version += std::to_string(prop.get_major_version());
    version += ".";
    version += std::to_string(prop.get_minor_version());

-    fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id,
+    device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
+
+    fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
            prop.get_name(), version.c_str(), prop.get_max_compute_units(),
            prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
            prop.get_global_mem_size());
@ -13208,19 +13318,35 @@ void print_device_detail(int id) {

 void ggml_backend_sycl_print_sycl_devices() {
    int device_count = dpct::dev_mgr::instance().device_count();
+    std::map<std::string, size_t> DeviceNums;
    fprintf(stderr, "found %d SYCL devices:\n", device_count);
-    fprintf(stderr, "|ID| Name                                        |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n");
-    fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n");
+    fprintf(stderr, "|  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |\n");
+    fprintf(stderr, "|ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|\n");
+    fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
    for (int id = 0; id < device_count; ++id) {
-        print_device_detail(id);
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        sycl::backend backend = device.get_backend();
+        std::string backend_type = get_device_backend_and_type(device);
+        int type_id=DeviceNums[backend_type]++;
+        std::stringstream device_type;
+        device_type << "[" <<  backend_type << ":" << std::to_string(type_id) << "]";
+        print_device_detail(id, device, device_type.str());
    }
 }

 void print_gpu_device_list() {
-    fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
-            g_sycl_gpu_mgr->get_gpu_count(),
-            g_sycl_gpu_mgr->gpus_list.c_str(),
-            g_sycl_gpu_mgr->max_compute_units);
+    GGML_ASSERT(g_sycl_gpu_mgr);
+
+    char* hint=NULL;
+    if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) {
+        hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n";
+    } else {
+        hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
+    }
+    fprintf(stderr, hint,
+        g_sycl_gpu_mgr->get_gpu_count(),
+        g_sycl_gpu_mgr->gpus_list.c_str(),
+        g_sycl_gpu_mgr->max_compute_units);
 }

 int get_sycl_env(const char *env_name, int default_val) {
@ -13256,23 +13382,6 @@ void ggml_init_sycl() try {
 #else
        fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
 #endif
-        if (CHECK_TRY_ERROR(g_all_sycl_device_count =
-                            dpct::dev_mgr::instance().device_count()) != 0) {
-            initialized = true;
-            g_sycl_loaded = false;
-            return;
-        }
-        GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
-        ggml_backend_sycl_print_sycl_devices();
-
-        if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
-
-        g_device_count = g_sycl_gpu_mgr->get_gpu_count();
-        g_work_group_size = g_sycl_gpu_mgr->work_group_size;
-
-        print_gpu_device_list();
-
-        int64_t total_vram = 0;

 /* NOT REMOVE, keep it for next optimize for XMX.
 #if defined(SYCL_USE_XMX)
@ -13281,49 +13390,15 @@ void ggml_init_sycl() try {
        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
 #endif
 */
-        for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
-            g_device_caps[id].vmm = 0;
-            g_device_caps[id].device_id = -1;
-            g_device_caps[id].cc = 0;
-            g_tensor_split[id] = 0;
-            g_default_tensor_split[id] = 0;
+
+        if (CHECK_TRY_ERROR(g_all_sycl_device_count =
+                            dpct::dev_mgr::instance().device_count()) != 0) {
+            initialized = true;
+            g_sycl_loaded = false;
+            return;
        }
-
-        for (int i = 0; i < g_device_count; ++i) {
-            int device_id = g_sycl_gpu_mgr->gpus[i];
-            g_device_caps[i].vmm = 0;
-
-            dpct::device_info prop;
-            SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-                prop, dpct::dev_mgr::instance().get_device(device_id))));
-
-            g_default_tensor_split[i] = total_vram;
-            total_vram += prop.get_global_mem_size();
-
-            g_device_caps[i].cc =
-                100 * prop.get_major_version() + 10 * prop.get_minor_version();
-        }
-
-        for (int i = 0; i < g_device_count; ++i) {
-            g_default_tensor_split[i] /= total_vram;
-        }
-
-        for (int i = 0; i < g_device_count; ++i) {
-            SYCL_CHECK(ggml_sycl_set_device(i));
-
-            // create sycl streams
-            for (int is = 0; is < MAX_STREAMS; ++is) {
-                SYCL_CHECK(CHECK_TRY_ERROR(
-                    g_syclStreams[i][is] =
-                        dpct::get_current_device().create_queue(
-                            g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
-            }
-
-            const dpct::queue_ptr stream = g_syclStreams[i][0];
-            // create sycl handle
-            SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
-        }
-
+        GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
+        ggml_backend_sycl_print_sycl_devices();
        initialized = true;
        g_sycl_loaded = true;
    }
@ -13334,6 +13409,63 @@ catch (sycl::exception const &exc) {
  std::exit(1);
 }

+void ggml_init_by_gpus(int device_count) try {
+    g_device_count = device_count;
+    g_work_group_size = g_sycl_gpu_mgr->work_group_size;
+
+    int64_t total_vram = 0;
+
+    print_gpu_device_list();
+
+    for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
+        g_device_caps[id].vmm = 0;
+        g_device_caps[id].device_id = -1;
+        g_device_caps[id].cc = 0;
+        g_tensor_split[id] = 0;
+        g_default_tensor_split[id] = 0;
+    }
+
+    for (int i = 0; i < g_device_count; ++i) {
+        int device_id = g_sycl_gpu_mgr->gpus[i];
+        g_device_caps[i].vmm = 0;
+
+        dpct::device_info prop;
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(device_id))));
+
+        g_default_tensor_split[i] = total_vram;
+        total_vram += prop.get_global_mem_size();
+
+        g_device_caps[i].cc =
+            100 * prop.get_major_version() + 10 * prop.get_minor_version();
+    }
+
+    for (int i = 0; i < g_device_count; ++i) {
+        g_default_tensor_split[i] /= total_vram;
+    }
+
+    for (int i = 0; i < g_device_count; ++i) {
+        SYCL_CHECK(ggml_sycl_set_device(i));
+
+        // create sycl streams
+        for (int is = 0; is < MAX_STREAMS; ++is) {
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                g_syclStreams[i][is] =
+                    dpct::get_current_device().create_queue(
+                        g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
+        }
+
+        const dpct::queue_ptr stream = g_syclStreams[i][0];
+        // create sycl handle
+        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
 void *ggml_sycl_host_malloc(size_t size) try {
    if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
        return nullptr;
@ -16737,22 +16869,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
    /* .is_host          = */ nullptr,
 };

-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+    if (device_index>=g_device_count or device_index<0) {
+        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+            device_index, g_device_count-1);
+        GGML_ASSERT(device_index<g_device_count);
+    }
    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];

-    static bool ggml_backend_sycl_buffer_type_initialized = false;
-
-    if (!ggml_backend_sycl_buffer_type_initialized) {
+    if (!g_ggml_backend_sycl_buffer_type_initialized) {
        for (int i = 0; i < g_device_count; i++) {
            ggml_backend_sycl_buffer_types[i] = {
                /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
                /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
            };
        }
-        ggml_backend_sycl_buffer_type_initialized = true;
+        g_ggml_backend_sycl_buffer_type_initialized = true;
    }
-
-    return &ggml_backend_sycl_buffer_types[device];
+    return &ggml_backend_sycl_buffer_types[device_index];
 }

 // sycl split buffer type
@ -17504,11 +17638,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
    return g_sycl_gpu_mgr->get_index(device_id);
 }

+GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
+    return g_sycl_gpu_mgr->gpus[device_index];
+}
+
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
+    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+    fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+    if (g_sycl_gpu_mgr) {
+        delete g_sycl_gpu_mgr;
+    }
+    g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
+    g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
+    ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
+    g_ggml_backend_sycl_buffer_type_initialized = false;
+}
+
+GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
+    if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
+        return;
+    }
+
+    fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n");
+
+    if (g_sycl_gpu_mgr) {
+        delete g_sycl_gpu_mgr;
+    }
+    g_sycl_gpu_mgr = new sycl_gpu_mgr();
+    g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
+    ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
+    g_ggml_backend_sycl_buffer_type_initialized = false;
+}
+
 extern "C" int ggml_backend_sycl_reg_devices();

 int ggml_backend_sycl_reg_devices() {
-    if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
-    g_device_count = g_sycl_gpu_mgr->get_gpu_count();
+    ggml_backend_sycl_set_mul_device_mode();
    assert(g_device_count>0);
    for (int i = 0; i < g_device_count; i++) {
        int id = g_sycl_gpu_mgr->gpus[i];
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
 GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);

+// TODO: these are temporary
+//       ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
+GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml.c
+++ b/ggml.c
@ -470,6 +470,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .type_size                = sizeof(int32_t),
        .is_quantized             = false,
    },
+    [GGML_TYPE_I64] = {
+        .type_name                = "i64",
+        .blck_size                = 1,
+        .type_size                = sizeof(int64_t),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_F64] = {
+        .type_name                = "f64",
+        .blck_size                = 1,
+        .type_size                = sizeof(double),
+        .is_quantized             = false,
+        .nrows                    = 1,
+    },
    [GGML_TYPE_F32] = {
        .type_name                = "f32",
        .blck_size                = 1,
@ -918,6 +931,101 @@ inline static float vaddvq_f32(float32x4_t v) {
    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
 #endif

+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR  16
+
+#define GGML_F32x16         __m512
+#define GGML_F32x16_ZERO    _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD    _mm512_loadu_ps
+#define GGML_F32x16_STORE   _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD     _mm512_add_ps
+#define GGML_F32x16_MUL     _mm512_mul_ps
+#define GGML_F32x16_REDUCE(res, x)                                    \
+do {                                                                  \
+    int offset = GGML_F32_ARR >> 1;                                   \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    offset >>= 1;                                                     \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    offset >>= 1;                                                     \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    res = _mm512_reduce_add_ps(x[0]);                                 \
+} while (0)
+
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x16
+#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+// F16 AVX
+
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR  16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+#define GGML_F32Cx16             __m512
+#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
+
+// unlike  _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD         _mm512_add_ps
+#define GGML_F32Cx16_MUL         _mm512_mul_ps
+#define GGML_F32Cx16_REDUCE(res, x)                               \
+do {                                                              \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    res = _mm512_reduce_add_ps(x[0]);                             \
+} while (0)
+
+#define GGML_F16_VEC                GGML_F32Cx16
+#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
+
 #elif defined(__AVX__)

 #define GGML_SIMD
@ -12418,6 +12526,8 @@ static void ggml_compute_forward_alibi(
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
        case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+        case GGML_TYPE_F64:
        case GGML_TYPE_COUNT:
            {
                GGML_ASSERT(false);
@ -12504,6 +12614,8 @@ static void ggml_compute_forward_clamp(
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
        case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+        case GGML_TYPE_F64:
        case GGML_TYPE_COUNT:
            {
                GGML_ASSERT(false);
--- a/ggml.h
+++ b/ggml.h
@ -337,24 +337,24 @@ extern "C" {
    struct ggml_object;
    struct ggml_context;

+    // NOTE: always add types at the end of the enum to keep backward compatibility
    enum ggml_type {
-        GGML_TYPE_F32  = 0,
-        GGML_TYPE_F16  = 1,
-        GGML_TYPE_Q4_0 = 2,
-        GGML_TYPE_Q4_1 = 3,
+        GGML_TYPE_F32     = 0,
+        GGML_TYPE_F16     = 1,
+        GGML_TYPE_Q4_0    = 2,
+        GGML_TYPE_Q4_1    = 3,
        // GGML_TYPE_Q4_2 = 4, support has been removed
-        // GGML_TYPE_Q4_3 (5) support has been removed
-        GGML_TYPE_Q5_0 = 6,
-        GGML_TYPE_Q5_1 = 7,
-        GGML_TYPE_Q8_0 = 8,
-        GGML_TYPE_Q8_1 = 9,
-        // k-quantizations
-        GGML_TYPE_Q2_K = 10,
-        GGML_TYPE_Q3_K = 11,
-        GGML_TYPE_Q4_K = 12,
-        GGML_TYPE_Q5_K = 13,
-        GGML_TYPE_Q6_K = 14,
-        GGML_TYPE_Q8_K = 15,
+        // GGML_TYPE_Q4_3 = 5, support has been removed
+        GGML_TYPE_Q5_0    = 6,
+        GGML_TYPE_Q5_1    = 7,
+        GGML_TYPE_Q8_0    = 8,
+        GGML_TYPE_Q8_1    = 9,
+        GGML_TYPE_Q2_K    = 10,
+        GGML_TYPE_Q3_K    = 11,
+        GGML_TYPE_Q4_K    = 12,
+        GGML_TYPE_Q5_K    = 13,
+        GGML_TYPE_Q6_K    = 14,
+        GGML_TYPE_Q8_K    = 15,
        GGML_TYPE_IQ2_XXS = 16,
        GGML_TYPE_IQ2_XS  = 17,
        GGML_TYPE_IQ3_XXS = 18,
@ -363,9 +363,11 @@ extern "C" {
        GGML_TYPE_IQ3_S   = 21,
        GGML_TYPE_IQ2_S   = 22,
        GGML_TYPE_IQ4_XS  = 23,
-        GGML_TYPE_I8,
-        GGML_TYPE_I16,
-        GGML_TYPE_I32,
+        GGML_TYPE_I8      = 24,
+        GGML_TYPE_I16     = 25,
+        GGML_TYPE_I32     = 26,
+        GGML_TYPE_I64     = 27,
+        GGML_TYPE_F64     = 28,
        GGML_TYPE_COUNT,
    };

@ -383,20 +385,20 @@ extern "C" {

    // model file types
    enum ggml_ftype {
-        GGML_FTYPE_UNKNOWN     = -1,
-        GGML_FTYPE_ALL_F32     = 0,
-        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        GGML_FTYPE_UNKNOWN        = -1,
+        GGML_FTYPE_ALL_F32        = 0,
+        GGML_FTYPE_MOSTLY_F16     = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0    = 2,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1    = 3,  // except 1d tensors
        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_0    = 7,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0    = 8,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1    = 9,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K    = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K    = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K    = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K    = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K    = 14, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -32,6 +32,7 @@ class Keys:
        FILE_TYPE            = "general.file_type"

    class LLM:
+        VOCAB_SIZE            = "{arch}.vocab_size"
        CONTEXT_LENGTH        = "{arch}.context_length"
        EMBEDDING_LENGTH      = "{arch}.embedding_length"
        BLOCK_COUNT           = "{arch}.block_count"
@ -41,6 +42,7 @@ class Keys:
        EXPERT_COUNT          = "{arch}.expert_count"
        EXPERT_USED_COUNT     = "{arch}.expert_used_count"
        POOLING_TYPE          = "{arch}.pooling_type"
+        LOGIT_SCALE           = "{arch}.logit_scale"

    class Attention:
        HEAD_COUNT        = "{arch}.attention.head_count"
@ -120,6 +122,7 @@ class MODEL_ARCH(IntEnum):
    GEMMA      = auto()
    STARCODER2 = auto()
    MAMBA      = auto()
+    COMMAND_R  = auto()


 class MODEL_TENSOR(IntEnum):
@ -186,6 +189,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.GEMMA:          "gemma",
    MODEL_ARCH.STARCODER2:     "starcoder2",
    MODEL_ARCH.MAMBA:          "mamba",
+    MODEL_ARCH.COMMAND_R:      "command-r",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -578,6 +582,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.SSM_D,
        MODEL_TENSOR.SSM_OUT,
    ],
+    MODEL_ARCH.COMMAND_R: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
    # TODO
 }

@ -661,6 +677,11 @@ class GGMLQuantizationType(IntEnum):
    IQ3_S   = 21
    IQ2_S   = 22
    IQ4_XS  = 23
+    I8      = 24
+    I16     = 25
+    I32     = 26
+    I64     = 27
+    F64     = 28


 class GGUFEndian(IntEnum):
@ -727,6 +748,11 @@ GGML_QUANT_SIZES = {
    GGMLQuantizationType.IQ3_S:   (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
    GGMLQuantizationType.IQ2_S:   (256, 2 + QK_K // 4 + QK_K // 16),
    GGMLQuantizationType.IQ4_XS:  (256, 2 + 2 + QK_K // 2 + QK_K // 64),
+    GGMLQuantizationType.I8:      (1, 1),
+    GGMLQuantizationType.I16:     (1, 2),
+    GGMLQuantizationType.I32:     (1, 4),
+    GGMLQuantizationType.I64:     (1, 8),
+    GGMLQuantizationType.F64:     (1, 8),
 }


@ -746,6 +772,7 @@ KEY_GENERAL_SOURCE_HF_REPO       = Keys.General.SOURCE_HF_REPO
 KEY_GENERAL_FILE_TYPE            = Keys.General.FILE_TYPE

 # LLM
+KEY_VOCAB_SIZE            = Keys.LLM.VOCAB_SIZE
 KEY_CONTEXT_LENGTH        = Keys.LLM.CONTEXT_LENGTH
 KEY_EMBEDDING_LENGTH      = Keys.LLM.EMBEDDING_LENGTH
 KEY_BLOCK_COUNT           = Keys.LLM.BLOCK_COUNT
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@ -242,12 +242,27 @@ class GGUFReader:
            n_bytes = n_elems * type_size // block_size
            data_offs = int(start_offs + offset_tensor[0])
            item_type: npt.DTypeLike
-            if ggml_type == GGMLQuantizationType.F32:
-                item_count = n_elems
-                item_type = np.float32
-            elif ggml_type == GGMLQuantizationType.F16:
+            if ggml_type == GGMLQuantizationType.F16:
                item_count = n_elems
                item_type = np.float16
+            elif ggml_type == GGMLQuantizationType.F32:
+                item_count = n_elems
+                item_type = np.float32
+            elif ggml_type == GGMLQuantizationType.F64:
+                item_count = n_elems
+                item_type = np.float64
+            elif ggml_type == GGMLQuantizationType.I8:
+                item_count = n_elems
+                item_type = np.int8
+            elif ggml_type == GGMLQuantizationType.I16:
+                item_count = n_elems
+                item_type = np.int16
+            elif ggml_type == GGMLQuantizationType.I32:
+                item_count = n_elems
+                item_type = np.int32
+            elif ggml_type == GGMLQuantizationType.I64:
+                item_count = n_elems
+                item_type = np.int64
            else:
                item_count = n_bytes
                item_type = np.uint8
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@ -196,9 +196,6 @@ class GGUFWriter:
        if self.state is not WriterState.EMPTY:
            raise ValueError(f'Expected output file to be empty, got {self.state}')

-        if raw_dtype is None and tensor_dtype not in (np.float32, np.float16):
-            raise ValueError("Only F32 and F16 tensors are supported for now")
-
        encoded_name = name.encode("utf8")
        self.ti_data += self._pack("Q", len(encoded_name))
        self.ti_data += encoded_name
@ -207,7 +204,22 @@ class GGUFWriter:
        for i in range(n_dims):
            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
        if raw_dtype is None:
-            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+            if tensor_dtype == np.float16:
+                dtype = GGMLQuantizationType.F16
+            elif tensor_dtype == np.float32:
+                dtype = GGMLQuantizationType.F32
+            elif tensor_dtype == np.float64:
+                dtype = GGMLQuantizationType.F64
+            elif tensor_dtype == np.int8:
+                dtype = GGMLQuantizationType.I8
+            elif tensor_dtype == np.int16:
+                dtype = GGMLQuantizationType.I16
+            elif tensor_dtype == np.int32:
+                dtype = GGMLQuantizationType.I32
+            elif tensor_dtype == np.int64:
+                dtype = GGMLQuantizationType.I64
+            else:
+                raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
        else:
            dtype = raw_dtype
        self.ti_data += self._pack("I", dtype)
@ -313,6 +325,9 @@ class GGUFWriter:
        self.data_alignment = alignment
        self.add_uint32(Keys.General.ALIGNMENT, alignment)

+    def add_vocab_size(self, size: int) -> None:
+        self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)
+
    def add_context_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)

@ -346,6 +361,9 @@ class GGUFWriter:
    def add_clamp_kqv(self, value: float) -> None:
        self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)

+    def add_logit_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
+
    def add_expert_count(self, count: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)

--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.7.0"
+version = "0.8.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
--- a/llama.cpp
+++ b/llama.cpp
@ -214,6 +214,7 @@ enum llm_arch {
    LLM_ARCH_GEMMA,
    LLM_ARCH_STARCODER2,
    LLM_ARCH_MAMBA,
+    LLM_ARCH_COMMAND_R,
    LLM_ARCH_UNKNOWN,
 };

@ -243,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_GEMMA,           "gemma"      },
    { LLM_ARCH_STARCODER2,      "starcoder2" },
    { LLM_ARCH_MAMBA,           "mamba"      },
+    { LLM_ARCH_COMMAND_R,       "command-r"  },
    { LLM_ARCH_UNKNOWN,         "(unknown)"  },
 };

@ -258,6 +260,7 @@ enum llm_kv {
    LLM_KV_GENERAL_SOURCE_URL,
    LLM_KV_GENERAL_SOURCE_HF_REPO,

+    LLM_KV_VOCAB_SIZE,
    LLM_KV_CONTEXT_LENGTH,
    LLM_KV_EMBEDDING_LENGTH,
    LLM_KV_BLOCK_COUNT,
@ -267,6 +270,7 @@ enum llm_kv {
    LLM_KV_EXPERT_COUNT,
    LLM_KV_EXPERT_USED_COUNT,
    LLM_KV_POOLING_TYPE,
+    LLM_KV_LOGIT_SCALE,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
@ -321,6 +325,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_GENERAL_SOURCE_URL,            "general.source.url"                    },
    { LLM_KV_GENERAL_SOURCE_HF_REPO,        "general.source.huggingface.repository" },

+    { LLM_KV_VOCAB_SIZE,                    "%s.vocab_size"            },
    { LLM_KV_CONTEXT_LENGTH,                "%s.context_length"        },
    { LLM_KV_EMBEDDING_LENGTH,              "%s.embedding_length"      },
    { LLM_KV_BLOCK_COUNT,                   "%s.block_count"           },
@ -330,6 +335,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_EXPERT_COUNT,                  "%s.expert_count"          },
    { LLM_KV_EXPERT_USED_COUNT,             "%s.expert_used_count"     },
    { LLM_KV_POOLING_TYPE ,                 "%s.pooling_type"          },
+    { LLM_KV_LOGIT_SCALE,                   "%s.logit_scale"           },

    { LLM_KV_ATTENTION_HEAD_COUNT,          "%s.attention.head_count"             },
    { LLM_KV_ATTENTION_HEAD_COUNT_KV,       "%s.attention.head_count_kv"          },
@ -836,6 +842,21 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
            { LLM_TENSOR_SSM_OUT,         "blk.%d.ssm_out" },
        },
    },
+    {
+        LLM_ARCH_COMMAND_R,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
    {
        LLM_ARCH_UNKNOWN,
        {
@ -1595,6 +1616,7 @@ enum e_model {
    MODEL_20B,
    MODEL_30B,
    MODEL_34B,
+    MODEL_35B,
    MODEL_40B,
    MODEL_65B,
    MODEL_70B,
@ -1641,6 +1663,7 @@ struct llama_hparams {

    float f_clamp_kqv      = 0.0f;
    float f_max_alibi_bias = 0.0f;
+    float f_logit_scale    = 0.0f;

    bool causal_attn = true;
    bool need_kq_pos = false;
@ -1871,6 +1894,31 @@ struct llama_kv_cache {
    }
 };

+struct llama_control_vector {
+    std::vector<struct ggml_tensor *> tensors; // per layer
+    std::vector<struct ggml_context *> ctxs;
+    std::vector<ggml_backend_buffer_t> bufs;
+
+    int32_t layer_start = -1;
+    int32_t layer_end   = -1;
+
+    ggml_tensor * tensor_for(int il) const {
+        if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+            return nullptr;
+        }
+        return tensors[il];
+    }
+
+    ~llama_control_vector() {
+        for (struct ggml_context * ctx : ctxs) {
+            ggml_free(ctx);
+        }
+        for (ggml_backend_buffer_t buf : bufs) {
+            ggml_backend_buffer_free(buf);
+        }
+    }
+};
+
 struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
@ -2085,6 +2133,9 @@ struct llama_context {
    struct ggml_tensor * inp_s_mask;    // F32 [1, kv_size]
    struct ggml_tensor * inp_s_seq;     // I32 [kv_size, n_batch]

+    // control vectors
+    struct llama_control_vector cvec;
+
 #ifdef GGML_USE_MPI
    ggml_mpi_context * ctx_mpi = NULL;
 #endif
@ -3229,6 +3280,7 @@ static const char * llama_model_type_name(e_model type) {
        case MODEL_20B:    return "20B";
        case MODEL_30B:    return "30B";
        case MODEL_34B:    return "34B";
+        case MODEL_35B:    return "35B";
        case MODEL_40B:    return "40B";
        case MODEL_65B:    return "65B";
        case MODEL_70B:    return "70B";
@ -3242,10 +3294,11 @@ static const char * llama_model_type_name(e_model type) {

 static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
    switch (type) {
-        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
-        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
-        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
-        default:                   return "unknown";
+        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+        case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
+        default:                    return "unknown";
    }
 }

@ -3277,14 +3330,14 @@ static void llm_load_hparams(
    ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);

    // get hparams kv
-    ml.get_arr_n(LLM_KV_TOKENIZER_LIST,       hparams.n_vocab);
-    ml.get_key  (LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
-    ml.get_key  (LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
-    ml.get_key  (LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
-    ml.get_key  (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
-    ml.get_key  (LLM_KV_BLOCK_COUNT,          hparams.n_layer);
-    ml.get_key  (LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
-    ml.get_key  (LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_VOCAB_SIZE,           hparams.n_vocab,       false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+    ml.get_key(LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
+    ml.get_key(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
+    ml.get_key(LLM_KV_BLOCK_COUNT,          hparams.n_layer);
+    ml.get_key(LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);

    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@ -3620,6 +3673,15 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_COMMAND_R:
+            {
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_35B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
        default: (void)0;
    }

@ -3645,30 +3707,25 @@ static void llm_load_vocab(

    const auto kv = LLM_KV(model.arch);

-    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
-    if (token_idx == -1) {
-        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
-    }
-
-    const float * scores = nullptr;
-    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx != -1) {
-        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-    }
-
-    const int * toktypes = nullptr;
-    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx != -1) {
-        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-    }
-
    // determine vocab type
    {
        std::string tokenizer_name;

        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);

-        if (tokenizer_name == "llama") {
+        if (tokenizer_name == "no_vocab") {
+            vocab.type = LLAMA_VOCAB_TYPE_NONE;
+
+            // default special tokens
+            vocab.special_bos_id = -1;
+            vocab.special_eos_id = -1;
+            vocab.special_unk_id = -1;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
+            vocab.linefeed_id    = -1;
+
+            return;
+        } else if (tokenizer_name == "llama") {
            vocab.type = LLAMA_VOCAB_TYPE_SPM;

            // default special tokens
@ -3734,6 +3791,23 @@ static void llm_load_vocab(
        }
    }

+    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const float * scores = nullptr;
+    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+    }
+
+    const int * toktypes = nullptr;
+    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+    }
+
    const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);

    vocab.id_to_token.resize(n_vocab);
@ -3929,10 +4003,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
    LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
    LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
+    LLAMA_LOG_INFO("%s: f_logit_scale    = %.1e\n",   __func__, hparams.f_logit_scale);
    LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
    LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
    LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
-    LLAMA_LOG_INFO("%s: causal attm      = %d\n",     __func__, hparams.causal_attn);
+    LLAMA_LOG_INFO("%s: causal attn      = %d\n",     __func__, hparams.causal_attn);
    LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
    LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
    LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
@ -4903,6 +4978,37 @@ static bool llm_load_tensors(
                        layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
                    }
                } break;
+            case LLM_ARCH_COMMAND_R:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        // init output from the input tok embed
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                    }
+                } break;
            default:
                throw std::runtime_error("unknown architecture");
        }
@ -5023,7 +5129,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

        llm_load_print_meta(ml, model);

-        if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+        if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
+            model.hparams.n_vocab != model.vocab.id_to_token.size()) {
            throw std::runtime_error("vocab size mismatch");
        }

@ -5048,6 +5155,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
        }
 #endif

+#ifdef GGML_USE_SYCL
+        if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
+            ggml_backend_sycl_set_single_device_mode(params.main_gpu);
+            //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
+            params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
+        } else {
+            ggml_backend_sycl_set_mul_device_mode();
+        }
+#endif
+
        if (!llm_load_tensors(
            ml, model, params.n_gpu_layers, params.split_mode,  params.main_gpu, params.tensor_split, params.use_mlock,
            params.progress_callback, params.progress_callback_user_data
@ -5842,6 +5959,12 @@ struct llm_build_context {
            }

            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
            cb(cur, "l_out", il);

            // input for next layer
@ -5877,7 +6000,7 @@ struct llm_build_context {
        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
+        struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;

        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@ -5927,7 +6050,6 @@ struct llm_build_context {
                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);

-
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, NULL,
                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
@ -8289,6 +8411,121 @@ struct llm_build_context {

        return gf;
    }
+
+    struct ggml_cgraph * build_command_r() {
+
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        const float f_logit_scale = hparams.f_logit_scale;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+            struct ggml_tensor * ffn_inp = cur;
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            struct ggml_tensor * attn_out = cur;
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, ffn_inp,
+                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // add together residual + FFN + self-attention
+            cur = ggml_add(ctx0, cur, inpL);
+            cur = ggml_add(ctx0, cur, attn_out);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+
+        if (f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, f_logit_scale);
+        }
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+
+    }
 };

 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@ -8471,6 +8708,10 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_mamba();
            } break;
+        case LLM_ARCH_COMMAND_R:
+            {
+                result = llm.build_command_r();
+            } break;
        default:
            GGML_ASSERT(false);
    }
@ -9036,8 +9277,8 @@ static int llama_decode_internal(
    //llama_synchronize(&lctx);

    // decide if we need to defrag the kv cache
-    if (cparams.defrag_thold >= 0.0f) {
-        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens_all)/float(kv_self.n) : 0.0f;
+    if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
+        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;

        // queue defragmentation for next llama_kv_cache_update
        if (fragmentation > cparams.defrag_thold) {
@ -9069,6 +9310,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
    // number of cells moved
    uint32_t n_moves = 0;

+    // each move requires 6*n_layer tensors (see build_defrag)
+    //   - source view, destination view, copy operation
+    //   - x2 for keys and values
+    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+
    // determine which KV cells to move where
    //
    //  cell i moves to ids[i]
@ -9095,15 +9341,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
            nh++;
        }

-        // each move requires 6*n_layer tensors (see build_defrag)
-        //   - source view, destination view, copy operation
-        //   - x2 for keys and values
-        //
-        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
-            // the graph is too big, we cannot move more cells
-            break;
-        }
-
        uint32_t nf = 0;
        uint32_t is = n_kv - 1;

@ -9133,11 +9370,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
        // are we moving a continuous block of memory?
        bool cont = false;

+        // should we stop searching for the next move?
+        bool stop = false;
+
        // go back and move the nf cells to the hole
        for (; i1 < n_kv; ++i1) {
            auto & cell1 = kv_self.cells[i1];

            if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
+                }
+
                cont = false;
                continue;
            }
@ -9164,6 +9409,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
            }
        }

+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);

        i0 += nh - 1;
@ -9353,26 +9602,32 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 }

 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
 }

 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
 }

 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }

 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

 static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
 }

 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
    GGML_ASSERT(llama_is_byte_token(vocab, id));
    const auto& token_data = vocab.id_to_token.at(id);
    switch (llama_vocab_get_type(vocab)) {
@ -9393,6 +9648,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 }

 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
    static const char * hex = "0123456789ABCDEF";
    switch (llama_vocab_get_type(vocab)) {
        case LLAMA_VOCAB_TYPE_SPM: {
@ -10224,6 +10480,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                    }
                }
            } break;
+        case LLAMA_VOCAB_TYPE_NONE:
+            GGML_ASSERT(false);
    }

    return output;
@ -11944,7 +12202,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
    return new_type;
 }

-static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
    std::mutex mutex;
    int counter = 0;
    size_t new_size = 0;
@ -12888,23 +13146,22 @@ struct llama_context * llama_new_context_with_model(
        if (model->n_gpu_layers > 0) {
            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-                int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
-                ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+                ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
+                    int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
                    llama_free(ctx);
                    return nullptr;
                }
                ctx->backends.push_back(backend);
            } else {
                // LLAMA_SPLIT_LAYER requires a backend for each GPU
-                int id_list[GGML_SYCL_MAX_DEVICES];
-                ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
                for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
-                    int device_id = id_list[i];
                    ggml_backend_t backend = ggml_backend_sycl_init(i);
                    if (backend == nullptr) {
-                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
+                        int id_list[GGML_SYCL_MAX_DEVICES];
+                        ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
                        llama_free(ctx);
                        return nullptr;
                    }
@ -13105,6 +13362,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_ORION:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
+        case LLM_ARCH_COMMAND_R:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
@ -13130,7 +13388,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 }

 int32_t llama_n_vocab(const struct llama_model * model) {
-    return model->vocab.id_to_token.size();
+    return model->hparams.n_vocab;
 }

 int32_t llama_n_ctx_train(const struct llama_model * model) {
@ -13141,6 +13399,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
    return model->hparams.n_embd;
 }

+int32_t llama_n_layer(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
 float llama_rope_freq_scale_train(const struct llama_model * model) {
    return model->hparams.rope_freq_scale_train;
 }
@ -13240,6 +13502,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
    }
 }

+static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+    GGML_ASSERT(cvec.tensors.empty());
+    GGML_ASSERT(cvec.ctxs.empty());
+    GGML_ASSERT(cvec.bufs.empty());
+
+    // count layer buffer types
+    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+    for (int64_t i = 0; i < model.hparams.n_layer; i++) {
+        buft_layer_count[model.buft_layer[i].buft]++;
+    }
+
+    // allocate contexts
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    for (auto & it : buft_layer_count) {
+        int n_layers = it.second;
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ n_layers * ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_context * ctx = ggml_init(params);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+            return 1;
+        }
+        ctx_map[it.first] = ctx;
+    }
+
+    // make tensors
+    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    for (size_t il = 1; il < model.hparams.n_layer; il++) {
+        struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
+        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+        cvec.tensors.push_back(tensor);
+    }
+
+    // allocate tensors / buffers and zero
+    for (auto it : ctx_map) {
+        ggml_backend_buffer_type_t buft = it.first;
+        ggml_context * ctx = it.second;
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+            return false;
+        }
+        ggml_backend_buffer_clear(buf, 0);
+        cvec.ctxs.push_back(ctx);
+        cvec.bufs.push_back(buf);
+    }
+
+    return true;
+}
+
+int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
+    const llama_model & model = lctx->model;
+    llama_control_vector & cvec = lctx->cvec;
+
+    if (data == nullptr) {
+        // disable the current control vector (but leave allocated for later)
+        cvec.layer_start = -1;
+        cvec.layer_end   = -1;
+        return 0;
+    }
+
+    if (n_embd != (int) model.hparams.n_embd) {
+        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+        return 1;
+    }
+
+    if (cvec.tensors.empty()) {
+        if (!llama_control_vector_init(cvec, model)) {
+            return 1;
+        }
+    }
+
+    cvec.layer_start = il_start;
+    cvec.layer_end   = il_end;
+
+    for (size_t il = 1; il < model.hparams.n_layer; il++) {
+        assert(cvec.tensors[il] != nullptr);
+
+        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+        if (off + n_embd <= len) {
+            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+        }
+    }
+
+    return 0;
+}
+
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
    struct llama_kv_cache_view result = {
        /*.n_cells            = */ 0,
@ -13954,14 +14306,17 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id
 }

 const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return model->vocab.id_to_token[token].text.c_str();
 }

 float llama_token_get_score(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return model->vocab.id_to_token[token].score;
 }

 llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return model->vocab.id_to_token[token].type;
 }

@ -14206,6 +14561,26 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<start_of_turn>model\n";
        }
+    } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
+        // OrionStarAI/Orion-14B-Chat
+        std::string system_prompt = "";
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // there is no system message support, we will merge it with user prompt
+                system_prompt = message->content;
+                continue;
+            } else if (role == "user") {
+                ss << "Human: ";
+                if (!system_prompt.empty()) {
+                    ss << system_prompt << "\n\n";
+                    system_prompt = "";
+                }
+                ss << message->content << "\n\nAssistant: </s>";
+            } else {
+                ss << message->content << "</s>";
+            }
+        }
    } else {
        // template not supported
        return -1;
--- a/llama.h
+++ b/llama.h
@ -59,9 +59,10 @@ extern "C" {
    typedef int32_t llama_seq_id;

    enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
    };

    // note: these values should be synchronized with ggml_rope
@ -387,6 +388,7 @@ extern "C" {
    LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
    LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
    LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);

    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@ -434,10 +436,24 @@ extern "C" {
    // Returns 0 on success
    LLAMA_API int32_t llama_model_apply_lora_from_file(
            const struct llama_model * model,
-                      const char * path_lora,
-                           float   scale,
-                      const char * path_base_model,
-                         int32_t   n_threads);
+                          const char * path_lora,
+                               float   scale,
+                          const char * path_base_model,
+                             int32_t   n_threads);
+
+    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    // the currently loaded vector.
+    // n_embd should be the size of a single layer's control, and data should point
+    // to an n_embd x n_layers buffer starting from layer 1.
+    // il_start and il_end are the layer range the vector should apply to (both inclusive)
+    // See llama_control_vector_load in common to load a control vector.
+    LLAMA_API int32_t llama_control_vector_apply(
+            struct llama_context * lctx,
+                     const float * data,
+                          size_t   len,
+                         int32_t   n_embd,
+                         int32_t   il_start,
+                         int32_t   il_end);

    //
    // KV cache
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@ -31,6 +31,8 @@ int main(void) {
        "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
        // google/gemma-7b-it
        "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
+        // OrionStarAI/Orion-14B-Chat
+        "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
    };
    std::vector<std::string> expected_output = {
        // teknium/OpenHermes-2.5-Mistral-7B
@ -45,6 +47,8 @@ int main(void) {
        "system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n   I am an assistant   </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
        // google/gemma-7b-it
        "<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
+        // OrionStarAI/Orion-14B-Chat
+        "Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s>   I am an assistant   </s>Human: Another question\n\nAssistant: </s>",
    };
    std::vector<char> formatted_chat(1024);
    int32_t res;