abstract rpc server

2024-07-26 18:10:20 -07:00 · 2024-07-26 18:10:20 -07:00 · 3ab3eca372
commit 3ab3eca372
parent da6c1f29f2
59 changed files with 31 additions and 11024 deletions
--- a/.editorconfig
+++ b/.editorconfig
@@ -21,9 +21,4 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset

-[examples/server/public/*]
-indent_size = 2
-
-[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
-indent_style = tab

--- a/.gitignore
+++ b/.gitignore
@@ -87,19 +87,19 @@ ppl-*.txt
 qnt-*.txt
 perf-*.txt

-# Examples
+# core

-examples/jeopardy/results.txt
-examples/server/*.css.hpp
-examples/server/*.html.hpp
-examples/server/*.js.hpp
-examples/server/*.mjs.hpp
+core/jeopardy/results.txt
+core/server/*.css.hpp
+core/server/*.html.hpp
+core/server/*.js.hpp
+core/server/*.mjs.hpp
 !build_64.sh
-!examples/*.bat
-!examples/*/*.kts
-!examples/*/*/*.kts
-!examples/sycl/*.bat
-!examples/sycl/*.sh
+!core/*.bat
+!core/*/*.kts
+!core/*/*/*.kts
+!core/sycl/*.bat
+!core/sycl/*.sh

 # Python

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -183,12 +183,12 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
        DESTINATION lib/pkgconfig)

 #
-# programs, examples
+# programs, core
 #

 add_subdirectory(common)

 if (LLAMA_BUILD_EXAMPLES)
-    add_subdirectory(examples)
+    add_subdirectory(core)
    add_subdirectory(pocs)
 endif()
--- a/Makefile
+++ b/Makefile
@@ -1065,7 +1065,7 @@ $(LIB_COMMON_S): \
 clean:
 	rm -vrf *.dot $(BUILD_TARGETS)
 	rm -rvf src/*.o
-	rm -rvf examples/*.o
+	rm -rvf core/*.o
 	rm -rvf common/*.o
 	rm -rvf *.a
 	rm -rvf *.dll
@@ -1082,10 +1082,10 @@ clean:
 	rm -rvf $(BUILD_TARGETS)
 	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
 	rm -rvf $(LEGACY_TARGETS_CLEAN)
-	find examples pocs -type f -name "*.o" -delete
+	find core pocs -type f -name "*.o" -delete

 #
-# Examples
+# core
 #

 # $< is the first prerequisite, i.e. the source file.
@@ -1095,7 +1095,7 @@ clean:
 # Helper function that replaces .c, .cpp, and .cu file endings with .o:
 GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))

-llama-cli: examples/main/main.cpp \
+llama-cli: core/main/main.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1104,7 +1104,7 @@ llama-cli: examples/main/main.cpp \
 	@echo

 ifdef GGML_RPC
-rpc-server: examples/rpc/rpc-server.cpp \
+rpc-server: core/rpc/rpc-server.cpp \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 endif # GGML_RPC
@@ -1142,7 +1142,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \

 # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
 #  Eventually we will want to remove these target from building all the time.
-main: examples/deprecation-warning/deprecation-warning.cpp
+main: core/deprecation-warning/deprecation-warning.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 	@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -1,351 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-# This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
-#
-# This is necessary in order to analyze the type of pre-tokenizer used by the model and
-# provide the necessary information to llama.cpp via the GGUF header in order to implement
-# the same pre-tokenizer.
-#
-# ref: https://github.com/ggerganov/llama.cpp/pull/6920
-#
-# Instructions:
-#
-# - Add a new model to the "models" list
-# - Run the script with your huggingface token:
-#
-#   python3 convert_hf_to_gguf_update.py <huggingface_token>
-#
-# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
-# - Update llama.cpp with the new pre-tokenizer if necessary
-#
-# TODO: generate tokenizer tests for llama.cpp
-#
-
-import logging
-import os
-import pathlib
-import re
-
-import requests
-import sys
-import json
-
-from hashlib import sha256
-from enum import IntEnum, auto
-from transformers import AutoTokenizer
-
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger("convert_hf_to_gguf_update")
-sess = requests.Session()
-
-
-class TOKENIZER_TYPE(IntEnum):
-    SPM = auto()
-    BPE = auto()
-    WPM = auto()
-    UGM = auto()
-
-
-# TODO: this string has to exercise as much pre-tokenizer functionality as possible
-#       will be updated with time - contributions welcome
-CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
-
-if len(sys.argv) == 2:
-    token = sys.argv[1]
-    if not token.startswith("hf_"):
-        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-        sys.exit(1)
-else:
-    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-    sys.exit(1)
-
-# TODO: add models here, base models preferred
-models = [
-    {"name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    {"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-    {"name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
-    {"name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
-    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-    {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
-    {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
-    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
-    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
-    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-    {"name": "stablelm2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
-    {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
-    {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
-    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
-    {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
-    {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
-    {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
-    {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
-    {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
-    {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
-    {"name": "poro-chat",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
-    {"name": "jina-v2-code",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
-    {"name": "viking",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
-    {"name": "gemma",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
-    {"name": "gemma-2",        "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
-    {"name": "jais",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
-    {"name": "t5",             "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
-    {"name": "codeshell",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
-    {"name": "tekken",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
-    {"name": "smollm",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
-]
-
-
-def download_file_with_auth(url, token, save_path):
-    headers = {"Authorization": f"Bearer {token}"}
-    response = sess.get(url, headers=headers)
-    response.raise_for_status()
-    os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as downloaded_file:
-        downloaded_file.write(response.content)
-    logger.info(f"File {save_path} downloaded successfully")
-
-
-def download_model(model):
-    name = model["name"]
-    repo = model["repo"]
-    tokt = model["tokt"]
-
-    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
-
-    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
-
-    if tokt == TOKENIZER_TYPE.SPM:
-        files.append("tokenizer.model")
-
-    if tokt == TOKENIZER_TYPE.UGM:
-        files.append("spiece.model")
-
-    for file in files:
-        save_path = f"models/tokenizers/{name}/{file}"
-        if os.path.isfile(save_path):
-            logger.info(f"{name}: File {save_path} already exists - skipping")
-            continue
-        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
-
-
-for model in models:
-    try:
-        download_model(model)
-    except Exception as e:
-        logger.error(f"Failed to download model {model['name']}. Error: {e}")
-
-
-# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
-
-src_ifs = ""
-for model in models:
-    name = model["name"]
-    tokt = model["tokt"]
-
-    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
-        continue
-
-    # Skip if the tokenizer folder does not exist or there are other download issues previously
-    if not os.path.exists(f"models/tokenizers/{name}"):
-        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
-        continue
-
-    # create the tokenizer
-    try:
-        if name == "t5":
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
-        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-        continue  # Skip to the next model if the tokenizer can't be loaded
-
-    chktok = tokenizer.encode(CHK_TXT)
-    chkhsh = sha256(str(chktok).encode()).hexdigest()
-
-    logger.info(f"model: {name}")
-    logger.info(f"tokt: {tokt}")
-    logger.info(f"repo: {model['repo']}")
-    logger.info(f"chktok: {chktok}")
-    logger.info(f"chkhsh: {chkhsh}")
-
-    # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
-        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
-        pre_tokenizer = cfg["pre_tokenizer"]
-        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
-        if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
-
-    logger.info("")
-
-    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
-    src_ifs += f"            # ref: {model['repo']}\n"
-    src_ifs += f"            res = \"{name}\"\n"
-
-src_func = f"""
-    def get_vocab_base_pre(self, tokenizer) -> str:
-        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
-        # is specific for the BPE pre-tokenizer used by the model
-        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
-        # use in llama.cpp to implement the same pre-tokenizer
-
-        chktxt = {repr(CHK_TXT)}
-
-        chktok = tokenizer.encode(chktxt)
-        chkhsh = sha256(str(chktok).encode()).hexdigest()
-
-        logger.debug(f"chktok: {{chktok}}")
-        logger.debug(f"chkhsh: {{chkhsh}}")
-
-        res = None
-
-        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
-        #       or pull the latest version of the model from Huggingface
-        #       don't edit the hashes manually!
-{src_ifs}
-        if res is None:
-            logger.warning("\\n")
-            logger.warning("**************************************************************************************")
-            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
-            logger.warning("**          There are 2 possible reasons for this:")
-            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
-            logger.warning("**          - the pre-tokenization config has changed upstream")
-            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
-            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
-            logger.warning("**")
-            logger.warning(f"** chkhsh:  {{chkhsh}}")
-            logger.warning("**************************************************************************************")
-            logger.warning("\\n")
-            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
-
-        logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
-        logger.debug(f"chkhsh: {{chkhsh}}")
-
-        return res
-"""
-
-convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
-convert_py = convert_py_pth.read_text(encoding="utf-8")
-convert_py = re.sub(
-    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
-    lambda m: m.group(1) + src_func + m.group(3),
-    convert_py,
-    flags=re.DOTALL | re.MULTILINE,
-)
-
-convert_py_pth.write_text(convert_py, encoding="utf-8")
-
-logger.info("+++ convert_hf_to_gguf.py was updated")
-
-# generate tests for each tokenizer model
-
-tests = [
-    "ied 4 ½ months",
-    "Führer",
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\n\n",
-    "\n\n\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is 🦙.cpp",
-    "w048 7tuijk dsdfhu",
-    "нещо на Български",
-    "កាន់តែពិសេសអាចខលចេញ",
-    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    " (",
-    "\n =",
-    "' era",
-    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
-    "!!!!!!",
-    "3",
-    "33",
-    "333",
-    "3333",
-    "33333",
-    "333333",
-    "3333333",
-    "33333333",
-    "333333333",
-    "Cửa Việt", # llama-bpe fails on this
-    " discards",
-    CHK_TXT,
-]
-
-# write the tests to ./models/ggml-vocab-{name}.gguf.inp
-# the format is:
-#
-# test0
-# __ggml_vocab_test__
-# test1
-# __ggml_vocab_test__
-# ...
-#
-
-# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
-# for each test, write the resulting tokens on a separate line
-
-for model in models:
-    name = model["name"]
-    tokt = model["tokt"]
-
-    # Skip if the tokenizer folder does not exist or there are other download issues previously
-    if not os.path.exists(f"models/tokenizers/{name}"):
-        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
-        continue
-
-    # create the tokenizer
-    try:
-        if name == "t5":
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
-        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
-        continue  # Skip this model and continue with the next one in the loop
-
-    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
-        for text in tests:
-            f.write(f"{text}")
-            f.write("\n__ggml_vocab_test__\n")
-
-    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
-        for text in tests:
-            res = tokenizer.encode(text, add_special_tokens=False)
-            for r in res:
-                f.write(f" {r}")
-            f.write("\n")
-
-    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
-
-# generate commands for creating vocab files
-
-logger.info("\nRun the following commands to generate the vocab files for testing:\n")
-
-for model in models:
-    name = model["name"]
-
-    print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
-
-logger.info("\n")
--- a/convert_llama_ggml_to_gguf.py
+++ b/convert_llama_ggml_to_gguf.py
@@ -1,450 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import logging
-import argparse
-import os
-import struct
-import sys
-from enum import IntEnum
-from pathlib import Path
-
-import numpy as np
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-logger = logging.getLogger("ggml-to-gguf")
-
-
-class GGMLFormat(IntEnum):
-    GGML = 0
-    GGMF = 1
-    GGJT = 2
-
-
-class GGMLFType(IntEnum):
-    ALL_F32              = 0
-    MOSTLY_F16           = 1
-    MOSTLY_Q4_0          = 2
-    MOSTLY_Q4_1          = 3
-    MOSTLY_Q4_1_SOME_F16 = 4
-    MOSTLY_Q8_0          = 7
-    MOSTLY_Q5_0          = 8
-    MOSTLY_Q5_1          = 9
-    MOSTLY_Q2_K          = 10
-    MOSTLY_Q3_K_S        = 11
-    MOSTLY_Q3_K_M        = 12
-    MOSTLY_Q3_K_L        = 13
-    MOSTLY_Q4_K_S        = 14
-    MOSTLY_Q4_K_M        = 15
-    MOSTLY_Q5_K_S        = 16
-    MOSTLY_Q5_K_M        = 17
-    MOSTLY_Q6_K          = 18
-
-
-class Hyperparameters:
-    def __init__(self):
-        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
-        self.n_layer = self.n_rot = self.n_ff = 0
-        self.ftype = GGMLFType.ALL_F32
-
-    def set_n_ff(self, model):
-        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
-        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
-        ff_tensor = model.tensors[ff_tensor_idx]
-        self.n_ff = ff_tensor.dims[1]
-
-    def load(self, data, offset):
-        (
-            self.n_vocab,
-            self.n_embd,
-            self.n_mult,
-            self.n_head,
-            self.n_layer,
-            self.n_rot,
-            ftype,
-        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
-        try:
-            self.ftype = GGMLFType(ftype)
-        except ValueError:
-            raise ValueError(f'Invalid ftype {ftype}')
-        return 4 * 7
-
-    def __str__(self):
-        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
-
-
-class Vocab:
-    def __init__(self, load_scores = True):
-        self.items = []
-        self.load_scores = load_scores
-
-    def load(self, data, offset, n_vocab):
-        orig_offset = offset
-        for _ in range(n_vocab):
-            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
-            assert itemlen < 4096, 'Absurd vocab item length'
-            offset += 4
-            item_text = bytes(data[offset:offset + itemlen])
-            offset += itemlen
-            if self.load_scores:
-                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
-                offset += 4
-            else:
-                item_score = 0.0
-            self.items.append((item_text, item_score))
-        return offset - orig_offset
-
-
-class Tensor:
-    def __init__(self, use_padding = True):
-        self.name = None
-        self.dims: tuple[int, ...] = ()
-        self.dtype = None
-        self.start_offset = 0
-        self.len_bytes = np.int64(0)
-        self.use_padding = use_padding
-
-    def load(self, data, offset):
-        orig_offset = offset
-        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
-        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
-        assert name_len < 4096, 'Absurd tensor name length'
-        quant = gguf.GGML_QUANT_SIZES.get(dtype)
-        assert quant is not None, 'Unknown tensor type'
-        (blksize, tysize) = quant
-        offset += 12
-        self.dtype= dtype
-        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
-        offset += 4 * n_dims
-        self.name = bytes(data[offset:offset + name_len])
-        offset += name_len
-        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
-        offset += pad
-        n_elems = np.prod(self.dims)
-        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
-        self.start_offset = offset
-        self.len_bytes = n_bytes
-        offset += n_bytes
-        return offset - orig_offset
-
-
-class GGMLModel:
-
-    file_format: GGMLFormat
-    format_version: int
-
-    def __init__(self):
-        self.hyperparameters = None
-        self.vocab = None
-        self.tensor_map = {}
-        self.tensors = []
-
-    def validate_header(self, data, offset):
-        magic = bytes(data[offset:offset + 4])
-        if magic == b'GGUF':
-            raise ValueError('File is already in GGUF format.')
-        if magic == b'lmgg':
-            self.file_format = GGMLFormat.GGML
-            self.format_version = 1
-            return 4
-        version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
-        if magic == b'fmgg':
-            if version != 1:
-                raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
-            self.file_format = GGMLFormat.GGMF
-            self.format_version = version
-            return 8
-        if magic == b'tjgg':
-            if version < 1 or version > 3:
-                raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
-            self.file_format = GGMLFormat.GGJT
-            self.format_version = version
-            return 8
-        raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
-
-    def validate_conversion(self, ftype):
-        err = ''
-        if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
-            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
-                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
-        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
-            if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
-                         GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
-                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
-        if len(err) > 0:
-            raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
-
-    def load(self, data, offset):
-        offset += self.validate_header(data, offset)
-        hp = Hyperparameters()
-        offset += hp.load(data, offset)
-        logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
-        self.validate_conversion(hp.ftype)
-        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
-        offset += vocab.load(data, offset, hp.n_vocab)
-        tensors: list[Tensor] = []
-        tensor_map = {}
-        while offset < len(data):
-            tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
-            offset += tensor.load(data, offset)
-            tensor_map[tensor.name] = len(tensors)
-            tensors.append(tensor)
-        self.hyperparameters = hp
-        self.vocab = vocab
-        self.tensors = tensors
-        self.tensor_map = tensor_map
-        hp.set_n_ff(self)
-        return offset
-
-
-class GGMLToGGUF:
-    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
-        hp = ggml_model.hyperparameters
-        self.model = ggml_model
-        self.data = data
-        self.cfg = cfg
-        self.params_override = params_override
-        self.vocab_override = vocab_override
-        self.special_vocab = special_vocab
-        if params_override is not None:
-            n_kv_head = params_override.n_head_kv
-        else:
-            if cfg.gqa == 1:
-                n_kv_head = hp.n_head
-            else:
-                gqa = float(cfg.gqa)
-                n_kv_head = None
-                for x in range(1, 256):
-                    if float(hp.n_head) / float(x) == gqa:
-                        n_kv_head = x
-                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
-                logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
-        self.n_kv_head = n_kv_head
-        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
-
-    def save(self):
-        logger.info('* Preparing to save GGUF file')
-        gguf_writer = gguf.GGUFWriter(
-            self.cfg.output,
-            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
-            use_temp_file = False)
-        self.add_params(gguf_writer)
-        self.add_vocab(gguf_writer)
-        if self.special_vocab is not None:
-            self.special_vocab.add_to_gguf(gguf_writer)
-        self.add_tensors(gguf_writer)
-        logger.info("    gguf: write header")
-        gguf_writer.write_header_to_file()
-        logger.info("    gguf: write metadata")
-        gguf_writer.write_kv_data_to_file()
-        logger.info("    gguf: write tensors")
-        gguf_writer.write_tensors_to_file()
-        gguf_writer.close()
-
-    def add_params(self, gguf_writer):
-        hp = self.model.hyperparameters
-        cfg = self.cfg
-        if cfg.desc is not None:
-            desc = cfg.desc
-        else:
-            desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
-        try:
-            # Filenames aren't necessarily valid UTF8.
-            name = cfg.name if cfg.name is not None else cfg.input.name
-        except UnicodeDecodeError:
-            name = None
-        logger.info('* Adding model parameters and KV items')
-        if name is not None:
-            gguf_writer.add_name(name)
-        gguf_writer.add_description(desc)
-        gguf_writer.add_file_type(int(hp.ftype))
-        if self.params_override is not None:
-            po = self.params_override
-            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
-            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
-            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
-            gguf_writer.add_context_length      (po.n_ctx)
-            gguf_writer.add_embedding_length    (po.n_embd)
-            gguf_writer.add_block_count         (po.n_layer)
-            gguf_writer.add_feed_forward_length (po.n_ff)
-            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
-            gguf_writer.add_head_count          (po.n_head)
-            gguf_writer.add_head_count_kv       (po.n_head_kv)
-            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
-            return
-        gguf_writer.add_context_length(cfg.context_length)
-        gguf_writer.add_embedding_length(hp.n_embd)
-        gguf_writer.add_block_count(hp.n_layer)
-        gguf_writer.add_feed_forward_length(hp.n_ff)
-        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
-        gguf_writer.add_head_count(hp.n_head)
-        gguf_writer.add_head_count_kv(self.n_kv_head)
-        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
-
-    def add_vocab(self, gguf_writer):
-        hp = self.model.hyperparameters
-        gguf_writer.add_tokenizer_model('llama')
-        gguf_writer.add_tokenizer_pre('default')
-        tokens = []
-        scores = []
-        toktypes = []
-        if self.vocab_override is not None:
-            vo = self.vocab_override
-            logger.info('* Adding vocab item(s)')
-            for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
-                tokens.append(vbytes)
-                scores.append(score)
-                toktypes.append(ttype)
-            assert len(tokens) == hp.n_vocab, \
-                f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
-            gguf_writer.add_token_list(tokens)
-            gguf_writer.add_token_scores(scores)
-            if len(toktypes) > 0:
-                gguf_writer.add_token_types(toktypes)
-            return
-        logger.info(f'* Adding {hp.n_vocab} vocab item(s)')
-        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
-        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
-            tt = 1 # Normal
-            # Special handling for UNK, BOS, EOS tokens.
-            if tokid <= 2:
-                if tokid == 0:
-                    vbytes = b'<unk>'
-                    tt = 2
-                elif tokid == 1:
-                    vbytes = b'<s>'
-                    tt = 3
-                else:
-                    vbytes = b'</s>'
-                    tt = 3
-            elif len(vbytes) == 0:
-                tt = 3 # Control
-            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
-                vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
-                tt = 6 # Byte
-            else:
-                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
-            toktypes.append(tt)
-            tokens.append(vbytes)
-            scores.append(vscore)
-        gguf_writer.add_token_list(tokens)
-        gguf_writer.add_token_scores(scores)
-        gguf_writer.add_token_types(toktypes)
-        gguf_writer.add_unk_token_id(0)
-        gguf_writer.add_bos_token_id(1)
-        gguf_writer.add_eos_token_id(2)
-
-    def add_tensors(self, gguf_writer):
-        tensor_map = self.name_map
-        data = self.data
-        logger.info(f'* Adding {len(self.model.tensors)} tensor(s)')
-        for tensor in self.model.tensors:
-            name = str(tensor.name, 'UTF-8')
-            mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
-            assert mapped_name is not None, f'Bad name {name}'
-            tempdims = list(tensor.dims[:])
-            if len(tempdims) > 1:
-                temp = tempdims[1]
-                tempdims[1] = tempdims[0]
-                tempdims[0] = temp
-            gguf_writer.add_tensor(
-                mapped_name,
-                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
-                raw_shape = tempdims,
-                raw_dtype = tensor.dtype)
-
-
-def handle_metadata(cfg, hp):
-    import examples.convert_legacy_llama as convert
-
-    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
-    hf_config_path   = cfg.model_metadata_dir / "config.json"
-    orig_config_path = cfg.model_metadata_dir / "params.json"
-    # We pass a fake model here. "original" mode will check the shapes of some
-    # tensors if information is missing in the .json file: other than that, the
-    # model data isn't used so this should be safe (at least for now).
-    fakemodel = {
-        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
-        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
-    }
-    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
-    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
-    if hf_config_path.exists():
-        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
-    elif orig_config_path.exists():
-        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
-    else:
-        raise ValueError('Unable to load metadata')
-    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
-    vocab_factory = convert.VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
-    convert.check_vocab_size(params, vocab)
-    return params, vocab, special_vocab
-
-
-def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
-    parser.add_argument('--input', '-i', type = Path, required = True,
-                        help = 'Input GGMLv3 filename')
-    parser.add_argument('--output', '-o', type = Path, required = True,
-                        help ='Output GGUF filename')
-    parser.add_argument('--name',
-                        help = 'Set model name')
-    parser.add_argument('--desc',
-                        help = 'Set model description')
-    parser.add_argument('--gqa', type = int, default = 1,
-                        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
-    parser.add_argument('--eps', default = '5.0e-06',
-                        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
-    parser.add_argument('--context-length', '-c', type=int, default = 2048,
-                        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
-    parser.add_argument('--model-metadata-dir', '-m', type = Path,
-                        help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
-    parser.add_argument("--vocab-dir", type=Path,
-                        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-    parser.add_argument("--vocabtype", default="spm,hfft",
-                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
-    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
-    return parser.parse_args()
-
-
-def main():
-    cfg = handle_args()
-    logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO)
-    logger.info(f'* Using config: {cfg}')
-    logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===')
-    if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
-        logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
-    data = np.memmap(cfg.input, mode = 'r')
-    model = GGMLModel()
-    logger.info('* Scanning GGML input file')
-    offset = model.load(data, 0)  # noqa
-    logger.info(f'* GGML model hyperparameters: {model.hyperparameters}')
-    vocab_override = None
-    params_override = None
-    special_vocab = None
-    if cfg.model_metadata_dir is not None:
-        (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
-        logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
-        logger.info(f'* Overriding params: {params_override}')
-        logger.info(f'* Overriding vocab: {vocab_override}')
-        logger.info(f'* Special vocab: {special_vocab}')
-    else:
-        logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
-        if model.file_format == GGMLFormat.GGML:
-            logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
-    converter = GGMLToGGUF(
-        model, data, cfg,
-        params_override = params_override,
-        vocab_override = vocab_override,
-        special_vocab = special_vocab
-    )
-    converter.save()
-    logger.info(f'* Successful completion. Output saved to: {cfg.output}')
-
-
-if __name__ == '__main__':
-    main()
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@ -1,393 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-import logging
-import argparse
-import os
-import sys
-import json
-from math import prod
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
-
-import torch
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-# reuse model definitions from convert_hf_to_gguf.py
-from convert_hf_to_gguf import LazyTorchTensor, Model
-
-logger = logging.getLogger("lora-to-gguf")
-
-
-@dataclass
-class PartialLoraTensor:
-    A: Tensor | None = None
-    B: Tensor | None = None
-
-
-# magic to support tensor shape modifications and splitting
-class LoraTorchTensor:
-    _lora_A: Tensor  # (n_rank, row_size)
-    _lora_B: Tensor  # (col_size, n_rank)
-    _rank: int
-
-    def __init__(self, A: Tensor, B: Tensor):
-        assert len(A.shape) == len(B.shape)
-        assert A.shape[-2] == B.shape[-1]
-        if A.dtype != B.dtype:
-            A = A.to(torch.float32)
-            B = B.to(torch.float32)
-        self._lora_A = A
-        self._lora_B = B
-        self._rank = B.shape[-1]
-
-    def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
-        return (self._lora_A, self._lora_B)
-
-    def __getitem__(
-        self,
-        indices: (
-            SupportsIndex
-            | slice
-            | tuple[SupportsIndex | slice | Tensor, ...]  # TODO: add ellipsis in the type signature
-        ),
-    ) -> LoraTorchTensor:
-        shape = self.shape
-        if isinstance(indices, SupportsIndex):
-            if len(shape) > 2:
-                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
-            else:
-                raise NotImplementedError  # can't return a vector
-        elif isinstance(indices, slice):
-            if len(shape) > 2:
-                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
-            else:
-                return LoraTorchTensor(self._lora_A, self._lora_B[indices])
-        elif isinstance(indices, tuple):
-            assert len(indices) > 0
-            if indices[-1] is Ellipsis:
-                return self[indices[:-1]]
-            # expand ellipsis
-            indices = tuple(
-                u
-                for v in (
-                    (
-                        (slice(None, None) for _ in range(len(indices) - 1))
-                        if i is Ellipsis
-                        else (i,)
-                    )
-                    for i in indices
-                )
-                for u in v
-            )
-
-            if len(indices) < len(shape):
-                indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
-
-            # TODO: make sure this is correct
-            indices_A = (
-                *(
-                    (
-                        j.__index__() % self._lora_A.shape[i]
-                        if isinstance(j, SupportsIndex)
-                        else slice(None, None)
-                    )
-                    for i, j in enumerate(indices[:-2])
-                ),
-                slice(None, None),
-                indices[-1],
-            )
-            indices_B = indices[:-1]
-            return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
-        else:
-            raise NotImplementedError  # unknown indice type
-
-    @property
-    def dtype(self) -> torch.dtype:
-        assert self._lora_A.dtype == self._lora_B.dtype
-        return self._lora_A.dtype
-
-    @property
-    def shape(self) -> tuple[int, ...]:
-        assert len(self._lora_A.shape) == len(self._lora_B.shape)
-        return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])
-
-    def size(self, dim=None):
-        assert dim is None
-        return self.shape
-
-    def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
-        if isinstance(shape[0], tuple):
-            new_shape: tuple[int, ...] = shape[0]
-        else:
-            new_shape = cast(tuple[int, ...], shape)
-        orig_shape = self.shape
-        if len(new_shape) < 2:
-            raise NotImplementedError  # can't become a vector
-
-        # expand -1 in the shape
-        if any(dim == -1 for dim in new_shape):
-            n_elems = prod(orig_shape)
-            n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
-            assert n_elems % n_new_elems == 0
-            new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
-
-        if new_shape[-1] != orig_shape[-1]:
-            raise NotImplementedError  # can't reshape the row size trivially
-
-        shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
-        shape_B = (*new_shape[:-1], self._rank)
-        return LoraTorchTensor(
-            self._lora_A.reshape(shape_A),
-            self._lora_B.reshape(shape_B),
-        )
-
-    def reshape_as(self, other: Tensor) -> LoraTorchTensor:
-        return self.reshape(*other.shape)
-
-    def view(self, *size: int) -> LoraTorchTensor:
-        return self.reshape(*size)
-
-    def permute(self, *dims: int) -> LoraTorchTensor:
-        shape = self.shape
-        dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
-        if dims[-1] == -1:
-            # TODO: support higher dimensional A shapes bigger than 1
-            assert all(dim == 1 for dim in self._lora_A.shape[:-2])
-            return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
-        if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
-            return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
-        else:
-            # TODO: compose the above two
-            raise NotImplementedError
-
-    def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
-        shape = self.shape
-        dims = [i for i in range(len(shape))]
-        dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
-        return self.permute(*dims)
-
-    def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
-        return self.transpose(axis0, axis1)
-
-    def to(self, *args, **kwargs):
-        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
-
-    @classmethod
-    def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
-        del types  # unused
-
-        if kwargs is None:
-            kwargs = {}
-
-        if func is torch.permute:
-            return type(args[0]).permute(*args, **kwargs)
-        elif func is torch.reshape:
-            return type(args[0]).reshape(*args, **kwargs)
-        elif func is torch.stack:
-            assert isinstance(args[0], Sequence)
-            dim = kwargs.get("dim", 0)
-            assert dim == 0
-            return LoraTorchTensor(
-                torch.stack([a._lora_A for a in args[0]], dim),
-                torch.stack([b._lora_B for b in args[0]], dim),
-            )
-        elif func is torch.cat:
-            assert isinstance(args[0], Sequence)
-            dim = kwargs.get("dim", 0)
-            assert dim == 0
-            if len(args[0][0].shape) > 2:
-                return LoraTorchTensor(
-                    torch.cat([a._lora_A for a in args[0]], dim),
-                    torch.cat([b._lora_B for b in args[0]], dim),
-                )
-            elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
-                return LoraTorchTensor(
-                    args[0][0]._lora_A,
-                    torch.cat([b._lora_B for b in args[0]], dim),
-                )
-            else:
-                raise NotImplementedError
-        else:
-            raise NotImplementedError
-
-
-def get_base_tensor_name(lora_tensor_name: str) -> str:
-    base_name = lora_tensor_name.replace("base_model.model.", "")
-    base_name = base_name.replace(".lora_A.weight", ".weight")
-    base_name = base_name.replace(".lora_B.weight", ".weight")
-    return base_name
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
-    parser.add_argument(
-        "--outfile", type=Path,
-        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
-    )
-    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
-    )
-    parser.add_argument(
-        "--bigendian", action="store_true",
-        help="model is executed on big endian machine",
-    )
-    parser.add_argument(
-        "--no-lazy", action="store_true",
-        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
-    )
-    parser.add_argument(
-        "--verbose", action="store_true",
-        help="increase output verbosity",
-    )
-    parser.add_argument(
-        "--dry-run", action="store_true",
-        help="only print out what will be done, without writing any new files",
-    )
-    parser.add_argument(
-        "--base", type=Path, required=True,
-        help="directory containing base model file",
-    )
-    parser.add_argument(
-        "lora_path", type=Path,
-        help="directory containing LoRA adapter file",
-    )
-
-    return parser.parse_args()
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
-
-    ftype_map: dict[str, gguf.LlamaFileType] = {
-        "f32": gguf.LlamaFileType.ALL_F32,
-        "f16": gguf.LlamaFileType.MOSTLY_F16,
-        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
-        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
-        "auto": gguf.LlamaFileType.GUESSED,
-    }
-
-    ftype = ftype_map[args.outtype]
-
-    dir_base_model: Path = args.base
-    dir_lora: Path = args.lora_path
-    lora_config = dir_lora / "adapter_config.json"
-    input_model = dir_lora / "adapter_model.safetensors"
-
-    if args.outfile is not None:
-        fname_out = args.outfile
-    else:
-        # output in the same directory as the model by default
-        fname_out = dir_lora
-
-    if os.path.exists(input_model):
-        # lazy import load_file only if lora is in safetensors format.
-        from safetensors.torch import load_file
-
-        lora_model = load_file(input_model, device="cpu")
-    else:
-        input_model = os.path.join(dir_lora, "adapter_model.bin")
-        lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
-
-    # load base model
-    logger.info(f"Loading base model: {dir_base_model.name}")
-    hparams = Model.load_hparams(dir_base_model)
-    with torch.inference_mode():
-        try:
-            model_class = Model.from_model_architecture(hparams["architectures"][0])
-        except NotImplementedError:
-            logger.error(f"Model {hparams['architectures'][0]} is not supported")
-            sys.exit(1)
-
-        class LoraModel(model_class):
-            model_arch = model_class.model_arch
-
-            lora_alpha: float
-
-            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
-
-                super().__init__(*args, **kwargs)
-
-                self.dir_model_card = dir_lora_model
-                self.lora_alpha = float(lora_alpha)
-
-            def set_type(self):
-                self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
-                self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
-
-            def set_gguf_parameters(self):
-                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
-                super().set_gguf_parameters()
-
-            def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
-                tensor_map: dict[str, PartialLoraTensor] = {}
-
-                for name, tensor in lora_model.items():
-                    if self.lazy:
-                        tensor = LazyTorchTensor.from_eager(tensor)
-                    base_name = get_base_tensor_name(name)
-                    is_lora_a = ".lora_A.weight" in name
-                    is_lora_b = ".lora_B.weight" in name
-                    if not is_lora_a and not is_lora_b:
-                        if ".base_layer.weight" in name:
-                            continue
-                        logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
-                        sys.exit(1)
-
-                    if base_name in tensor_map:
-                        if is_lora_a:
-                            tensor_map[base_name].A = tensor
-                        else:
-                            tensor_map[base_name].B = tensor
-                    else:
-                        if is_lora_a:
-                            tensor_map[base_name] = PartialLoraTensor(A=tensor)
-                        else:
-                            tensor_map[base_name] = PartialLoraTensor(B=tensor)
-
-                for name, tensor in tensor_map.items():
-                    assert tensor.A is not None
-                    assert tensor.B is not None
-                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
-
-            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-                dest = super().modify_tensors(data_torch, name, bid)
-                for dest_name, dest_data in dest:
-                    assert isinstance(dest_data, LoraTorchTensor)
-                    lora_a, lora_b = dest_data.get_lora_A_B()
-
-                    yield (dest_name + ".lora_a", lora_a)
-                    yield (dest_name + ".lora_b", lora_b)
-
-        with open(lora_config, "r") as f:
-            lparams: dict[str, Any] = json.load(f)
-
-        alpha: float = lparams["lora_alpha"]
-
-        model_instance = LoraModel(
-            dir_base_model,
-            ftype,
-            fname_out,
-            is_big_endian=args.bigendian,
-            use_temp_file=False,
-            eager=args.no_lazy,
-            dry_run=args.dry_run,
-            dir_lora_model=dir_lora,
-            lora_alpha=alpha,
-        )
-
-        logger.info("Exporting model...")
-        model_instance.write()
-        logger.info(f"Model successfully exported to {model_instance.fname_out}")
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -6,7 +6,7 @@ find_package(Threads REQUIRED)

 # ...

-# examples
+# core

 include_directories(${CMAKE_CURRENT_SOURCE_DIR})

@ -16,6 +16,4 @@ else()
    if (GGML_RPC)
        add_subdirectory(rpc)
    endif()
-    if (LLAMA_BUILD_SERVER)
-    endif()
 endif()
--- a/examples/deprecation-warning/README.md
+++ b/examples/deprecation-warning/README.md
--- a/examples/deprecation-warning/deprecation-warning.cpp
+++ b/examples/deprecation-warning/deprecation-warning.cpp
--- a/examples/main-cmake-pkg/.gitignore
+++ b/examples/main-cmake-pkg/.gitignore
--- a/examples/main-cmake-pkg/CMakeLists.txt
+++ b/examples/main-cmake-pkg/CMakeLists.txt
--- a/examples/main-cmake-pkg/README.md
+++ b/examples/main-cmake-pkg/README.md
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
--- a/examples/main/README.md
+++ b/examples/main/README.md
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -105,7 +105,7 @@ static void sigint_handler(int signo) {
        } else {
            console::cleanup();
            printf("\n");
-            llama_print_timings(*g_ctx);
+            antigma_print_timings(*g_ctx);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
@ -992,7 +992,7 @@ int main(int argc, char ** argv) {
        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
    }

-    llama_print_timings(ctx);
+    antigma_print_timings(ctx);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

    if (ctx_guidance) { llama_free(ctx_guidance); }
--- a/examples/rpc/CMakeLists.txt
+++ b/examples/rpc/CMakeLists.txt
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
--- a/examples/sycl/README.md
+++ b/examples/sycl/README.md
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
--- a/examples/sycl/win-run-llama2.bat
+++ b/examples/sycl/win-run-llama2.bat
--- a/docs/android.md
+++ b/docs/android.md
@ -1,56 +0,0 @@
-
-# Android
-
-## Build on Android using Termux
-[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
-```
-apt update && apt upgrade -y
-apt install git make cmake
-```
-
-It's recommended to move your model inside the `~/` directory for best performance:
-```
-cd storage/downloads
-mv model.gguf ~/
-```
-
-[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
-
-## Building the Project using Android NDK
-Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
-
-Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
-```
-$ mkdir build-android
-$ cd build-android
-$ export NDK=<your_ndk_directory>
-$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
-$ make
-```
-
-Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
-
-Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
-
-(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
-```
-$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
-$cd /data/data/com.termux/files/home/bin
-$chmod +x ./*
-```
-
-Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
-
-```
-$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
-```
-
-Now, you can start chatting:
-```
-$cd /data/data/com.termux/files/home/bin
-$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
-```
-
-Here's a demo of an interactive session running on Pixel 5 phone:
-
-https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
--- a/docs/backend/BLIS.md
+++ b/docs/backend/BLIS.md
@ -1,67 +0,0 @@
-BLIS Installation Manual
------------------------
-
-BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as object-based API, typed API, BLAS and CBLAS compatibility layers.
-
-Project URL: https://github.com/flame/blis
-
-### Prepare:
-
-Compile BLIS:
-
-```bash
-git clone https://github.com/flame/blis
-cd blis
-./configure --enable-cblas -t openmp,pthreads auto
-# will install to /usr/local/ by default.
-make -j
-```
-
-Install BLIS:
-
-```bash
-sudo make install
-```
-
-We recommend using openmp since it's easier to modify the cores being used.
-
-### llama.cpp compilation
-
-Makefile:
-
-```bash
-make GGML_BLIS=1 -j
-# make GGML_BLIS=1 llama-benchmark-matmult
-```
-
-CMake:
-
-```bash
-mkdir build
-cd build
-cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
-make -j
-```
-
-### llama.cpp execution
-
-According to the BLIS documentation, we could set the following
-environment variables to modify the behavior of openmp:
-
-```bash
-export GOMP_CPU_AFFINITY="0-19"
-export BLIS_NUM_THREADS=14
-```
-
-And then run the binaries as normal.
-
-
-### Intel specific issue
-
-Some might get the error message saying that `libimf.so` cannot be found.
-Please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
-
-### Reference:
-
-1. https://github.com/flame/blis#getting-started
-2. https://github.com/flame/blis/blob/master/docs/Multithreading.md
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@ -1,580 +0,0 @@
-# llama.cpp for SYCL
-
- [Background](#background)
- [Recommended Release](#recommended-release)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
- [Docker](#docker)
- [Linux](#linux)
- [Windows](#windows)
- [Environment Variable](#environment-variable)
- [Known Issue](#known-issues)
- [Q&A](#qa)
- [TODO](#todo)
-
-## Background
-
-**SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.
-
-**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
-
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
-
-### Llama.cpp + SYCL
-
-The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
-
-When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
-
-It has a similar design to other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc.* In the initial work, the oneAPI [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
-
-## Recommended Release
-
-The SYCL backend may be broken by some PRs because there is no online CI.
-
-The following release is verified with good quality:
-
-|Commit ID|Tag|Release|Verified  Platform|
-|-|-|-|-|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
-
-
-## News
-
- 2024.5
-  - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
-  - Arch Linux is verified successfully.
-
- 2024.4
-  - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.
-
- 2024.3
-  - Release binary files of Windows.
-  - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
-  - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
-  - Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet, it is under development.
-  - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
-  - Support detecting all GPUs with level-zero and same top **Max compute units**.
-  - Support OPs
-    - hardsigmoid
-    - hardswish
-    - pool2d
-
- 2024.1
-  - Create SYCL backend for Intel GPU.
-  - Support Windows build
-
-## OS
-
-| OS      | Status  | Verified                                       |
-|---------|---------|------------------------------------------------|
-| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
-| Windows | Support | Windows 11                                     |
-
-
-## Hardware
-
-### Intel GPU
-
-**Verified devices**
-
-| Intel GPU                     | Status  | Verified Model                        |
-|-------------------------------|---------|---------------------------------------|
-| Intel Data Center Max Series  | Support | Max 1550, 1100                        |
-| Intel Data Center Flex Series | Support | Flex 170                              |
-| Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
-| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
-| Intel iGPU                    | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
-
-*Notes:*
-
- **Memory**
-  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
-
-  - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
-
- **Execution Unit (EU)**
-  - If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
-
-### Other Vendor GPU
-
-**Verified devices**
-
-| Nvidia GPU               | Status  | Verified Model |
-|--------------------------|---------|----------------|
-| Ampere Series            | Support | A100, A4000    |
-| Ampere Series *(Mobile)* | Support | RTX 40 Series  |
-
-## Docker
-The docker build option is currently limited to *intel GPU* targets.
-
-### Build image
-```sh
-# Using FP16
-docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
-```
-
-*Notes*:
-
-To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
-
-You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
-
-### Run container
-
-```sh
-# First, find all the DRI cards
-ls -la /dev/dri
-# Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
-docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
-```
-
-*Notes:*
- Docker has been tested successfully on native Linux. WSL support has not been verified yet.
- You may need to install Intel GPU driver on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
-
-## Linux
-
-### I. Setup Environment
-
-1. **Install GPU drivers**
-
-  - **Intel GPU**
-
-Intel data center GPUs drivers installation guide and download page can be found here: [Get intel dGPU Drivers](https://dgpu-docs.intel.com/driver/installation.html#ubuntu-install-steps).
-
-*Note*: for client GPUs *(iGPU & Arc A-Series)*, please refer to the [client iGPU driver installation](https://dgpu-docs.intel.com/driver/client/overview.html).
-
-Once installed, add the user(s) to the `video` and `render` groups.
-
-```sh
-sudo usermod -aG render $USER
-sudo usermod -aG video $USER
-```
-
-*Note*: logout/re-login for the changes to take effect.
-
-Verify installation through `clinfo`:
-
-```sh
-sudo apt install clinfo
-sudo clinfo -l
-```
-
-Sample output:
-
-```sh
-Platform #0: Intel(R) OpenCL Graphics
- `-- Device #0: Intel(R) Arc(TM) A770 Graphics
-
-Platform #0: Intel(R) OpenCL HD Graphics
- `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
-```
-
- **Nvidia GPU**
-
-In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
-
-2. **Install Intel® oneAPI Base toolkit**
-
- **For Intel GPU**
-
-The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
-
-Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*.
-
-Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
-
-Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
-
- **Adding support to Nvidia GPUs**
-
-**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
-
-
-**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
-
-```sh
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
-cmake --build buildWithCublas --config Release
-```
-
-
-3. **Verify installation and environment**
-
-In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
-```sh
-source /opt/intel/oneapi/setvars.sh
-sycl-ls
-```
-
- **Intel GPU**
-
-When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:
-
-```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
-[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
-[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
-[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
-```
-
- **Nvidia GPU**
-
-Similarly, users targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as below:
-```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
-[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
-[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
-```
-
-### II. Build llama.cpp
-
-#### Intel GPU
-```sh
-# Export relevant ENV variables
-source /opt/intel/oneapi/setvars.sh
-
-# Build LLAMA with MKL BLAS acceleration for intel GPU
-
-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
-
-# build all binary
-cmake --build build --config Release -j -v
-```
-
-#### Nvidia GPU
-```sh
-# Export relevant ENV variables
-export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
-export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
-
-# Build LLAMA with Nvidia BLAS acceleration through SYCL
-
-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
-
-# build all binary
-cmake --build build --config Release -j -v
-
-```
-
-### III. Run the inference
-
-1. Retrieve and prepare model
-
-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
-
-2. Enable oneAPI running environment
-
-```sh
-source /opt/intel/oneapi/setvars.sh
-```
-
-3. List devices information
-
-Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
-
-```sh
-./build/bin/llama-ls-sycl-device
-```
-This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
-```
-found 2 SYCL devices:
-
-|  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
-|ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
-|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
-| 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
-| 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
-```
-
-
-4. Launch inference
-
-There are two device selection modes:
-
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically choose the devices with the same backend.
-
-In both device selection modes, the default SYCL backend is level_zero; you can choose another backend supported by SYCL by setting the environment variable ONEAPI_DEVICE_SELECTOR.
-
-| Device selection | Parameter                              |
-|------------------|----------------------------------------|
-| Single device    | --split-mode none --main-gpu DEVICE_ID |
-| Multiple devices | --split-mode layer (default)           |
-
-Examples:
-
- Use device 0:
-
-```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
-```
-or run by script:
-
-```sh
-./examples/sycl/run_llama2.sh 0
-```
-
- Use multiple devices:
-
-```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
-```
-
-Otherwise, you can run the script:
-
-```sh
-./examples/sycl/run_llama2.sh
-```
-
-*Notes:*
-
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
-
-```sh
-detect 1 SYCL GPUs: [0] with top Max compute units:512
-```
-Or
-```sh
-use 1 SYCL GPUs: [0] with Max compute units:512
-```
-
-## Windows
-
-### I. Setup Environment
-
-1. Install GPU driver
-
-Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
-
-2. Install Visual Studio
-
-If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).
-
-3. Install Intel® oneAPI Base toolkit
-
-The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
-
-Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*.
-
-Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
-
-b. Enable oneAPI running environment:
-
- Type "oneAPI" in the search bar, then open the `Intel oneAPI command prompt for Intel 64 for Visual Studio 2022` App.
-
- On the command prompt, enable the runtime environment with the following:
-```
-"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
-```
-
-c. Verify installation
-
-In the oneAPI command line, run the following to print the available SYCL devices:
-
-```
-sycl-ls
-```
-
-There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:
-
-Output (example):
-```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
-[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
-[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO  [31.0.101.5186]
-[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
-```
-
-4. Install build tools
-
-a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
-b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
-
-
-### II. Build llama.cpp
-
-On the oneAPI command line window, step into the llama.cpp main directory and run the following:
-
-```
-@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-
-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
-
-# Option 2: Or FP16
-cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
-
-cmake --build build --config Release -j
-```
-
-Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
-```sh
-.\examples\sycl\win-build-sycl.bat
-```
-
-Or, use CMake presets to build:
-```sh
-cmake --preset x64-windows-sycl-release
-cmake --build build-x64-windows-sycl-release -j --target llama-cli
-
-cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
-cmake --build build-x64-windows-sycl-release -j --target llama-cli
-
-cmake --preset x64-windows-sycl-debug
-cmake --build build-x64-windows-sycl-debug -j --target llama-cli
-```
-
-Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
-
-*Notes:*
-
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
-
-### III. Run the inference
-
-1. Retrieve and prepare model
-
-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
-
-2. Enable oneAPI running environment
-
-On the oneAPI command line window, run the following and step into the llama.cpp directory:
-```
-"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
-```
-
-3. List devices information
-
-Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
-
-```
-build\bin\ls-sycl-device.exe
-```
-
-This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
-```
-found 2 SYCL devices:
-|  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
-|ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
-|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
-| 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
-| 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
-
-```
-
-
-4. Launch inference
-
-There are two device selection modes:
-
- Single device: Use one device assigned by user. Default device id is 0.
- Multiple devices: Automatically choose the devices with the same backend.
-
-In both device selection modes, the default SYCL backend is level_zero; you can choose another backend supported by SYCL by setting the environment variable ONEAPI_DEVICE_SELECTOR.
-
-| Device selection | Parameter                              |
-|------------------|----------------------------------------|
-| Single device    | --split-mode none --main-gpu DEVICE_ID |
-| Multiple devices | --split-mode layer (default)           |
-
-Examples:
-
- Use device 0:
-
-```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
-```
-
- Use multiple devices:
-
-```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
-```
-Otherwise, run the following wrapper script:
-
-```
-.\examples\sycl\win-run-llama2.bat
-```
-
-Note:
-
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
-
-```sh
-detect 1 SYCL GPUs: [0] with top Max compute units:512
-```
-Or
-```sh
-use 1 SYCL GPUs: [0] with Max compute units:512
-```
-
-## Environment Variable
-
-#### Build
-
-| Name               | Value                             | Function                                    |
-|--------------------|-----------------------------------|---------------------------------------------|
-| GGML_SYCL          | ON (mandatory)                    | Enable build with SYCL code path.           |
-| GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
-| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
-| CMAKE_C_COMPILER   | icx                               | Set *icx* compiler for SYCL code path.      |
-| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |
-
-#### Runtime
-
-| Name              | Value            | Function                                                                                                                  |
-|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
-| GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
-| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
-
-## Known Issues
-
- `Split-mode:[row]` is not supported.
-
-## Q&A
-
- Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
-
-  - Potential cause: Unavailable oneAPI installation or not set ENV variables.
-  - Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.
-
- General compiler error:
-
-  - Remove **build** folder or try a clean-build.
-
- I can **not** see `[ext_oneapi_level_zero:gpu]` after installing the GPU driver on Linux.
-
-  Please double-check with `sudo sycl-ls`.
-
-  If it's present in the list, please add video/render group to your user then **logout/login** or restart your system:
-
-  ```
-  sudo usermod -aG render $USER
-  sudo usermod -aG video $USER
-  ```
-  Otherwise, please double-check the GPU driver installation steps.
-
-### **GitHub contribution**:
-Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
-
-## TODO
-
- Support row layer split for multiple card runs.
--- a/docs/build.md
+++ b/docs/build.md
@ -1,340 +0,0 @@
-# Build llama.cpp locally
-
-**To get the Code:**
-
-```bash
-git clone https://github.com/ggerganov/llama.cpp
-cd llama.cpp
-```
-
-In order to build llama.cpp you have four different options.
-
- Using `make`:
-  - On Linux or MacOS:
-
-      ```bash
-      make
-      ```
-
-  - On Windows (x86/x64 only, arm64 requires cmake):
-
-    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-    2. Extract `w64devkit` on your pc.
-    3. Run `w64devkit.exe`.
-    4. Use the `cd` command to reach the `llama.cpp` folder.
-    5. From here you can run:
-        ```bash
-        make
-        ```
-
-  - Notes:
-    - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, run `make LLAMA_DEBUG=1`
-
- Using `CMake`:
-
-  ```bash
-  cmake -B build
-  cmake --build build --config Release
-  ```
-
-  **Notes**:
-
-    - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, there are two cases:
-
-      1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
-
-      ```bash
-      cmake -B build -DCMAKE_BUILD_TYPE=Debug
-      cmake --build build
-      ```
-
-      2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
-
-      ```bash
-      cmake -B build -G "Xcode"
-      cmake --build build --config Debug
-      ```
-    - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
-      - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
-        - Tab Workload: Desktop-development with C++
-        - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
-      - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
-      - For Windows on ARM (arm64, WoA) build with:
-        ```bash
-        cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
-        cmake --build build-arm64-windows-llvm-release
-        ```
-        Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
-
-   Using `gmake` (FreeBSD):
-
-    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
-    2. Add your user to **video** group
-    3. Install compilation dependencies.
-
-        ```bash
-        sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
-
-        gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
-        ```
-
-## Metal Build
-
-On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
-To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
-
-When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
-argument.
-
-## BLAS Build
-
-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
-
-### Accelerate Framework:
-
-This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
-
-### OpenBLAS:
-
-This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
-
- Using `make`:
-  - On Linux:
-    ```bash
-    make GGML_OPENBLAS=1
-    ```
-
-  - On Windows:
-
-    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-    2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
-    3. Extract `w64devkit` on your pc.
-    4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
-    5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
-    6. Run `w64devkit.exe`.
-    7. Use the `cd` command to reach the `llama.cpp` folder.
-    8. From here you can run:
-
-        ```bash
-        make GGML_OPENBLAS=1
-        ```
-
- Using `CMake` on Linux:
-
-    ```bash
-    cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-    cmake --build build --config Release
-    ```
-
-### BLIS
-
-Check [BLIS.md](./backend/BLIS.md) for more information.
-
-### SYCL
-
-SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
-
-llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
-
-For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
-
-### Intel oneMKL
-
-Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
-
- Using manual oneAPI installation:
-  By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and set `-DGGML_BLAS=ON` in cmake, the MKL version of BLAS will automatically be selected. Otherwise, please install oneAPI and follow the steps below:
-    ```bash
-    source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-basekit docker image, only required for manual installation
-    cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
-    cmake --build build --config Release
-    ```
-
- Using oneAPI docker image:
-  If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
-
-Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
-
-### CUDA
-
-This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
-
-For Jetson users: if you have a Jetson Orin, you can try this: [Official Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an older model (Nano/TX2), some additional steps are needed before compiling.
-
- Using `make`:
-  ```bash
-  make GGML_CUDA=1
-  ```
- Using `CMake`:
-
-  ```bash
-  cmake -B build -DGGML_CUDA=ON
-  cmake --build build --config Release
-  ```
-
-The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
-
-| Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
-|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-| GGML_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
-| GGML_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
-| GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
-| GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
-| GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
-| GGML_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
-| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
-| GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
-
-### hipBLAS
-
-This provides BLAS acceleration on HIP-supported AMD GPUs.
-Make sure to have ROCm installed.
-You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
-
- Using `make`:
-  ```bash
-  make GGML_HIPBLAS=1
-  ```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
-  ```bash
-  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-      && cmake --build build --config Release -- -j 16
-  ```
-  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
-  However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
-
-  Note that if you get the following error:
-  ```
-  clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
-  ```
-  Try searching for a directory under `HIP_PATH` that contains the file
-  `oclc_abi_version_400.bc`. Then, add the following to the start of the
-  command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
-  like:
-  ```bash
-  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
-  HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-      && cmake --build build -- -j 16
-  ```
-
- Using `make` (example for target gfx1030, build with 16 CPU threads):
-  ```bash
-  make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
-  ```
-
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
-  ```bash
-  set PATH=%HIP_PATH%\bin;%PATH%
-  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
-  cmake --build build
-  ```
-  Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100`, which corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors).
-  Find your GPU version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
-
-
-The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
-If your GPU is not officially supported, you can set the environment variable `HSA_OVERRIDE_GFX_VERSION` to the version of a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
-The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
-
-| Option                 | Legal values           | Default | Description                                                                                                                                                                                                                                    |
-|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| GGML_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
-| GGML_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |
-
-### Vulkan
-
-**Windows**
-
-#### w64devkit
-
-Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
-
-Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
-
-Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
-```sh
-SDK_VERSION=1.3.283.0
-cp /VulkanSDK/$SDK_VERSION/Bin/glslc.exe $W64DEVKIT_HOME/bin/
-cp /VulkanSDK/$SDK_VERSION/Lib/vulkan-1.lib $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/
-cp -r /VulkanSDK/$SDK_VERSION/Include/* $W64DEVKIT_HOME/x86_64-w64-mingw32/include/
-cat > $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/pkgconfig/vulkan.pc <<EOF
-Name: Vulkan-Loader
-Description: Vulkan Loader
-Version: $SDK_VERSION
-Libs: -lvulkan-1
-EOF
-
-```
-Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
-
-#### MSYS2
-Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
-  ```sh
-  pacman -S git \
-      mingw-w64-ucrt-x86_64-gcc \
-      mingw-w64-ucrt-x86_64-cmake \
-      mingw-w64-ucrt-x86_64-vulkan-devel \
-      mingw-w64-ucrt-x86_64-shaderc
-  ```
-Switch into `llama.cpp` directory and build using CMake.
-```sh
-cmake -B build -DGGML_VULKAN=ON
-cmake --build build --config Release
-```
-
-**With docker**:
-
-You don't need to install Vulkan SDK. It will be installed inside the container.
-
-```sh
-# Build the image
-docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
-
-# Then, use it:
-docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
-```
-
-**Without docker**:
-
-First, make sure you have installed the [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html).
-
-For example, on Ubuntu 22.04 (jammy), use the command below:
-
-```bash
-wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
-wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-apt update -y
-apt-get install -y vulkan-sdk
-# To verify the installation, use the command below:
-vulkaninfo
-```
-
-Alternatively your package manager might be able to provide the appropriate libraries.
-For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
-For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
-
-Then, build llama.cpp using the cmake command below:
-
-```bash
-cmake -B build -DGGML_VULKAN=1
-cmake --build build --config Release
-# Test the output binary (with "-ngl 33" to offload all layers to GPU)
-./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
-
-# You should see in the output, ggml_vulkan detected your GPU. For example:
-# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
-```
-
-### Android
-
-To read documentation for how to build on Android, [click here](./android.md)
--- a/docs/development/HOWTO-add-model.md
+++ b/docs/development/HOWTO-add-model.md
@ -1,119 +0,0 @@
-# Add a new model architecture to `llama.cpp`
-
-Adding a model requires a few steps:
-
-1. Convert the model to GGUF
-2. Define the model architecture in `llama.cpp`
-3. Build the GGML graph implementation
-
-After following these steps, you can open a PR.
-
-Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](/examples/main/)
- [imatrix](/examples/imatrix/)
- [quantize](/examples/quantize/)
- [server](/examples/server/)
-
-### 1. Convert the model to GGUF
-
-This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
-
-The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
-
-The required steps to implement for an HF model are:
-
-1. Define the model `Model.register` annotation in a new `Model` subclass, example:
-
-```python
-@Model.register("MyModelForCausalLM")
-class MyModel(Model):
-    model_arch = gguf.MODEL_ARCH.GROK
-```
-
-2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
-
-Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
-
-Example for `falcon` model:
-```python
-    MODEL_ARCH.FALCON: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_NORM_2,
-        MODEL_TENSOR.ATTN_QKV,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-    ]
-```
-
-3. Map the original tensor names to the standardized equivalent in GGUF
-
-As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
-
-Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file.
-
-If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it.
-
-Example for the normalization tensor in attention layers:
-
-```python
-block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
-        # Attention norm
-        MODEL_TENSOR.ATTN_NORM: (
-            "gpt_neox.layers.{bid}.input_layernorm",                # gptneox
-            "transformer.h.{bid}.ln_1",                             # gpt2 gpt-j refact qwen
-            "transformer.blocks.{bid}.norm_1",                      # mpt
-            ...
-        )
-}
-```
-
-`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
-
-Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
- `Model#set_gguf_parameters`
- `Model#set_vocab`
- `Model#write_tensors`
-
-NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools like `quantize` expect it in order to process the weights.
-
-### 2. Define the model architecture in `llama.cpp`
-
-The model params and tensors layout must be defined in `llama.cpp`:
-1. Define a new `llm_arch`
-2. Define the tensors layout in `LLM_TENSOR_NAMES`
-3. Add any non-standard metadata in `llm_load_hparams`
-4. Create the tensors for inference in `llm_load_tensors`
-5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
-
-NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
-
-### 3. Build the GGML graph implementation
-
-This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
-
-Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
-
-When implementing a new graph, please note that the underlying `ggml` backends might not support all of its operations; support for missing backend operations can be added in another PR.
-
-Note: to debug the inference graph, you can use [llama-eval-callback](/examples/eval-callback/).
-
-## GGUF specification
-
-https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
-
-## Resources
-
- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
--- a/docs/development/debugging-tests.md
+++ b/docs/development/debugging-tests.md
@ -1,104 +0,0 @@
-# Debugging Tests Tips
-
-## How to run & execute or debug a specific test without anything else to keep the feedback loop short?
-
-There is a script called `debug-test.sh` in the `scripts` folder that takes a REGEX and an optional test number as parameters.
-
-For example, running the following command will output an interactive list from which you can select a test. It takes this form:
-
-`debug-test.sh [OPTION]... <test_regex> <test_number>`
-
-It will then build & run in the debugger for you.
-
-To just execute a test and get back a PASS or FAIL message run:
-
-```bash
-./scripts/debug-test.sh test-tokenizer
-```
-
-To test in GDB use the `-g` flag to enable gdb test mode.
-
-```bash
-./scripts/debug-test.sh -g test-tokenizer
-
-# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
->>> b main
-```
-
-To speed up the testing loop, if you know your test number you can just run it similar to below:
-
-```bash
-./scripts/debug-test.sh test 23
-```
-
-For further reference use `debug-test.sh -h` to print help.
-
-&nbsp;
-
-### How does the script work?
-If you want to be able to use the concepts contained in the script separately, the important ones are briefly outlined below.
-
-#### Step 1: Reset and Setup folder context
-
-From base of this repository, let's create `build-ci-debug` as our build context.
-
-```bash
-rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
-```
-
-#### Step 2: Setup Build Environment and Compile Test Binaries
-
-Setup and trigger a build under debug mode. You may adapt the arguments as needed, but in this case these are sane defaults.
-
-```bash
-cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
-make -j
-```
-
-#### Step 3: Find all available tests that match REGEX
-
-The output of this command will give you the command & arguments needed to run GDB.
-
-* `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
-* `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
-* `-V` : Verbose Mode
-
-```bash
-ctest -R "test-tokenizer" -V -N
-```
-
-This may return output similar to below (focusing on key lines to pay attention to):
-
-```bash
-...
-1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
-1: Working Directory: .
-Labels: main
-  Test  #1: test-tokenizer-0-llama-spm
-...
-4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
-4: Working Directory: .
-Labels: main
-  Test  #4: test-tokenizer-0-falcon
-...
-```
-
-#### Step 4: Identify Test Command for Debugging
-
-So for test #1 above we can tell these two pieces of relevant information:
-* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
-* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
-
-#### Step 5: Run GDB on test command
-
-Based on the ctest 'test command' report above we can then run a gdb session via this command below:
-
-```bash
-gdb --args ${Test Binary} ${Test GGUF Model}
-```
-
-Example:
-
-```bash
-gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
-```
--- a/docs/development/llama-star/idea-arch.key
+++ b/docs/development/llama-star/idea-arch.key
--- a/docs/development/llama-star/idea-arch.pdf
+++ b/docs/development/llama-star/idea-arch.pdf
--- a/docs/development/token_generation_performance_tips.md
+++ b/docs/development/token_generation_performance_tips.md
@ -1,40 +0,0 @@
-# Token generation performance troubleshooting
-
-## Verifying that the model is running on the GPU with CUDA
-Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
-```shell
-./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
-```
-
-When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
-```shell
-llama_model_load_internal: [cublas] offloading 60 layers to GPU
-llama_model_load_internal: [cublas] offloading output layer to GPU
-llama_model_load_internal: [cublas] total VRAM used: 17223 MB
-... rest of inference
-```
-
-If you see these lines, then the GPU is being used.
-
-## Verifying that the CPU is not oversaturated
-llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
-
-# Example of runtime flags effect on inference speed benchmark
-These runs were tested on the following machine:
-GPU: A6000 (48GB VRAM)
-CPU: 7 physical cores
-RAM: 32GB
-
-Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
-
-Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
-
-Result:
-
-| command | tokens/second (higher is better) |
-| - | - |
-| -ngl 2000000 | N/A (less than 0.1) |
-| -t 7 | 1.7 |
-| -t 1 -ngl 2000000 | 5.5 |
-| -t 7 -ngl 2000000 | 8.7 |
-| -t 4 -ngl 2000000 | 9.1 |
--- a/docs/docker.md
+++ b/docs/docker.md
@ -1,86 +0,0 @@
-# Docker
-
-## Prerequisites
-* Docker must be installed and running on your system.
-* Create a folder to store big models & intermediate files (ex. /llama/models)
-
-## Images
-We have three Docker images available for this project:
-
-1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
-
-Additionally, there are the following images, similar to the above:
-
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
-
-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.
-
-## Usage
-
-The easiest way to download the models, convert them to ggml and optimize them is with the `--all-in-one` command, which is included in the full Docker image.
-
-Replace `/path/to/models` below with the actual path where you downloaded the models.
-
-```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
-```
-
-On completion, you are ready to play!
-
-```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
-```
-
-or with a light image:
-
-```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
-```
-
-or with a server image:
-
-```bash
-docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
-```
-
-## Docker With CUDA
-
-Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
-
-## Building Docker locally
-
-```bash
-docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
-docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
-docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
-```
-
-You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
-
-The defaults are:
-
- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`
-
-The resulting images are essentially the same as the non-CUDA images:
-
-1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
-3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
-
-## Usage
-
-After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
-
-```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
-```
--- a/docs/install.md
+++ b/docs/install.md
@ -1,39 +0,0 @@
-# Install pre-built version of llama.cpp
-
-## Homebrew
-
-On Mac and Linux, the homebrew package manager can be used via
-
-```sh
-brew install llama.cpp
-```
-The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
-
-## Nix
-
-On Mac and Linux, the Nix package manager can be used via
-
-```sh
-nix profile install nixpkgs#llama-cpp
-```
-For flake enabled installs.
-
-Or
-
-```sh
-nix-env --file '<nixpkgs>' --install --attr llama-cpp
-```
-
-For non-flake enabled installs.
-
-This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
-
-## Flox
-
-On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
-
-```sh
-flox install llama-cpp
-```
-
-Flox follows the nixpkgs build of llama.cpp.
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@ -1,50 +0,0 @@
-#!/bin/bash
-set -e
-
-AI_NAME="${AI_NAME:-Miku}"
-MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
-USER_NAME="${USER_NAME:-Anon}"
-
-# Uncomment and adjust to the number of CPU cores you want to use.
-#N_THREAD="${N_THREAD:-4}"
-CTX_SIZE="${CTX_SIZE:-4096}"
-N_PREDICTS="${N_PREDICTS:-4096}"
-
-GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647
--temp 0.6
--mirostat 2)
-
-if [ -n "$N_THREAD" ]; then
-    GEN_OPTIONS+=(--threads "$N_THREAD")
-fi
-
-./llama-cli "${GEN_OPTIONS[@]}" \
-    --model "$MODEL" \
-    --in-prefix " " \
-    --in-suffix "${AI_NAME}:" \
-    --n_predict "$N_PREDICTS" \
-    --color --interactive \
-    --reverse-prompt "${USER_NAME}:" \
-    --prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
-${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
-${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
-${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
-${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
-The conversation is only between ${USER_NAME} and ${AI_NAME}.
-The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
-${AI_NAME} can only communicate through text, so she can't send images or videos.
-
-
-${USER_NAME}: Hello!
-${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk, so it's important that I make a good first impression!
-${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant (or whatever you like!), it's so nice to meet you! ^_^
-${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
-${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
-${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
-${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
-${AI_NAME}: What do you like to do in your free time? ^_^
-${USER_NAME}:" "$@"
--- a/examples/base-translate.sh
+++ b/examples/base-translate.sh
@ -1,61 +0,0 @@
-#!/bin/bash
-#
-# Few-shot translation example.
-# Requires a base model (i.e. no fine-tuned or instruct models).
-#
-# Usage:
-#
-#   cd llama.cpp
-#   make -j
-#
-#   ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
-#
-
-if [ $# -lt 2 ]; then
-  echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
-  exit 1
-fi
-
-# Keep the optional extra arguments in an array so that each one reaches
-# llama-cli as a single word, even when it contains whitespace.
-eargs=("${@:3}")
-
-# Scratch file holding the few-shot prompt; removed when the script exits.
-# The trap body is single-quoted so the path is expanded (and quoted) when
-# the trap runs, not when it is installed.
-ftmp="__llama.cpp_example_tmp__.txt"
-trap 'rm -f "$ftmp"' EXIT
-
-echo "Translate from English to French:
-
-===
-
-sea otter, peppermint, plush girafe:
-
-sea otter => loutre de mer
-peppermint => menthe poivrée
-plush girafe => girafe peluche
-
-===
-
-violin
-
-violin => violon
-
-===
-
-phone, computer, mouse, keyboard:
-
-phone => téléphone
-computer => ordinateur
-mouse => souris
-keyboard => clavier
-
-===
-" > "$ftmp"
-
-# Append the text to translate as the final, unanswered example.
-echo "$2
-" >> "$ftmp"
-
-model=$1
-
-# generate the most likely continuation until the string "===" is found
-./llama-cli -m "$model" -f "$ftmp" -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" "${eargs[@]}"
--- a/examples/chat-13B.bat
+++ b/examples/chat-13B.bat
@ -1,57 +0,0 @@
-@setlocal disabledelayedexpansion enableextensions
-@echo off
-
-rem Chat launcher: resolves the model, names and generation options from the
-rem environment (falling back to the defaults below), locates the main
-rem executable, and prints the resulting command line.
-
-rem Work from the parent of the directory that contains this script.
-cd /d "%~dp0.."
-if not "%errorlevel%"=="0" (
-    echo Unable to change directory.
-    pause
-    exit /b 1
-)
-
-rem Each setting below is only assigned when the caller has not already defined it.
-if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
-if not defined USER_NAME set "USER_NAME=User"
-if not defined AI_NAME set "AI_NAME=ChatLLaMa"
-rem Adjust to the number of CPU cores you want to use.
-rem if not defined N_THREAD set "N_THREAD=8"
-rem Number of tokens to predict (made it larger than default because we want a long interaction)
-if not defined N_PREDICTS set "N_PREDICTS=2048"
-if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
-
-rem Default main script paths
-set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
-
-rem Get main script path from command line arguments
-set "MAIN_SCRIPT_PATH=%~1"
-
-rem If the main script path was not specified, try the default paths
-rem (every existing candidate overwrites the variable, so the last one found wins)
-if not defined MAIN_SCRIPT_PATH (
-    for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
-        if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
-    )
-)
-
-rem If the main script path was not found, tell the user how to specify it
-if not defined MAIN_SCRIPT_PATH (
-    echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
-    echo %DEFAULT_MAIN_SCRIPT_PATHS%
-    pause
-    exit /b 1
-)
-
-rem Default context, feel free to edit it
-set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
-
-rem Set a temporary variable if N_THREAD is set
-if defined N_THREAD (
-    set "_N_THREAD=--threads %N_THREAD%"
-) else (
-    set "_N_THREAD="
-)
-
-rem Run the script
-rem NOTE(review): the line below starts with "echo", so it only prints the
-rem assembled command line instead of executing it - confirm whether that is intended.
-echo "%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
-  --model "%MODEL%" ^
-  --n_predict %N_PREDICTS% ^
-  --color --interactive ^
-  --reverse-prompt "%USER_NAME%:" ^
-  --prompt "%PROMPT_TEXT%"
--- a/examples/chat-13B.sh
+++ b/examples/chat-13B.sh
@ -1,41 +0,0 @@
-#!/bin/bash
-#
-# Interactive chat launcher: renders the prompt template with the user/AI
-# names and the current time, then starts llama-cli on the rendered prompt.
-# Every setting below can be overridden through the environment; additional
-# command-line arguments are forwarded to llama-cli unchanged.
-
-set -e
-
-cd "$(dirname "$0")/.." || exit
-
-MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
-PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
-USER_NAME="${USER_NAME:-USER}"
-AI_NAME="${AI_NAME:-ChatLLaMa}"
-
-# Adjust to the number of CPU cores you want to use.
-N_THREAD="${N_THREAD:-8}"
-# Number of tokens to predict (made it larger than default because we want a long interaction)
-N_PREDICTS="${N_PREDICTS:-2048}"
-
-# Note: you can also override the generation options by specifying them on the command line:
-# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
-GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
-
-DATE_TIME=$(date +%H:%M)
-DATE_YEAR=$(date +%Y)
-
-PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
-# Do not leave the rendered prompt behind in the temp directory.
-trap 'rm -f "$PROMPT_FILE"' EXIT
-
-# Fill the [[...]] placeholders of the template. The paths are quoted so
-# that a template or temp directory containing spaces still works.
-sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
-     "$PROMPT_TEMPLATE" > "$PROMPT_FILE"
-
-# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./llama-cli $GEN_OPTIONS \
-  --model "$MODEL" \
-  --threads "$N_THREAD" \
-  --n_predict "$N_PREDICTS" \
-  --color --interactive \
-  --file "${PROMPT_FILE}" \
-  --reverse-prompt "${USER_NAME}:" \
-  --in-prefix ' ' \
-  "$@"
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@ -1,151 +0,0 @@
-#!/bin/bash
-#
-# Persistent chat loop around ./llama-cli.
-# Keeps a "current" prompt file/cache that grows with the conversation and a
-# shorter "next" prompt file/cache that is swapped in once the estimated
-# token count no longer fits into the context.
-# Requires PROMPT_CACHE_FILE and CHAT_SAVE_DIR in the environment; extra
-# command-line arguments are appended to every llama-cli invocation.
-
-set -euo pipefail
-
-cd "$(dirname "$0")/.." || exit
-
-# ${VAR+x} expands to nothing only when VAR is unset, so this rejects unset variables.
-if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
-    echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
-    exit 1
-fi
-
-MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}"
-PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
-USER_NAME="${USER_NAME:-User}"
-AI_NAME="${AI_NAME:-ChatLLaMa}"
-DATE_TIME="$(date +%H:%M)"
-DATE_YEAR="$(date +%Y)"
-
-# All state of a chat session lives under CHAT_SAVE_DIR.
-LOG="${CHAT_SAVE_DIR}/main.log"
-LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
-CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
-CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
-NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
-NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
-
-# Extended regexes used to pull token counts out of the llama-cli log.
-SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
-SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
-# sed script: delete everything from the first line that starts with
-# "<user>:", "<ai>:" or "..." through the end of the file.
-SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
-
-CTX_SIZE=2048
-CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
-OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
-
-# An unbuffered `tail -c+N`
-# Reads and discards up to $1 characters from stdin, then copies the rest
-# to stdout one character at a time.
-skip_bytes() {
-    LANG=C IFS= read -r -n "$1" -d '' c
-    while LANG=C IFS= read -r -n 1 -d '' c; do
-        printf '%s' "$c"
-    done
-}
-
-mkdir -p "$CHAT_SAVE_DIR"
-echo >"$LOG"
-# Show the end of the log whenever the script exits.
-# NOTE(review): ${LOG} is expanded when the trap is installed and is unquoted
-# inside the trap command - a path with spaces would presumably break it.
-trap "tail -n100 ${LOG}" EXIT
-
-# First run: render the prompt template into the current prompt file.
-if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
-    sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
-        -e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
-        -e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
-        -e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
-        "$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
-fi
-
-# The next prompt starts as the current prompt with all chat messages removed.
-if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
-    sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
-fi
-
-# Make sure the next prompt ends with a "..." marker line.
-if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
-    echo '...' >>"$NEXT_PROMPT_FILE"
-fi
-
-if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
-    echo 'Prompt cache does not exist, building...'
-    # Default batch_size to 64 here for better user feedback during initial prompt processing
-    ./llama-cli 2>>"$LOG" \
-        --batch_size 64 \
-        "${OPTS[@]}" \
-        --prompt-cache "$PROMPT_CACHE_FILE" \
-        --file "$CUR_PROMPT_FILE" \
-        --n_predict 1
-    echo
-    echo 'Done!'
-fi
-
-# Seed both working caches from the base prompt cache.
-if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
-    cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
-fi
-if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
-    cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
-fi
-
-# Print the conversation so far, then read user input line by line.
-printf '%s ' "$(< "$CUR_PROMPT_FILE")"
-n_tokens=0
-
-while read -e line; do
-    # Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
-    n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
-
-    # Swap prompts when we're about to run out of context
-    if ((n_predict <= 0)); then
-        wait # for background main (below) to finish with next prompt
-        mv "$NEXT_PROMPT_FILE"  "$CUR_PROMPT_FILE"
-        mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"
-
-        # Start a fresh "next" prompt/cache for the rotation after this one.
-        sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
-        echo '...' >>"$NEXT_PROMPT_FILE"
-        cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
-
-        n_tokens=0
-        n_predict=$((CTX_SIZE / 2))
-    fi
-
-    # Record the user's line; past the rotate point it is mirrored into the next prompt.
-    echo " ${line}" >>"$CUR_PROMPT_FILE"
-    if ((n_tokens > CTX_ROTATE_POINT)); then
-        echo " ${line}" >>"$NEXT_PROMPT_FILE"
-    fi
-
-    # Size of the prompt before the AI name prefix is appended; used below to
-    # separate the new generation from the echoed prompt.
-    n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))
-
-    printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
-
-    ./llama-cli 2>>"$LOG" "${OPTS[@]}" \
-            --prompt-cache "$CUR_PROMPT_CACHE" \
-            --prompt-cache-all \
-            --file "$CUR_PROMPT_FILE" \
-            --reverse-prompt "${USER_NAME}:" \
-            --n_predict "$n_predict" |
-        skip_bytes 1 |                  # skip BOS token added by ./llama-cli
-        tee "$CUR_PROMPT_FILE.tmp" |    # save prompt + generation to tmp file
-        skip_bytes "$n_prompt_len_pre"  # print generation
-
-    # The tmp file now holds prompt + generation and becomes the current prompt.
-    mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"
-
-    # if we hit n_predict instead of reverse-prompt, we need to add the prompt
-    if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
-        printf '\n%s:' "$USER_NAME"
-        printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
-    fi
-
-    printf ' '
-
-    # HACK get num tokens from debug message
-    # TODO get both messages in one go
-    if  ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
-        ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
-        echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
-        exit 1
-    fi
-
-    # Both log messages carry the number of interest after the "/" separator.
-    n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
-
-    # Past the rotate point, also append the text added after the user's line to the next prompt.
-    if ((n_tokens > CTX_ROTATE_POINT)); then
-        tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
-    fi
-
-    # Update cache for next prompt in background, ideally during user input
-    ./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
-          --prompt-cache "$NEXT_PROMPT_CACHE" \
-          --file "$NEXT_PROMPT_FILE" \
-          --n_predict 1 &
-done
--- a/examples/chat-vicuna.sh
+++ b/examples/chat-vicuna.sh
@ -1,41 +0,0 @@
-#!/bin/bash
-#
-# Interactive chat launcher using the "### Human" / "### Assistant" speaker
-# names: renders the prompt template, then starts llama-cli on the rendered
-# prompt. MODEL, PROMPT_TEMPLATE, N_THREAD, N_PREDICTS and GEN_OPTIONS can be
-# overridden through the environment; additional command-line arguments are
-# forwarded to llama-cli unchanged.
-
-set -e
-
-cd "$(dirname "$0")/.." || exit
-
-MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
-PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
-USER_NAME="### Human"
-AI_NAME="### Assistant"
-
-# Adjust to the number of CPU cores you want to use.
-N_THREAD="${N_THREAD:-8}"
-# Number of tokens to predict (made it larger than default because we want a long interaction)
-N_PREDICTS="${N_PREDICTS:-2048}"
-
-# Note: you can also override the generation options by specifying them on the command line:
-# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
-GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
-
-DATE_TIME=$(date +%H:%M)
-DATE_YEAR=$(date +%Y)
-
-PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
-# Do not leave the rendered prompt behind in the temp directory.
-trap 'rm -f "$PROMPT_FILE"' EXIT
-
-# Fill the [[...]] placeholders of the template. The paths are quoted so
-# that a template or temp directory containing spaces still works.
-sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
-     "$PROMPT_TEMPLATE" > "$PROMPT_FILE"
-
-# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./bin/llama-cli $GEN_OPTIONS \
-  --model "$MODEL" \
-  --threads "$N_THREAD" \
-  --n_predict "$N_PREDICTS" \
-  --color --interactive \
-  --file "${PROMPT_FILE}" \
-  --reverse-prompt "### Human:" \
-  --in-prefix ' ' \
-  "$@"
--- a/examples/chat.sh
+++ b/examples/chat.sh
@ -1,16 +0,0 @@
-#!/bin/bash
-
-#
-# Temporary script - will be removed in the future
-#
-
-# Run from the parent of the directory that contains this script. The path is
-# quoted so a checkout location with spaces works, and the script stops
-# instead of launching llama-cli from the wrong directory if the cd fails.
-cd "$(dirname "$0")/.." || exit
-
-# Important:
-#
-#   "--keep 48" is based on the contents of prompts/chat-with-bob.txt
-#
-./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
-    --repeat_penalty 1.0 --color -i \
-    -r "User:" -f prompts/chat-with-bob.txt
--- a/examples/convert_legacy_llama.py
+++ b/examples/convert_legacy_llama.py
--- a/examples/json_schema_pydantic_example.py
+++ b/examples/json_schema_pydantic_example.py
@ -1,82 +0,0 @@
-# Usage:
-#! ./llama-server -m some-model.gguf &
-#! pip install pydantic
-#! python json_schema_pydantic_example.py
-
-from pydantic import BaseModel, Field, TypeAdapter
-from annotated_types import MinLen
-from typing import Annotated, List, Optional
-import json, requests
-
-# Two interchangeable definitions of create_completion: the first branch is a
-# self-contained implementation on top of `requests`; the `else` branch (not
-# executed while the condition is the literal True) shows the Instructor variant.
-if True:
-
-    def create_completion(*, response_model=None, endpoint="http://localhost:8080/v1/chat/completions", messages, **kwargs):
-        '''
-        Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support
-        (llama.cpp server, llama-cpp-python, Anyscale / Together...)
-
-        The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
-
-        Keyword arguments:
-            response_model: optional type; when given, its JSON schema is sent with the
-                request and the reply is validated against it.
-            endpoint: URL the request is POSTed to.
-            messages: list of chat message dicts.
-            **kwargs: forwarded verbatim in the JSON request body.
-
-        Returns the validated object when response_model is given, otherwise the raw
-        message content of the first choice.
-
-        Raises Exception with the server-provided message when the reply contains an 'error' key.
-        '''
-        response_format = None
-        type_adapter = None
-
-        if response_model:
-            type_adapter = TypeAdapter(response_model)
-            schema = type_adapter.json_schema()
-            # Prepend a system message that spells the schema out for the model.
-            messages = [{
-                "role": "system",
-                "content": f"You respond in JSON format with the following schema: {json.dumps(schema, indent=2)}"
-            }] + messages
-            response_format={"type": "json_object", "schema": schema}
-
-        data = requests.post(endpoint, headers={"Content-Type": "application/json"},
-                             json=dict(messages=messages, response_format=response_format, **kwargs)).json()
-        if 'error' in data:
-            raise Exception(data['error']['message'])
-
-        content = data["choices"][0]["message"]["content"]
-        return type_adapter.validate_json(content) if type_adapter else content
-
-else:
-
-    # This alternative branch uses Instructor + OpenAI client lib.
-    # Instructor support streamed iterable responses, retry & more.
-    # (see https://python.useinstructor.com/)
-    #! pip install instructor openai
-    import instructor, openai
-    client = instructor.patch(
-        openai.OpenAI(api_key="123", base_url="http://localhost:8080"),
-        mode=instructor.Mode.JSON_SCHEMA)
-    create_completion = client.chat.completions.create
-
-
-# Demo: request a schema-constrained "pyramidal summary" and print the parsed result.
-if __name__ == '__main__':
-
-    # One question/answer pair; `stars` is constrained to the range 1..5.
-    class QAPair(BaseModel):
-        class Config:
-            extra = 'forbid'  # triggers additionalProperties: false in the JSON schema
-        question: str
-        concise_answer: str
-        justification: str
-        stars: Annotated[int, Field(ge=1, le=5)]
-
-    # Recursive summary node: each section may contain nested sub-sections of the same type.
-    class PyramidalSummary(BaseModel):
-        class Config:
-            extra = 'forbid'  # triggers additionalProperties: false in the JSON schema
-        title: str
-        summary: str
-        question_answers: Annotated[List[QAPair], MinLen(2)]
-        sub_sections: Optional[Annotated[List['PyramidalSummary'], MinLen(2)]]
-
-    print("# Summary\n", create_completion(
-        model="...",
-        response_model=PyramidalSummary,
-        messages=[{
-            "role": "user",
-            "content": f"""
-                You are a highly efficient corporate document summarizer.
-                Create a pyramidal summary of an imaginary internal document about our company processes
-                (starting high-level, going down to each sub sections).
-                Keep questions short, and answers even shorter (trivia / quizz style).
-            """
-        }]))
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@ -1,811 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import itertools
-import json
-import re
-import sys
-from typing import Any, List, Optional, Set, Tuple, Union
-
-def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
-    '''
-        Builds the grammar fragment that repeats `item_rule` between `min_items`
-        and `max_items` times (`max_items=None` means unbounded), optionally
-        interleaving `separator_rule` between consecutive items.
-
-        Returns an empty string when `max_items` is 0: nothing may be emitted.
-        Previously that case produced the invalid quantifier `{0,-1}` whenever a
-        separator was given.
-    '''
-    if max_items == 0:
-        return ''
-
-    if min_items == 0 and max_items == 1:
-        return f'{item_rule}?'
-
-    if not separator_rule:
-        if min_items == 1 and max_items is None:
-            return f'{item_rule}+'
-        elif min_items == 0 and max_items is None:
-            return f'{item_rule}*'
-        else:
-            return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'
-
-    # With a separator: one leading item followed by a repetition of
-    # "(separator item)" with both bounds lowered by one.
-    tail = _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
-    # The tail is empty when exactly one item is allowed; avoid a dangling space.
-    result = f'{item_rule} {tail}' if tail else item_rule
-    return f'({result})?' if min_items == 0 else result
-
-def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
-    '''
-        Appends to `out` the string fragments of a grammar expression for the
-        integers bounded by `min_value` and/or `max_value` (either bound may be
-        None, but not both).
-
-        `decimals_left` caps the number of digits emitted for open-ended ranges.
-        `top_level` is False in the recursive calls that describe the remaining
-        digits of a number; in that mode leading zeros are allowed
-        (presumably because a leading digit was already emitted - confirm).
-
-        Returns None; the result is the concatenation of the items appended to `out`.
-        Raises RuntimeError when both bounds are None.
-    '''
-    has_min = min_value != None
-    has_max = max_value != None
-
-    # Emits a one-character class: "[c]" for a single digit, "[a-b]" otherwise.
-    def digit_range(from_char: str, to_char: str):
-        out.append("[")
-        if from_char == to_char:
-            out.append(from_char)
-        else:
-            out.append(from_char)
-            out.append("-")
-            out.append(to_char)
-        out.append("]")
-
-    # Emits "[0-9]" followed by a {min} / {min,max} / {min,} quantifier
-    # (no quantifier for exactly one digit; sys.maxsize means "no upper bound").
-    def more_digits(min_digits: int, max_digits: int):
-        out.append("[0-9]")
-        if min_digits == max_digits and min_digits == 1:
-            return
-        out.append("{")
-        out.append(str(min_digits))
-        if max_digits != min_digits:
-            out.append(",")
-            if max_digits != sys.maxsize:
-                out.append(str(max_digits))
-        out.append("}")
-
-    # Emits an expression for the digit strings between from_str and to_str.
-    # Assumes both strings have the same length (to_str is indexed with
-    # positions taken from from_str) - TODO confirm against callers.
-    def uniform_range(from_str: str, to_str: str):
-        # Length of the common prefix, emitted once as a quoted literal.
-        i = 0
-        while i < len(from_str) and from_str[i] == to_str[i]:
-            i += 1
-        if i > 0:
-            out.append("\"")
-            out.append(from_str[:i])
-            out.append("\"")
-        if i < len(from_str):
-            if i > 0:
-                out.append(" ")
-            # Number of digits that follow the first differing position.
-            sub_len = len(from_str) - i - 1
-            if sub_len > 0:
-                from_sub = from_str[i+1:]
-                to_sub = to_str[i+1:]
-                sub_zeros = "0" * sub_len
-                sub_nines = "9" * sub_len
-
-                # Set once the alternatives emitted so far already include to_str's leading digit.
-                to_reached = False
-                out.append("(")
-                if from_sub == sub_zeros:
-                    digit_range(from_str[i], chr(ord(to_str[i]) - 1))
-                    out.append(" ")
-                    more_digits(sub_len, sub_len)
-                else:
-                    out.append("[")
-                    out.append(from_str[i])
-                    out.append("] ")
-                    out.append("(")
-                    uniform_range(from_sub, sub_nines)
-                    out.append(")")
-                    if ord(from_str[i]) < ord(to_str[i]) - 1:
-                        out.append(" | ")
-                        if to_sub == sub_nines:
-                            digit_range(chr(ord(from_str[i]) + 1), to_str[i])
-                            to_reached = True
-                        else:
-                            digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1))
-                        out.append(" ")
-                        more_digits(sub_len, sub_len)
-                if not to_reached:
-                    out.append(" | ")
-                    digit_range(to_str[i], to_str[i])
-                    out.append(" ")
-                    uniform_range(sub_zeros, to_sub)
-                out.append(")")
-            else:
-                # Only the last digit differs: a plain character range is enough.
-                out.append("[")
-                out.append(from_str[i])
-                out.append("-")
-                out.append(to_str[i])
-                out.append("]")
-
-    # Case 1: both bounds are given.
-    if has_min and has_max:
-        if min_value < 0 and max_value < 0:
-            # Entirely negative: emit "-" followed by the mirrored positive range.
-            out.append("\"-\" (")
-            _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
-            out.append(")")
-            return
-
-        if min_value < 0:
-            # Straddles zero: emit the negative part, then continue from 0.
-            out.append("\"-\" (")
-            _generate_min_max_int(0, -min_value, out, decimals_left, top_level=True)
-            out.append(") | ")
-            min_value = 0
-
-        min_s = str(min_value)
-        max_s = str(max_value)
-        min_digits = len(min_s)
-        max_digits = len(max_s)
-
-        # One alternative per digit count, so uniform_range gets equal-length bounds.
-        for digits in range(min_digits, max_digits):
-            uniform_range(min_s, "9" * digits)
-            min_s = "1" + "0" * digits
-            out.append(" | ")
-        uniform_range(min_s, max_s)
-        return
-
-    less_decimals = max(decimals_left - 1, 1)
-
-    # Case 2: only a lower bound is given.
-    if has_min:
-        if min_value < 0:
-            out.append("\"-\" (")
-            _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
-            out.append(") | [0] | [1-9] ")
-            more_digits(0, decimals_left - 1)
-        elif min_value == 0:
-            if top_level:
-                out.append("[0] | [1-9] ")
-                more_digits(0, less_decimals)
-            else:
-                more_digits(1, decimals_left)
-        elif min_value <= 9:
-            c = str(min_value)
-            range_start = '1' if top_level else '0'
-            if c > range_start:
-                digit_range(range_start, chr(ord(c) - 1))
-                out.append(" ")
-                more_digits(1, less_decimals)
-                out.append(" | ")
-            digit_range(c, "9")
-            out.append(" ")
-            more_digits(0, less_decimals)
-        else:
-            min_s = str(min_value)
-            length = len(min_s)
-            c = min_s[0]
-
-            if c > "1":
-                digit_range("1" if top_level else "0", chr(ord(c) - 1))
-                out.append(" ")
-                more_digits(length, less_decimals)
-                out.append(" | ")
-            digit_range(c, c)
-            out.append(" (")
-            # Same leading digit: recurse on the remaining digits of the bound.
-            _generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False)
-            out.append(")")
-            if c < "9":
-                out.append(" | ")
-                digit_range(chr(ord(c) + 1), "9")
-                out.append(" ")
-                more_digits(length - 1, less_decimals)
-        return
-
-    # Case 3: only an upper bound is given.
-    if has_max:
-        if max_value >= 0:
-            if top_level:
-                out.append("\"-\" [1-9] ")
-                more_digits(0, less_decimals)
-                out.append(" | ")
-            _generate_min_max_int(0, max_value, out, decimals_left, top_level=True)
-        else:
-            out.append("\"-\" (")
-            _generate_min_max_int(-max_value, None, out, decimals_left, top_level=False)
-            out.append(")")
-        return
-
-    raise RuntimeError("At least one of min_value or max_value must be set")
-
-class BuiltinRule:
-    """A predefined grammar rule: its body text plus the list of dependencies given with it."""
-
-    def __init__(self, content: str, deps: list | None = None):
-        # A missing (or empty) dependency list becomes a fresh empty list.
-        dependencies = deps if deps else []
-        self.deps = dependencies
-        self.content = content
-
-# Constraining spaces to prevent model "running away".
-SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
-
-# Built-in rules, keyed by rule name. Each entry pairs the rule body with the
-# names of the other rules that body refers to.
-PRIMITIVE_RULES = {
-    'boolean'      : BuiltinRule('("true" | "false") space', []),
-    'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
-    'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
-    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
-    'integer'      : BuiltinRule('("-"? integral-part) space', ['integral-part']),
-    'value'        : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
-    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
-    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
-    'uuid'         : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
-    'char'         : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
-    'string'       : BuiltinRule(r'"\"" char* "\"" space', ['char']),
-    'null'         : BuiltinRule('"null" space', []),
-}
-
-# TODO: support "uri", "email" string formats
-# The "*-string" variants wrap the bare format rule in double quotes.
-STRING_FORMAT_RULES = {
-    'date'            : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
-    'time'            : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
-    'date-time'       : BuiltinRule('date "T" time', ['date', 'time']),
-    'date-string'     : BuiltinRule('"\\"" date "\\"" space', ['date']),
-    'time-string'     : BuiltinRule('"\\"" time "\\"" space', ['time']),
-    'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
-}
-
-# Rule bodies for the regex "." metacharacter: any code point (DOTALL), or
-# anything except the \x0A and \x0D line-break characters (DOT).
-DOTALL = '[\\U00000000-\\U0010FFFF]'
-DOT = '[^\\x0A\\x0D]'
-
-# "root", "dot" and every built-in rule name.
-RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
-
-# Runs of characters outside [a-zA-Z0-9-].
-INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
-# Characters that get escaped inside a quoted literal / inside a character range.
-GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
-GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
-GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
-
-NON_LITERAL_SET = set('|.()[]{}*+?')
-ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')
-
-
-class SchemaConverter:
-    def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
-        """Store the conversion options and set up empty rule/reference tables."""
-        # Options, kept verbatim for the other methods.
-        self._raw_pattern = raw_pattern
-        self._dotall = dotall
-        self._allow_fetch = allow_fetch
-        self._prop_order = prop_order
-        # Rule table, pre-seeded with the whitespace rule.
-        self._rules = dict(space=SPACE_RULE)
-        # Resolved $ref targets, and the set of refs currently being resolved.
-        self._refs = dict()
-        self._refs_being_resolved = set()
-
-    def _format_literal(self, literal):
-        """Return `literal` wrapped in double quotes, with every character matched
-        by GRAMMAR_LITERAL_ESCAPE_RE replaced by its escape sequence."""
-        def escape(match):
-            ch = match.group(0)
-            replacement = GRAMMAR_LITERAL_ESCAPES.get(ch)
-            # Characters without a (non-empty) escape are kept unchanged.
-            return replacement if replacement else ch
-
-        return '"' + GRAMMAR_LITERAL_ESCAPE_RE.sub(escape, literal) + '"'
-
-    def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
-        '''
-            Builds a parenthesised grammar expression from the characters of
-            `literal`: a negated class for each character, nested so that each
-            level is only reached after the previous characters matched.
-
-            not_literal('a') -> '([^a])'
-            not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)'
-
-            With maybe_escaped_underscores=True, an underscore additionally gets an
-            alternative accepting it with an optional preceding backslash.
-            The `dotall` parameter is not used by this method.
-
-            Raises AssertionError for an empty literal.
-        '''
-        assert len(literal) > 0, 'Empty literal not supported'
-        # Yields the fragments for position i and, recursively, all later positions.
-        def recurse(i: int):
-            c = literal[i]
-            if maybe_escaped_underscores and c == '_':
-                yield f'[^{c}\\\\]'
-                yield ' | '
-                yield f'"\\\\"? "{c}"'
-            else:
-                yield f'[^{c}]'
-            if i < len(literal) - 1:
-                yield ' | '
-                yield self._format_literal(c)
-                yield ' ('
-                yield from recurse(i + 1)
-                yield ')?'
-
-        return ''.join(('(', *recurse(0), ')'))
-
-    def _not_strings(self, strings):
-        '''
-            Builds a rule body for a double-quoted string, derived from a trie of
-            `strings`; judging by the name and the negated classes it emits, the
-            intent is to match any string except the given ones - TODO confirm.
-
-            Registers the primitive 'char' rule via self._add_primitive and
-            returns the rule body as a single string.
-        '''
-        # Minimal prefix tree over the characters of the input strings.
-        class TrieNode:
-            def __init__(self):
-                self.children = {}
-                self.is_end_of_string = False
-
-            def insert(self, string):
-                node = self
-                for c in string:
-                    node = node.children.setdefault(c, TrieNode())
-                node.is_end_of_string = True
-
-        trie = TrieNode()
-        for s in strings:
-            trie.insert(s)
-
-        char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
-        out = ['["] ( ']
-
-        # Emits the alternatives for one trie node: one branch per child
-        # character (in sorted order), then a branch for any other character.
-        def visit(node):
-            rejects = []
-            first = True
-            for c in sorted(node.children.keys()):
-                child = node.children[c]
-                rejects.append(c)
-                if first:
-                    first = False
-                else:
-                    out.append(' | ')
-                out.append(f'[{c}]')
-                if child.children:
-                    out.append(f' (')
-                    visit(child)
-                    out.append(')')
-                elif child.is_end_of_string:
-                    # Leaf that ends a listed string: require at least one more character.
-                    out.append(f' {char_rule}+')
-            if node.children:
-                if not first:
-                    out.append(' | ')
-                # Any character that is not a quote and not one of this node's children.
-                out.append(f'[^"{"".join(rejects)}] {char_rule}*')
-        visit(trie)
-
-        # The group is optional unless the empty string is one of `strings`.
-        out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
-        return ''.join(out)
-
-    def _add_rule(self, name, rule):
-        """Register `rule` under a sanitised form of `name` and return the key used.
-
-        Invalid characters in the name are replaced with '-'. If that key already
-        holds a different rule, numeric suffixes 0, 1, 2, ... are tried until a
-        key is found that is free or already holds the same rule.
-        """
-        base = INVALID_RULE_CHARS_RE.sub('-', name)
-        key = base
-        suffix = 0
-        # Keep probing while the candidate key is taken by a different rule.
-        while key in self._rules and self._rules[key] != rule:
-            key = f'{base}{suffix}'
-            suffix += 1
-        self._rules[key] = rule
-        return key
-
-    def resolve_refs(self, schema: dict, url: str):
-        '''
-            Resolves all $ref fields in the given schema, fetching any remote schemas,
-            replacing $ref with absolute reference URL and populating self._refs with the
-            respective referenced (sub)schema dictionaries.
-
-            `schema` is modified in place (local '#/...' refs are rewritten to
-            f'{url}{ref}'); `url` is the base URL the schema is identified by.
-
-            Returns the result of visiting `schema`.
-            Raises ValueError for a $ref that starts with neither 'https://' nor '#/';
-            fails an assertion when a remote ref is met while fetching is disabled
-            or when a path segment of the ref cannot be found in its target.
-        '''
-        def visit(n: dict):
-            if isinstance(n, list):
-                return [visit(x) for x in n]
-            elif isinstance(n, dict):
-                ref = n.get('$ref')
-                # Refs already present in self._refs are not resolved again.
-                if ref is not None and ref not in self._refs:
-                    if ref.startswith('https://'):
-                        assert self._allow_fetch, 'Fetching remote schemas is not allowed (use --allow-fetch for force)'
-                        import requests
-
-                        frag_split = ref.split('#')
-                        base_url = frag_split[0]
-
-                        # Each remote document is fetched and resolved once, cached by its base URL.
-                        target = self._refs.get(base_url)
-                        if target is None:
-                            target = self.resolve_refs(requests.get(ref).json(), base_url)
-                            self._refs[base_url] = target
-
-                        # No fragment: the ref designates the whole remote document.
-                        if len(frag_split) == 1 or frag_split[-1] == '':
-                            return target
-                    elif ref.startswith('#/'):
-                        target = schema
-                        ref = f'{url}{ref}'
-                        n['$ref'] = ref
-                    else:
-                        raise ValueError(f'Unsupported ref {ref}')
-
-                    # Walk the '/'-separated fragment path down into the target document.
-                    for sel in ref.split('#')[-1].split('/')[1:]:
-                        assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
-                        target = target[sel]
-
-                    self._refs[ref] = target
-                else:
-                    for v in n.values():
-                        visit(v)
-
-            return n
-        return visit(schema)
-
-    def _generate_union_rule(self, name, alt_schemas):
-        """Visit every alternative schema and join the results with ' | '.
-
-        Alternative number i is visited under the rule name '<name>-<i>', or
-        '<name>alternative-<i>' when `name` is falsy.
-        """
-        prefix = f'{name}-' if name else f'{name}alternative-'
-        alternatives = []
-        for index, alt_schema in enumerate(alt_schemas):
-            alternatives.append(self.visit(alt_schema, f'{prefix}{index}'))
-        return ' | '.join(alternatives)
-
-    def _visit_pattern(self, pattern, name):
-        '''
-            Transforms a regular expression pattern into a GBNF rule.
-
-            Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions
-            Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
-
-            Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers.
-
-            Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which
-            we define sub-rules to keep the output lean.
-        '''
-
-        assert pattern.startswith('^') and pattern.endswith('$'), 'Pattern must start with "^" and end with "$"'
-        pattern = pattern[1:-1]
-        sub_rule_ids = {}
-
-        i = 0
-        length = len(pattern)
-
-        def to_rule(s: tuple[str, bool]) -> str:
-            (txt, is_literal) = s
-            return "\"" + txt + "\"" if is_literal else txt
-
-        def transform() -> tuple[str, bool]:
-            '''
-                Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
-            '''
-            nonlocal i
-            nonlocal pattern
-            nonlocal sub_rule_ids
-
-            start = i
-            # For each component of this sequence, store its string representation and whether it's a literal.
-            # We only need a flat structure here to apply repetition operators to the last item, and
-            # to merge literals at the end (we're parsing grouped ( sequences ) recursively and don't treat '|' specially;
-            # GBNF's syntax is luckily very close to regular expressions!)
-            seq: list[tuple[str, bool]] = []
-
-            def get_dot():
-                if self._dotall:
-                    rule = DOTALL
-                else:
-                    # Accept any character... except \n and \r line break chars (\x0A and \x0D)
-                    rule = DOT
-                return self._add_rule(f'dot', rule)
-
-            def join_seq():
-                nonlocal seq
-                ret = []
-                for is_literal, g in itertools.groupby(seq, lambda x: x[1]):
-                    if is_literal:
-                        ret.append((''.join(x[0] for x in g), True))
-                    else:
-                        ret.extend(g)
-                if len(ret) == 1:
-                    return ret[0]
-                return (' '.join(to_rule(x) for x in seq), False)
-
-            while i < length:
-                c = pattern[i]
-                if c == '.':
-                    seq.append((get_dot(), False))
-                    i += 1
-                elif c == '(':
-                    i += 1
-                    if i < length:
-                        assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/'
-                    seq.append((f'({to_rule(transform())})', False))
-                elif c == ')':
-                    i += 1
-                    assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}'
-                    return join_seq()
-                elif c == '[':
-                    square_brackets = c
-                    i += 1
-                    while i < length and pattern[i] != ']':
-                        if pattern[i] == '\\':
-                            square_brackets += pattern[i:i+2]
-                            i += 2
-                        else:
-                            square_brackets += pattern[i]
-                            i += 1
-                    assert i < length, f'Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}'
-                    square_brackets += ']'
-                    i += 1
-                    seq.append((square_brackets, False))
-                elif c == '|':
-                    seq.append(('|', False))
-                    i += 1
-                elif c in ('*', '+', '?'):
-                    seq[-1] = (to_rule(seq[-1]) + c, False)
-                    i += 1
-                elif c == '{':
-                    curly_brackets = c
-                    i += 1
-                    while i < length and pattern[i] != '}':
-                        curly_brackets += pattern[i]
-                        i += 1
-                    assert i < length, f'Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}'
-                    curly_brackets += '}'
-                    i += 1
-                    nums = [s.strip() for s in curly_brackets[1:-1].split(',')]
-                    min_times = 0
-                    max_times = None
-                    try:
-                        if len(nums) == 1:
-                            min_times = int(nums[0])
-                            max_times = min_times
-                        else:
-                            assert len(nums) == 2
-                            min_times = int(nums[0]) if nums[0] else 0
-                            max_times = int(nums[1]) if nums[1] else None
-                    except ValueError:
-                        raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/')
-
-                    (sub, sub_is_literal) = seq[-1]
-
-                    if not sub_is_literal:
-                        id = sub_rule_ids.get(sub)
-                        if id is None:
-                            id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
-                            sub_rule_ids[sub] = id
-                        sub = id
-
-                    seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False)
-                else:
-                    literal = ''
-                    while i < length:
-                        if pattern[i] == '\\' and i < length - 1:
-                            next = pattern[i + 1]
-                            if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS:
-                                i += 1
-                                literal += pattern[i]
-                                i += 1
-                            else:
-                                literal += pattern[i:i+2]
-                                i += 2
-                        elif pattern[i] == '"' and not self._raw_pattern:
-                            literal += '\\"'
-                            i += 1
-                        elif pattern[i] not in NON_LITERAL_SET and \
-                                (i == length - 1 or literal == '' or pattern[i+1] == '.' or pattern[i+1] not in NON_LITERAL_SET):
-                            literal += pattern[i]
-                            i += 1
-                        else:
-                            break
-                    if literal:
-                        seq.append((literal, True))
-
-            return join_seq()
-
-        return self._add_rule(
-            name,
-            to_rule(transform()) if self._raw_pattern \
-                else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
-
-
-    def _resolve_ref(self, ref):
-        ref_name = ref.split('/')[-1]
-        if ref_name not in self._rules and ref not in self._refs_being_resolved:
-            self._refs_being_resolved.add(ref)
-            resolved = self._refs[ref]
-            ref_name = self.visit(resolved, ref_name)
-            self._refs_being_resolved.remove(ref)
-        return ref_name
-
-    def _generate_constant_rule(self, value):
-        return self._format_literal(json.dumps(value))
-
-    def visit(self, schema, name):
-        schema_type = schema.get('type')
-        schema_format = schema.get('format')
-        rule_name = name + '-' if name in RESERVED_NAMES else name or 'root'
-
-        if (ref := schema.get('$ref')) is not None:
-            return self._add_rule(rule_name, self._resolve_ref(ref))
-
-        elif 'oneOf' in schema or 'anyOf' in schema:
-            return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
-
-        elif isinstance(schema_type, list):
-            return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))
-
-        elif 'const' in schema:
-            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
-
-        elif 'enum' in schema:
-            rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
-            return self._add_rule(rule_name, rule)
-
-        elif schema_type in (None, 'object') and \
-             ('properties' in schema or \
-              ('additionalProperties' in schema and schema['additionalProperties'] is not True)):
-            required = set(schema.get('required', []))
-            properties = list(schema.get('properties', {}).items())
-            return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties')))
-
-        elif schema_type in (None, 'object') and 'allOf' in schema:
-            required = set()
-            properties = []
-            hybrid_name = name
-            def add_component(comp_schema, is_required):
-                if (ref := comp_schema.get('$ref')) is not None:
-                    comp_schema = self._refs[ref]
-
-                if 'properties' in comp_schema:
-                    for prop_name, prop_schema in comp_schema['properties'].items():
-                        properties.append((prop_name, prop_schema))
-                        if is_required:
-                            required.add(prop_name)
-
-            for t in schema['allOf']:
-                if 'anyOf' in t:
-                    for tt in t['anyOf']:
-                        add_component(tt, is_required=False)
-                else:
-                    add_component(t, is_required=True)
-
-            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
-
-        elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
-            items = schema.get('items') or schema['prefixItems']
-            if isinstance(items, list):
-                return self._add_rule(
-                    rule_name,
-                    '"[" space ' +
-                    ' "," space '.join(
-                        self.visit(item, f'{name}{"-" if name else ""}tuple-{i}')
-                        for i, item in enumerate(items)) +
-                    ' "]" space')
-            else:
-                item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
-                min_items = schema.get("minItems", 0)
-                max_items = schema.get("maxItems")
-                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
-
-        elif schema_type in (None, 'string') and 'pattern' in schema:
-            return self._visit_pattern(schema['pattern'], rule_name)
-
-        elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
-            return self._add_primitive(
-                'root' if rule_name == 'root' else schema_format,
-                PRIMITIVE_RULES['uuid']
-            )
-
-        elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
-            prim_name = f'{schema_format}-string'
-            return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
-
-        elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
-            char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
-            min_len = schema.get('minLength', 0)
-            max_len = schema.get('maxLength')
-
-            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
-
-        elif schema_type in (None, 'integer') and \
-                ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
-            min_value = None
-            max_value = None
-            if 'minimum' in schema:
-                min_value = schema['minimum']
-            elif 'exclusiveMinimum' in schema:
-                min_value = schema['exclusiveMinimum'] + 1
-            if 'maximum' in schema:
-                max_value = schema['maximum']
-            elif 'exclusiveMaximum' in schema:
-                max_value = schema['exclusiveMaximum'] - 1
-
-            out = ["("]
-            _generate_min_max_int(min_value, max_value, out)
-            out.append(") space")
-            return self._add_rule(rule_name, ''.join(out))
-
-        elif (schema_type == 'object') or (len(schema) == 0):
-            return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
-
-        else:
-            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
-            # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-            return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
-
-    def _add_primitive(self, name: str, rule: BuiltinRule):
-        n = self._add_rule(name, rule.content)
-
-        for dep in rule.deps:
-            dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
-            assert dep_rule, f'Rule {dep} not known'
-            if dep not in self._rules:
-                self._add_primitive(dep, dep_rule)
-        return n
-
-    def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]):
-        prop_order = self._prop_order
-        # sort by position in prop_order (if specified) then by original order
-        sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))]
-
-        prop_kv_rule_names = {}
-        for prop_name, prop_schema in properties:
-            prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
-            prop_kv_rule_names[prop_name] = self._add_rule(
-                f'{name}{"-" if name else ""}{prop_name}-kv',
-                fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}'
-            )
-        required_props = [k for k in sorted_props if k in required]
-        optional_props = [k for k in sorted_props if k not in required]
-
-        if additional_properties is not None and additional_properties != False:
-            sub_name = f'{name}{"-" if name else ""}additional'
-            value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
-                self._add_primitive('value', PRIMITIVE_RULES['value'])
-            key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \
-                else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props))
-
-            prop_kv_rule_names["*"] = self._add_rule(
-                f'{sub_name}-kv',
-                f'{key_rule} ":" space {value_rule}'
-            )
-            optional_props.append("*")
-
-        rule = '"{" space '
-        rule += ' "," space '.join(prop_kv_rule_names[k] for k in required_props)
-
-        if optional_props:
-            rule += ' ('
-            if required_props:
-                rule += ' "," space ( '
-
-            def get_recursive_refs(ks, first_is_optional):
-                [k, *rest] = ks
-                kv_rule_name = prop_kv_rule_names[k]
-                comma_ref = f'( "," space {kv_rule_name} )'
-                if first_is_optional:
-                    res = comma_ref + ('*' if k == '*' else '?')
-                else:
-                    res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '')
-                if len(rest) > 0:
-                    res += ' ' + self._add_rule(
-                        f'{name}{"-" if name else ""}{k}-rest',
-                        get_recursive_refs(rest, first_is_optional=True)
-                    )
-                return res
-
-            rule += ' | '.join(
-                get_recursive_refs(optional_props[i:], first_is_optional=False)
-                for i in range(len(optional_props))
-            )
-            if required_props:
-                rule += ' )'
-            rule += ' )?'
-
-        rule += ' "}" space'
-
-        return rule
-
-    def format_grammar(self):
-        return '\n'.join(
-            f'{name} ::= {rule}'
-            for name, rule in sorted(self._rules.items(), key=lambda kv: kv[0])
-        )
-
-
-def main(args_in = None):
-    parser = argparse.ArgumentParser(
-        description='''
-            Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
-            given JSON schema. Only a subset of JSON schema features are supported; more may be
-            added in the future.
-        ''',
-    )
-    parser.add_argument(
-        '--prop-order',
-        default=[],
-        type=lambda s: s.split(','),
-        help='''
-            comma-separated property names defining the order of precedence for object properties;
-            properties not specified here are given lower precedence than those that are, and
-            are kept in their original order from the schema. Required properties are always
-            given precedence over optional properties.
-        '''
-    )
-    parser.add_argument(
-        '--allow-fetch',
-        action='store_true',
-        default=False,
-        help='Whether to allow fetching referenced schemas over HTTPS')
-    parser.add_argument(
-        '--dotall',
-        action='store_true',
-        default=False,
-        help='Whether to treat dot (".") as matching all chars including line breaks in regular expression patterns')
-    parser.add_argument(
-        '--raw-pattern',
-        action='store_true',
-        default=False,
-        help='Treats string patterns as raw patterns w/o quotes (or quote escapes)')
-
-    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
-    args = parser.parse_args(args_in)
-
-    if args.schema.startswith('https://'):
-        url = args.schema
-        import requests
-        schema = requests.get(url).json()
-    elif args.schema == '-':
-        url = 'stdin'
-        schema = json.load(sys.stdin)
-    else:
-        url = f'file://{args.schema}'
-        with open(args.schema) as f:
-            schema = json.load(f)
-    converter = SchemaConverter(
-        prop_order={name: idx for idx, name in enumerate(args.prop_order)},
-        allow_fetch=args.allow_fetch,
-        dotall=args.dotall,
-        raw_pattern=args.raw_pattern)
-    schema = converter.resolve_refs(schema, url)
-    converter.visit(schema, '')
-    print(converter.format_grammar())
-
-
-if __name__ == '__main__':
-    main()
--- a/examples/llama.vim
+++ b/examples/llama.vim
@ -1,135 +0,0 @@
-" Requires an already running llama.cpp server
-" To install either copy or symlink to ~/.vim/autoload/llama.vim
-" Then start with either :call llama#doLlamaGen(),
-" or add a keybind to your vimrc such as
-" nnoremap Z :call llama#doLlamaGen()<CR>
-" Similarly, you could add an insert mode keybind with
-" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
-"
-" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
-" let g:llama_api_url = "192.168.1.10:8080"
-" llama_overrides can also be set through buffer/window scopes. For instance
-" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
-" Could be added to your .vimrc to automatically set a lower temperature when
-" editing a python script
-" Additionally, an override dict can be stored at the top of a file
-" !*{"stop": ["User:"]}
-" Could be added to the start of your chatlog.txt to set the stopping token
-" These parameter dicts are merged together from lowest to highest priority:
-" server default -> g:llama_overrides -> w:llama_overrides ->
-" b:llama_overrides -> in file (!*) overrides
-"
-" Sublists (like logit_bias and stop) are overridden, not merged
-" Example override:
-" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
-if !exists("g:llama_api_url")
-    let g:llama_api_url= "127.0.0.1:8080"
-endif
-if !exists("g:llama_overrides")
-   let g:llama_overrides = {}
-endif
-const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
-const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
-let s:linedict = {}
-
-func s:callbackHandler(bufn, channel, msg)
-   if len(a:msg) < 3
-      return
-   elseif a:msg[0] == "d"
-      let l:msg = a:msg[6:-1]
-   else
-      let l:msg = a:msg
-   endif
-   let l:decoded_msg = json_decode(l:msg)
-   let l:newtext = split(l:decoded_msg['content'], "\n", 1)
-   if len(l:newtext) > 0
-      call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
-   else
-      echo "nothing genned"
-   endif
-   if len(newtext) > 1
-      let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
-      let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
-   endif
-   if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
-       echo "Finished generation"
-   endif
-endfunction
-
-func llama#doLlamaGen()
-   if exists("b:job")
-      if job_status(b:job) == "run"
-         call job_stop(b:job)
-         return
-      endif
-   endif
-
-   let l:cbuffer = bufnr("%")
-   let s:linedict[l:cbuffer] = line('$')
-   let l:buflines = getbufline(l:cbuffer, 1, 1000)
-   let l:querydata = copy(s:querydata)
-   call extend(l:querydata, g:llama_overrides)
-   if exists("w:llama_overrides")
-      call extend(l:querydata, w:llama_overrides)
-   endif
-   if exists("b:llama_overrides")
-      call extend(l:querydata, b:llama_overrides)
-   endif
-   if l:buflines[0][0:1] == '!*'
-      let l:userdata = json_decode(l:buflines[0][2:-1])
-      call extend(l:querydata, l:userdata)
-      let l:buflines = l:buflines[1:-1]
-   endif
-   let l:querydata.prompt = join(l:buflines, "\n")
-   let l:curlcommand = copy(s:curlcommand)
-   if exists("g:llama_api_key")
-       call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
-   endif
-   let l:curlcommand[2] = json_encode(l:querydata)
-   let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
-endfunction
-
-" Echoes the tokenization of the provided string, or of the text from the cursor to the end of the word
-" Onus is placed on the user to include the preceding space
-func llama#tokenizeWord(...)
-    if (a:0 > 0)
-        let l:input = a:1
-    else
-        exe "normal \"*ye"
-        let l:input = @*
-    endif
-    let l:querydata = {"content": l:input}
-    let l:curlcommand = copy(s:curlcommand)
-    let l:curlcommand[2] = json_encode(l:querydata)
-    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-   let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
-endfunction
-
-func s:tokenizeWordCallback(plaintext, channel, msg)
-    echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
-endfunction
-
-
-" Echoes the token count of the entire buffer (or provided string)
-" Example usage :echo llama#tokenCount()
-func llama#tokenCount(...)
-    if (a:0 > 0)
-        let l:buflines = a:1
-    else
-        let l:buflines = getline(1,1000)
-        if l:buflines[0][0:1] == '!*'
-            let l:buflines = l:buflines[1:-1]
-        endif
-        let l:buflines = join(l:buflines, "\n")
-    endif
-    let l:querydata = {"content": l:buflines}
-    let l:curlcommand = copy(s:curlcommand)
-    let l:curlcommand[2] = json_encode(l:querydata)
-    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-   let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
-endfunction
-
-func s:tokenCountCallback(channel, msg)
-    let resp = json_decode(a:msg)
-    echo len(resp.tokens)
-endfunction
--- a/examples/llm.vim
+++ b/examples/llm.vim
@ -1,28 +0,0 @@
-" Basic plugin example
-
-function! Llm()
-
-  let url = "http://127.0.0.1:8080/completion"
-
-  " Get the content of the current buffer
-  let buffer_content = join(getline(1, '$'), "\n")
-
-  " Create the JSON payload
-  let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":256,"stop": ["\n\n\n"],"stream": v:false}
-  let json_payload.prompt = buffer_content
-
-  " Define the curl command
-  let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url
-  let response = system(curl_command, json_encode(json_payload))
-
-  " Extract the content field from the response
-  let content = json_decode(response).content
-
-  let split_newlines = split(content, '\n', 1)
-
-  " Insert the content at the cursor position
-  call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:])
-endfunction
-
-command! Llm call Llm()
-noremap <F2> :Llm<CR>
--- a/examples/pydantic_models_to_grammar.py
+++ b/examples/pydantic_models_to_grammar.py
--- a/examples/pydantic_models_to_grammar_examples.py
+++ b/examples/pydantic_models_to_grammar_examples.py
@ -1,312 +0,0 @@
-#!/usr/bin/env python3
-
-"""Function calling example using pydantic models."""
-
-from __future__ import annotations
-
-import argparse
-import datetime
-import json
-import logging
-import textwrap
-import sys
-from enum import Enum
-from typing import Optional, Union
-
-import requests
-from pydantic import BaseModel, Field
-from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model,
-                                        create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
-
-
-def create_completion(host, prompt, gbnf_grammar):
-    """Calls the /completion API on llama-server.
-
-    See
-    https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
-    """
-    print(f"  Request:\n    Grammar:\n{textwrap.indent(gbnf_grammar, '      ')}\n    Prompt:\n{textwrap.indent(prompt.rstrip(), '      ')}")
-    headers = {"Content-Type": "application/json"}
-    data = {"prompt": prompt, "grammar": gbnf_grammar}
-    result = requests.post(f"http://{host}/completion", headers=headers, json=data).json()
-    assert data.get("error") is None, data
-    logging.info("Result: %s", result)
-    content = result["content"]
-    print(f"  Model: {result['model']}")
-    print(f"  Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), '    ')}")
-    return content
-
-
-# A function for the agent to send a message to the user.
-class SendMessageToUser(BaseModel):
-    """Send a message to the User."""
-    chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
-    message: str = Field(..., description="Message you want to send to the user.")
-
-    def run(self):
-        print(f"SendMessageToUser: {self.message}")
-
-
-def example_rce(host):
-    """Minimal test case where the LLM calls an arbitrary Python function."""
-    print("- example_rce")
-    tools = [SendMessageToUser]
-    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
-        pydantic_model_list=tools, outer_object_name="function",
-        outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
-    system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
-    user_message = "What is 42 * 42?"
-    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
-    text = create_completion(host, prompt, gbnf_grammar)
-    json_data = json.loads(text)
-    tools_map = {tool.__name__:tool for tool in tools}
-    # This finds "SendMessageToUser":
-    tool = tools_map.get(json_data["function"])
-    if not tool:
-        print(f"Error: unknown tool {json_data['function']}")
-        return 1
-    tool(**json_data["function_parameters"]).run()
-    return 0
-
-
-# Enum for the calculator tool.
-class MathOperation(Enum):
-    ADD = "add"
-    SUBTRACT = "subtract"
-    MULTIPLY = "multiply"
-    DIVIDE = "divide"
-
-
-# Simple pydantic calculator tool for the agent that can add, subtract,
-# multiply, and divide. Docstring and description of fields will be used in
-# system prompt.
-class Calculator(BaseModel):
-    """Perform a math operation on two numbers."""
-    number_one: Union[int, float] = Field(..., description="First number.")
-    operation: MathOperation = Field(..., description="Math operation to perform.")
-    number_two: Union[int, float] = Field(..., description="Second number.")
-
-    def run(self):
-        if self.operation == MathOperation.ADD:
-            return self.number_one + self.number_two
-        elif self.operation == MathOperation.SUBTRACT:
-            return self.number_one - self.number_two
-        elif self.operation == MathOperation.MULTIPLY:
-            return self.number_one * self.number_two
-        elif self.operation == MathOperation.DIVIDE:
-            return self.number_one / self.number_two
-        else:
-            raise ValueError("Unknown operation.")
-
-
-def example_calculator(host):
-    """Have the LLM ask to get a calculation done.
-
-    Here the grammar gets generated by passing the available function models to
-    generate_gbnf_grammar_and_documentation function. This also generates a
-    documentation usable by the LLM.
-
-    pydantic_model_list is the list of pydantic models. outer_object_name is an
-    optional name for an outer object around the actual model object, like a
-    "function" object with "function_parameters" which contains the actual model
-    object; if None, no outer object will be generated. outer_object_content is
-    the name of the outer object's content.
-
-    model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
-    fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
-    """
-    print("- example_calculator")
-    tools = [SendMessageToUser, Calculator]
-    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
-        pydantic_model_list=tools, outer_object_name="function",
-        outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
-    system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
-    user_message1 = "What is 42 * 42?"
-    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant"
-    text = create_completion(host, prompt, gbnf_grammar)
-    json_data = json.loads(text)
-    expected = {
-        "function": "Calculator",
-        "function_parameters": {
-            "number_one": 42,
-            "operation": "multiply",
-            "number_two": 42
-        }
-    }
-    if json_data != expected:
-        print("  Result is not as expected!")
-    tools_map = {tool.__name__:tool for tool in tools}
-    # This finds "Calculator":
-    tool = tools_map.get(json_data["function"])
-    if not tool:
-        print(f"Error: unknown tool {json_data['function']}")
-        return 1
-    result = tool(**json_data["function_parameters"]).run()
-    print(f"  Call {json_data['function']} gave result {result}")
-    return 0
-
-
-class Category(Enum):
-    """The category of the book."""
-    Fiction = "Fiction"
-    NonFiction = "Non-Fiction"
-
-
-class Book(BaseModel):
-    """Represents an entry about a book."""
-    title: str = Field(..., description="Title of the book.")
-    author: str = Field(..., description="Author of the book.")
-    published_year: Optional[int] = Field(..., description="Publishing year of the book.")
-    keywords: list[str] = Field(..., description="A list of keywords.")
-    category: Category = Field(..., description="Category of the book.")
-    summary: str = Field(..., description="Summary of the book.")
-
-
-def example_struct(host):
-    """An example of structured output based on pydantic models.
-
-    The LLM will create an entry for a Book database out of an unstructured
-    text. We need no additional parameters other than our list of pydantic
-    models.
-
-    Args:
-        host: llama.cpp server address, forwarded to create_completion.
-
-    Returns:
-        0 when the keys of the completion are exactly the six Book fields,
-        1 otherwise. Parsing the completion as JSON or building the Book may
-        raise; neither is caught here.
-    """
-    print("- example_struct")
-    tools = [Book]
-    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
-    system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
-    # Unstructured input text; kept separate from the completion below so the
-    # two are no longer stored in the same variable.
-    source_text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
-    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{source_text}<|im_end|>\n<|im_start|>assistant"
-    completion = create_completion(host, prompt, gbnf_grammar)
-    json_data = json.loads(completion)
-    # In this case, there's no function nor function_parameters.
-    # Here the result will vary based on the LLM used.
-    keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
-    if keys != sorted(json_data.keys()):
-        print(f"Unexpected result: {sorted(json_data.keys())}")
-        return 1
-    book = Book(**json_data)
-    # Plain f-string interpolation instead of an f-string without placeholders
-    # combined with %-formatting; the printed text is the same.
-    print(f"  As a Book object: {book}")
-    return 0
-
-
-def get_current_datetime(output_format: Optional[str] = None):
-    """Get the current date and time in the given format.
-
-    Args:
-         output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
-    """
-    # NOTE(review): the docstring above is left untouched on purpose; it is
-    # presumably parsed by create_dynamic_model_from_function -- confirm.
-    # Any falsy format (None or "") selects the default pattern.
-    pattern = output_format if output_format else "%Y-%m-%d %H:%M:%S"
-    now = datetime.datetime.now()
-    return now.strftime(pattern)
-
-
-# Example function to get the weather.
-#
-# location: free-form place name, matched by substring against the known cities.
-# unit:     an Enum member (its .value is reported), a plain string, or None.
-#           The OpenAI-style tool definition only lists "location" as required,
-#           so a call may arrive without a unit.
-# Returns a JSON string; unknown locations get temperature "unknown" and no unit.
-def get_current_weather(location, unit=None):
-    """Get the current weather in a given location"""
-    # Canned readings for the demo, checked in this order; the first match wins.
-    known_temperatures = (
-        ("London", "42"),
-        ("New York", "24"),
-        ("North Pole", "-42"),
-    )
-    # Accept Enum members as well as raw strings or None instead of assuming
-    # that `.value` always exists.
-    unit_name = getattr(unit, "value", unit)
-    for city, temperature in known_temperatures:
-        if city in location:
-            return json.dumps({"location": city, "temperature": temperature, "unit": unit_name})
-    return json.dumps({"location": location, "temperature": "unknown"})
-
-
-def example_concurrent(host):
-    """Ask for several function calls in one completion and execute each of them.
-
-    The tool list mixes two existing models (SendMessageToUser, Calculator), a
-    model built from a plain Python function and a model built from an OpenAI
-    style function definition.
-
-    Returns 1 if the completion differs from the expected call list or names an
-    unknown tool, otherwise 0.
-    """
-    print("- example_concurrent")
-    # Function definition in OpenAI style.
-    weather_parameters = {
-        "type": "object",
-        "properties": {
-            "location": {
-                "type": "string",
-                "description": "The city and state, e.g. San Francisco, CA",
-            },
-            "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
-        },
-        "required": ["location"],
-    }
-    weather_definition = {
-        "type": "function",
-        "function": {
-            "name": "get_current_weather",
-            "description": "Get the current weather in a given location",
-            "parameters": weather_parameters,
-        },
-    }
-    # OpenAI definition -> pydantic model, then attach the actual Python function.
-    weather_model = add_run_method_to_dynamic_model(
-        convert_dictionary_to_pydantic_model(weather_definition), get_current_weather)
-    # Plain Python function -> pydantic model.
-    datetime_model = create_dynamic_model_from_function(get_current_datetime)
-
-    tools = [SendMessageToUser, Calculator, datetime_model, weather_model]
-    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
-        pydantic_model_list=tools,
-        outer_object_name="function",
-        outer_object_content="params",
-        model_prefix="Function",
-        fields_prefix="Parameters",
-        list_of_outputs=True,
-    )
-    system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
-    user_message = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
-    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
-    completion = create_completion(host, prompt, gbnf_grammar)
-    calls = json.loads(completion)
-    expected = [
-        {"function": "get_current_datetime", "params": {"output_format": "%Y-%m-%d %H:%M:%S"}},
-        {"function": "get_current_weather", "params": {"location": "London", "unit": "celsius"}},
-        {"function": "Calculator", "params": {"number_one": 42, "operation": "multiply", "number_two": 42}},
-    ]
-    status = 0
-    if calls != expected:
-        # A mismatch is reported, but the returned calls are still executed below.
-        print("  Result is not as expected!")
-        print("  This can happen on highly quantized models")
-        status = 1
-    tools_by_name = {tool.__name__: tool for tool in tools}
-    for call in calls:
-        name = call["function"]
-        tool = tools_by_name.get(name)
-        if not tool:
-            print(f"Error: unknown tool {name}")
-            return 1
-        result = tool(**call["params"]).run()
-        print(f"  Call {name} returned {result}")
-    # Should output something like this:
-    #   Call get_current_datetime returned 2024-07-15 09:50:38
-    #   Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
-    #   Call Calculator returned 1764
-    return status
-
-
-def main():
-    """Parse the command line, set up logging and run the examples in order.
-
-    Returns the status of the first example that reports a truthy value, or
-    the (falsy) status of the last example when all of them succeed.
-    """
-    parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
-    parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
-    parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
-    args = parser.parse_args()
-    log_level = logging.INFO if args.verbose else logging.ERROR
-    logging.basicConfig(level=log_level)
-    # Comment out entries below to only run the example you want.
-    examples = (
-        example_rce,
-        example_calculator,
-        example_struct,
-        example_concurrent,
-    )
-    ret = 0
-    for example in examples:
-        # Same short-circuit as a chain of `or`: once one example has reported
-        # a truthy status, the remaining ones are not called.
-        ret = ret or example(args.host)
-    return ret
-
-
-# Run the examples only when executed as a script; the value returned by
-# main() is handed to sys.exit() as the process exit status.
-if __name__ == "__main__":
-    sys.exit(main())
--- a/examples/reason-act.sh
+++ b/examples/reason-act.sh
@ -1,16 +0,0 @@
-#!/bin/bash
-#
-# Start an interactive llama-cli session with ./prompts/reason-act.txt as prompt file.
-#
-# Usage: reason-act.sh [-m MODEL_PATH]
-
-# Work from the parent of this script's directory. Quoting keeps paths with
-# spaces intact, and a failed cd aborts instead of running ./llama-cli from
-# whatever directory the caller happened to be in.
-cd "$(dirname "$0")/.." || exit 1
-
-# get -m model parameter otherwise defer to default
-MODEL_ARGS=()
-if [ "$1" == "-m" ]; then
-  # An array keeps a model path containing spaces as one single argument.
-  MODEL_ARGS=(-m "$2")
-fi
-
-./llama-cli "${MODEL_ARGS[@]}" --color \
-    -f ./prompts/reason-act.txt \
-    -i --interactive-first \
-    --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
-    -r "Question:" -r "Observation:" --in-prefix " " \
-    -n -1
--- a/examples/regex_to_grammar.py
+++ b/examples/regex_to_grammar.py
@ -1,20 +0,0 @@
-"""Print what json_schema_to_grammar.py produces for a regex pattern.
-
-The pattern is wrapped in a JSON schema ({"type": "string", "pattern": ...})
-that is piped to the json_schema_to_grammar.py script located next to this
-file, together with the --raw-pattern flag. Any further command-line
-arguments are forwarded to that script unchanged.
-
-Usage: regex_to_grammar.py PATTERN [json_schema_to_grammar.py options...]
-"""
-import json, subprocess, sys, os
-
-if len(sys.argv) < 2:
-    # An assert would disappear under `python -O`; exit with a usage message instead.
-    sys.exit(f"usage: {sys.argv[0]} PATTERN [json_schema_to_grammar.py options...]")
-[_, pattern, *rest] = sys.argv
-
-converter = os.path.join(
-    os.path.dirname(os.path.realpath(__file__)),
-    "json_schema_to_grammar.py")
-schema = json.dumps({
-    "type": "string",
-    "pattern": pattern,
-}, indent=2)
-
-print(subprocess.check_output(
-    # sys.executable runs the converter with the interpreter that runs this
-    # script, rather than whichever "python" (if any) is first on PATH.
-    [sys.executable, converter, *rest, "-", "--raw-pattern"],
-    text=True,
-    input=schema))
--- a/examples/server-llama2-13B.sh
+++ b/examples/server-llama2-13B.sh
@ -1,26 +0,0 @@
-#!/bin/bash
-#
-# Launch ./llama-server from the parent of this script's directory.
-# MODEL, N_THREAD and GEN_OPTIONS may be preset in the environment; extra
-# command-line arguments are appended to the server invocation.
-
-set -e
-
-cd "$(dirname "$0")/.." || exit
-
-# Model to serve; preset MODEL to pick another file.
-: "${MODEL:=./models/llama-2-13b-chat.ggmlv3.q5_K_M.bin}"
-: "${PROMPT_TEMPLATE:=./prompts/chat-system.txt}"
-
-# Number of CPU cores (threads) handed to the server.
-: "${N_THREAD:=12}"
-
-# Note: the generation options can also be overridden on the command line.
-: "${GEN_OPTIONS:=--ctx_size 4096 --batch-size 1024}"
-
-server_args=(
-  --model "$MODEL"
-  --threads "$N_THREAD"
-  --rope-freq-scale 1.0
-)
-
-# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./llama-server $GEN_OPTIONS "${server_args[@]}" "$@"
-
-# The option below was used to test the model with mps, but is left out of the
-# general-purpose invocation. To use it, just specify it on the command line.
-# -ngl 1 \
--- a/examples/server_embd.py
+++ b/examples/server_embd.py
@ -1,35 +0,0 @@
-import asyncio
-import asyncio.threads
-import requests
-import numpy as np
-
-
-# Number of identical embedding requests sent concurrently.
-n = 8
-
-# Embeddings collected by main(), one entry per response.
-result = []
-
-async def requests_post_async(*args, **kwargs):
-    """Run the blocking requests.post in a worker thread so the calls can overlap."""
-    # asyncio.to_thread is the documented entry point (Python 3.9+);
-    # asyncio.threads is only the module that implements it.
-    return await asyncio.to_thread(requests.post, *args, **kwargs)
-
-async def main():
-    """POST the same content to /embedding n times at once and store the embeddings."""
-    model_url = "http://127.0.0.1:6900"
-    responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
-        url= f"{model_url}/embedding",
-        json= {"content": str(0)*1024}
-    ) for _ in range(n)])
-
-    for response in responses:
-        # Report HTTP errors as such instead of failing later on a missing
-        # "embedding" key in an error body.
-        response.raise_for_status()
-        embedding = response.json()["embedding"]
-        print(embedding[-8:])
-        result.append(embedding)
-
-asyncio.run(main())
-
-# compute cosine similarity
-
-# Convert each embedding and compute its norm once, not once per pair.
-vectors = [np.array(embedding) for embedding in result]
-norms = [np.linalg.norm(vector) for vector in vectors]
-for i in range(n-1):
-    for j in range(i+1, n):
-        similarity = np.dot(vectors[i], vectors[j]) / (norms[i] * norms[j])
-        print(f"Similarity between {i} and {j}: {similarity:.2f}")
--- a/examples/ts-type-to-grammar.sh
+++ b/examples/ts-type-to-grammar.sh
@ -1,28 +0,0 @@
-#!/bin/bash
-#
-# Turn a TypeScript type expression into a JSON schema and pass that schema to
-# ./examples/json_schema_to_grammar.py.
-#
-# ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}"
-# python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json
-#
-set -euo pipefail
-
-# With `set -u` a missing argument would only produce an "unbound variable"
-# error; print a usage line instead.
-if [ "$#" -lt 1 ]; then
-  echo "usage: $0 TYPESCRIPT_TYPE" >&2
-  exit 1
-fi
-
-readonly type="$1"
-
-# Create a temporary directory.
-# It is kept in its own variable: assigning to TMPDIR would also change where
-# mktemp and every child process (npx, ...) put their temporary files.
-WORK_DIR=""
-trap 'rm -fR "$WORK_DIR"' EXIT
-WORK_DIR=$(mktemp -d)
-
-DTS_FILE="$WORK_DIR/type.d.ts"
-SCHEMA_FILE="$WORK_DIR/schema.json"
-
-echo "export type MyType = $type" > "$DTS_FILE"
-
-# This is a fork of typescript-json-schema, actively maintained as of March 2024:
-# https://github.com/vega/ts-json-schema-generator
-npx ts-json-schema-generator --unstable --no-top-ref --path "$DTS_FILE" --type MyType -e none > "$SCHEMA_FILE"
-
-# Alternative, not actively maintained as of March 2024:
-# https://github.com/YousefED/typescript-json-schema
-# npx typescript-json-schema --defaultProps --required "$DTS_FILE" MyType | tee "$SCHEMA_FILE" >&2
-
-./examples/json_schema_to_grammar.py "$SCHEMA_FILE"
--- a/include/llama.h
+++ b/include/llama.h
@ -1163,7 +1163,7 @@ extern "C" {
    // Performance information
    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);

-    LLAMA_API void llama_print_timings(struct llama_context * ctx);
+    LLAMA_API void antigma_print_timings(struct llama_context * ctx);
    LLAMA_API void llama_reset_timings(struct llama_context * ctx);

    // Print system information
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -19097,17 +19097,17 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
    return result;
 }

-void llama_print_timings(struct llama_context * ctx) {
+// Logs a timing summary for `ctx` through LLAMA_LOG_INFO: load, sample,
+// prompt-eval and eval times as returned by llama_get_timings(), followed by
+// the total wall time (t_end_ms - t_start_ms) over n_p_eval + n_eval tokens.
+// NOTE(review): this hunk renames the function from llama_print_timings;
+// callers that still use the old name need to be updated.
+void antigma_print_timings(struct llama_context * ctx) {
    const llama_timings timings = llama_get_timings(ctx);

    LLAMA_LOG_INFO("\n");
-    // LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
-    // LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-    //         __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    // LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-    //         __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    // LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-    //         __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
+    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
    LLAMA_LOG_INFO("Antigma timer:       total time = %10.2f ms / %5d tokens\n", (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }