From 3fe81781e3bf98b8e44946240a19f3a6ad08a11a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Fri, 12 Jan 2024 20:38:54 +0100
Subject: [PATCH 001/138] CUDA: faster q8_0 -> f16 dequantization (#4895)

---
 ggml-cuda.cu | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 2db50437c..bd3814c72 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -523,6 +523,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
 
+#define CUDA_Q8_0_NE_ALIGN 2048
+
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
@@ -2327,6 +2329,45 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
     y[i] = x[i];
 }
 
+template <bool need_check>
+static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
+#if __CUDA_ARCH__ >= CC_PASCAL
+    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
+
+    const int   i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
+    const int * x0 = ((int *) vx) + blockIdx.x * nint;
+    half2 * y2 = (half2 *) (y + i0);
+
+    __shared__ int vals[nint];
+
+#pragma unroll
+    for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
+        if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
+            break;
+        }
+
+        const int ix = ix0 + threadIdx.x;
+        vals[ix] = x0[ix];
+    }
+
+#pragma unroll
+    for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
+        if (need_check && i0 + iy + 2*threadIdx.x >= k) {
+            return;
+        }
+
+        const half * b0 = ((const half  *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
+        const half    d = *b0;
+        const char2  qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
+
+        y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
+    }
+#else
+    (void) vx; (void) y; (void) k;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_PASCAL
+}
+
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
 
@@ -6181,6 +6222,17 @@ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restri
     dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
+static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
+    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
+        const bool need_check = false;
+        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+    } else {
+        const bool need_check = true;
+        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+    }
+}
+
 template<typename dst_t>
 static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
@@ -6246,6 +6298,7 @@ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict_
 }
 
 static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    int id;
     switch (type) {
         case GGML_TYPE_Q4_0:
             return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
@@ -6256,6 +6309,10 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
+            CUDA_CHECK(cudaGetDevice(&id));
+            if (g_device_caps[id].cc >= CC_PASCAL) {
+                return dequantize_block_q8_0_f16_cuda;
+            }
             return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_Q2_K:
             return dequantize_row_q2_K_cuda;

From 52ee4540c0f2e11d52c839db6eb51d014ce060e1 Mon Sep 17 00:00:00 2001
From: Maximilian Winter <maximilian.winter.91@gmail.com>
Date: Fri, 12 Jan 2024 20:46:45 +0100
Subject: [PATCH 002/138] examples : add pydantic models to GBNF grammar
 generator (#4883)

* Create pydantic-models-to-grammar.py

* Added some comments for usage

* Refactored Grammar Generator

Added example and usage instruction.

* Update pydantic_models_to_grammar.py

* Update pydantic-models-to-grammar-examples.py

* Renamed module and imported it.

* Update pydantic-models-to-grammar.py

* Renamed file and fixed grammar generator issue.
---
 .../pydantic-models-to-grammar-examples.py    |  136 ++
 examples/pydantic_models_to_grammar.py        | 1151 +++++++++++++++++
 2 files changed, 1287 insertions(+)
 create mode 100644 examples/pydantic-models-to-grammar-examples.py
 create mode 100644 examples/pydantic_models_to_grammar.py

diff --git a/examples/pydantic-models-to-grammar-examples.py b/examples/pydantic-models-to-grammar-examples.py
new file mode 100644
index 000000000..a8a4919cf
--- /dev/null
+++ b/examples/pydantic-models-to-grammar-examples.py
@@ -0,0 +1,136 @@
+# Function calling example using pydantic models.
+
+import json
+from enum import Enum
+from typing import Union, Optional
+
+import requests
+from pydantic import BaseModel, Field
+
+import importlib
+from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation
+
+# Function to get completion on the llama.cpp server with grammar.
+def create_completion(prompt, grammar):
+    headers = {"Content-Type": "application/json"}
+    data = {"prompt": prompt, "grammar": grammar}
+
+    response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
+    data = response.json()
+
+    print(data["content"])
+    return data["content"]
+
+
+# A function for the agent to send a message to the user.
+class SendMessageToUser(BaseModel):
+    """
+    Send a message to the User.
+    """
+    chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
+    message: str = Field(..., description="Message you want to send to the user.")
+
+    def run(self):
+        print(self.message)
+
+
+# Enum for the calculator function.
+class MathOperation(Enum):
+    ADD = "add"
+    SUBTRACT = "subtract"
+    MULTIPLY = "multiply"
+    DIVIDE = "divide"
+
+
+# Very simple calculator tool for the agent.
+class Calculator(BaseModel):
+    """
+    Perform a math operation on two numbers.
+    """
+    number_one: Union[int, float] = Field(..., description="First number.")
+    operation: MathOperation = Field(..., description="Math operation to perform.")
+    number_two: Union[int, float] = Field(..., description="Second number.")
+
+    def run(self):
+        if self.operation == MathOperation.ADD:
+            return self.number_one + self.number_two
+        elif self.operation == MathOperation.SUBTRACT:
+            return self.number_one - self.number_two
+        elif self.operation == MathOperation.MULTIPLY:
+            return self.number_one * self.number_two
+        elif self.operation == MathOperation.DIVIDE:
+            return self.number_one / self.number_two
+        else:
+            raise ValueError("Unknown operation.")
+
+
+# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM.
+# pydantic_model_list is the list of pydanitc models
+# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
+# outer_object_content is the name of outer object content.
+# model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
+# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
+gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+    pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function",
+    outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
+
+print(gbnf_grammar)
+print(documentation)
+
+system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
+
+user_message = "What is 42 * 42?"
+prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
+
+text = create_completion(prompt=prompt, grammar=gbnf_grammar)
+# This should output something like this:
+# {
+#     "function": "calculator",
+#     "function_parameters": {
+#         "number_one": 42,
+#         "operation": "multiply",
+#         "number_two": 42
+#     }
+# }
+function_dictionary = json.loads(text)
+if function_dictionary["function"] == "calculator":
+    function_parameters = {**function_dictionary["function_parameters"]}
+
+    print(Calculator(**function_parameters).run())
+    # This should output: 1764
+
+
+# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text.
+class Category(Enum):
+    """
+    The category of the book.
+    """
+    Fiction = "Fiction"
+    NonFiction = "Non-Fiction"
+
+
+class Book(BaseModel):
+    """
+    Represents an entry about a book.
+    """
+    title: str = Field(..., description="Title of the book.")
+    author: str = Field(..., description="Author of the book.")
+    published_year: Optional[int] = Field(..., description="Publishing year of the book.")
+    keywords: list[str] = Field(..., description="A list of keywords.")
+    category: Category = Field(..., description="Category of the book.")
+    summary: str = Field(..., description="Summary of the book.")
+
+
+# We need no additional parameters other than our list of pydantic models.
+gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book])
+
+system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
+
+text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
+prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
+
+text = create_completion(prompt=prompt, grammar=gbnf_grammar)
+
+json_data = json.loads(text)
+
+print(Book(**json_data))
diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py
new file mode 100644
index 000000000..41b98fdc1
--- /dev/null
+++ b/examples/pydantic_models_to_grammar.py
@@ -0,0 +1,1151 @@
+import inspect
+import json
+from copy import copy
+from inspect import isclass, getdoc
+from types import NoneType
+
+from pydantic import BaseModel, create_model, Field
+from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias
+from enum import Enum
+from typing import get_type_hints, Callable
+import re
+
+
+class PydanticDataType(Enum):
+    """
+    Defines the data types supported by the grammar_generator.
+
+    Attributes:
+        STRING (str): Represents a string data type.
+        BOOLEAN (str): Represents a boolean data type.
+        INTEGER (str): Represents an integer data type.
+        FLOAT (str): Represents a float data type.
+        OBJECT (str): Represents an object data type.
+        ARRAY (str): Represents an array data type.
+        ENUM (str): Represents an enum data type.
+        CUSTOM_CLASS (str): Represents a custom class data type.
+    """
+    STRING = "string"
+    TRIPLE_QUOTED_STRING = "triple_quoted_string"
+    MARKDOWN_STRING = "markdown_string"
+    BOOLEAN = "boolean"
+    INTEGER = "integer"
+    FLOAT = "float"
+    OBJECT = "object"
+    ARRAY = "array"
+    ENUM = "enum"
+    ANY = "any"
+    NULL = "null"
+    CUSTOM_CLASS = "custom-class"
+    CUSTOM_DICT = "custom-dict"
+    SET = "set"
+
+
+def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
+    if isclass(pydantic_type) and issubclass(pydantic_type, str):
+        return PydanticDataType.STRING.value
+    elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
+        return PydanticDataType.BOOLEAN.value
+    elif isclass(pydantic_type) and issubclass(pydantic_type, int):
+        return PydanticDataType.INTEGER.value
+    elif isclass(pydantic_type) and issubclass(pydantic_type, float):
+        return PydanticDataType.FLOAT.value
+    elif isclass(pydantic_type) and issubclass(pydantic_type, Enum):
+        return PydanticDataType.ENUM.value
+
+    elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
+        return format_model_and_field_name(pydantic_type.__name__)
+    elif get_origin(pydantic_type) == list:
+        element_type = get_args(pydantic_type)[0]
+        return f"{map_pydantic_type_to_gbnf(element_type)}-list"
+    elif get_origin(pydantic_type) == set:
+        element_type = get_args(pydantic_type)[0]
+        return f"{map_pydantic_type_to_gbnf(element_type)}-set"
+    elif get_origin(pydantic_type) == Union:
+        union_types = get_args(pydantic_type)
+        union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
+        return f"union-{'-or-'.join(union_rules)}"
+    elif get_origin(pydantic_type) == Optional:
+        element_type = get_args(pydantic_type)[0]
+        return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
+    elif isclass(pydantic_type):
+        return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
+    elif get_origin(pydantic_type) == dict:
+        key_type, value_type = get_args(pydantic_type)
+        return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
+    else:
+        return "unknown"
+
+
+def format_model_and_field_name(model_name: str) -> str:
+    parts = re.findall('[A-Z][^A-Z]*', model_name)
+    if not parts:  # Check if the list is empty
+        return model_name.lower().replace("_", "-")
+    return '-'.join(part.lower().replace("_", "-") for part in parts)
+
+
+def generate_list_rule(element_type):
+    """
+    Generate a GBNF rule for a list of a given element type.
+
+    :param element_type: The type of the elements in the list (e.g., 'string').
+    :return: A string representing the GBNF rule for a list of the given type.
+    """
+    rule_name = f"{map_pydantic_type_to_gbnf(element_type)}-list"
+    element_rule = map_pydantic_type_to_gbnf(element_type)
+    list_rule = fr'{rule_name} ::= "["  {element_rule} (","  {element_rule})* "]"'
+    return list_rule
+
+
+def get_members_structure(cls, rule_name):
+    if issubclass(cls, Enum):
+        # Handle Enum types
+        members = [f'\"\\\"{member.value}\\\"\"' for name, member in cls.__members__.items()]
+        return f"{cls.__name__.lower()} ::= " + " | ".join(members)
+    if cls.__annotations__ and cls.__annotations__ != {}:
+        result = f'{rule_name} ::= "{{"'
+        type_list_rules = []
+        # Modify this comprehension
+        members = [f'  \"\\\"{name}\\\"\" ":"  {map_pydantic_type_to_gbnf(param_type)}'
+                   for name, param_type in cls.__annotations__.items()
+                   if name != 'self']
+
+        result += '"," '.join(members)
+        result += '  "}"'
+        return result, type_list_rules
+    elif rule_name == "custom-class-any":
+        result = f'{rule_name} ::= '
+        result += 'value'
+        type_list_rules = []
+        return result, type_list_rules
+    else:
+        init_signature = inspect.signature(cls.__init__)
+        parameters = init_signature.parameters
+        result = f'{rule_name} ::=  "{{"'
+        type_list_rules = []
+        # Modify this comprehension too
+        members = [f'  \"\\\"{name}\\\"\" ":"  {map_pydantic_type_to_gbnf(param.annotation)}'
+                   for name, param in parameters.items()
+                   if name != 'self' and param.annotation != inspect.Parameter.empty]
+
+        result += '", "'.join(members)
+        result += '  "}"'
+        return result, type_list_rules
+
+
+def regex_to_gbnf(regex_pattern: str) -> str:
+    """
+    Translate a basic regex pattern to a GBNF rule.
+    Note: This function handles only a subset of simple regex patterns.
+    """
+    gbnf_rule = regex_pattern
+
+    # Translate common regex components to GBNF
+    gbnf_rule = gbnf_rule.replace('\\d', '[0-9]')
+    gbnf_rule = gbnf_rule.replace('\\s', '[ \t\n]')
+
+    # Handle quantifiers and other regex syntax that is similar in GBNF
+    # (e.g., '*', '+', '?', character classes)
+
+    return gbnf_rule
+
+
+def generate_gbnf_integer_rules(max_digit=None, min_digit=None):
+    """
+
+    Generate GBNF Integer Rules
+
+    Generates GBNF (Generalized Backus-Naur Form) rules for integers based on the given maximum and minimum digits.
+
+    Parameters:
+    max_digit (int): The maximum number of digits for the integer. Default is None.
+    min_digit (int): The minimum number of digits for the integer. Default is None.
+
+    Returns:
+    integer_rule (str): The identifier for the integer rule generated.
+    additional_rules (list): A list of additional rules generated based on the given maximum and minimum digits.
+
+    """
+    additional_rules = []
+
+    # Define the rule identifier based on max_digit and min_digit
+    integer_rule = "integer-part"
+    if max_digit is not None:
+        integer_rule += f"-max{max_digit}"
+    if min_digit is not None:
+        integer_rule += f"-min{min_digit}"
+
+    # Handling Integer Rules
+    if max_digit is not None or min_digit is not None:
+        # Start with an empty rule part
+        integer_rule_part = ''
+
+        # Add mandatory digits as per min_digit
+        if min_digit is not None:
+            integer_rule_part += '[0-9] ' * min_digit
+
+        # Add optional digits up to max_digit
+        if max_digit is not None:
+            optional_digits = max_digit - (min_digit if min_digit is not None else 0)
+            integer_rule_part += ''.join(['[0-9]? ' for _ in range(optional_digits)])
+
+        # Trim the rule part and append it to additional rules
+        integer_rule_part = integer_rule_part.strip()
+        if integer_rule_part:
+            additional_rules.append(f'{integer_rule} ::= {integer_rule_part}')
+
+    return integer_rule, additional_rules
+
+
+def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None, min_precision=None):
+    """
+    Generate GBNF float rules based on the given constraints.
+
+    :param max_digit: Maximum number of digits in the integer part (default: None)
+    :param min_digit: Minimum number of digits in the integer part (default: None)
+    :param max_precision: Maximum number of digits in the fractional part (default: None)
+    :param min_precision: Minimum number of digits in the fractional part (default: None)
+    :return: A tuple containing the float rule and additional rules as a list
+
+    Example Usage:
+    max_digit = 3
+    min_digit = 1
+    max_precision = 2
+    min_precision = 1
+    generate_gbnf_float_rules(max_digit, min_digit, max_precision, min_precision)
+
+    Output:
+    ('float-3-1-2-1', ['integer-part-max3-min1 ::= [0-9] [0-9] [0-9]?', 'fractional-part-max2-min1 ::= [0-9] [0-9]?', 'float-3-1-2-1 ::= integer-part-max3-min1 "." fractional-part-max2-min
+    *1'])
+
+    Note:
+    GBNF stands for Generalized Backus-Naur Form, which is a notation technique to specify the syntax of programming languages or other formal grammars.
+    """
+    additional_rules = []
+
+    # Define the integer part rule
+    integer_part_rule = "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + (
+        f"-min{min_digit}" if min_digit is not None else "")
+
+    # Define the fractional part rule based on precision constraints
+    fractional_part_rule = "fractional-part"
+    fractional_rule_part = ''
+    if max_precision is not None or min_precision is not None:
+        fractional_part_rule += (f"-max{max_precision}" if max_precision is not None else "") + (
+            f"-min{min_precision}" if min_precision is not None else "")
+        # Minimum number of digits
+        fractional_rule_part = '[0-9]' * (min_precision if min_precision is not None else 1)
+        # Optional additional digits
+        fractional_rule_part += ''.join([' [0-9]?'] * (
+            (max_precision - (min_precision if min_precision is not None else 1)) if max_precision is not None else 0))
+        additional_rules.append(f'{fractional_part_rule} ::= {fractional_rule_part}')
+
+    # Define the float rule
+    float_rule = f"float-{max_digit if max_digit is not None else 'X'}-{min_digit if min_digit is not None else 'X'}-{max_precision if max_precision is not None else 'X'}-{min_precision if min_precision is not None else 'X'}"
+    additional_rules.append(f'{float_rule} ::= {integer_part_rule} "." {fractional_part_rule}')
+
+    # Generating the integer part rule definition, if necessary
+    if max_digit is not None or min_digit is not None:
+        integer_rule_part = '[0-9]'
+        if min_digit is not None and min_digit > 1:
+            integer_rule_part += ' [0-9]' * (min_digit - 1)
+        if max_digit is not None:
+            integer_rule_part += ''.join([' [0-9]?'] * (max_digit - (min_digit if min_digit is not None else 1)))
+        additional_rules.append(f'{integer_part_rule} ::= {integer_rule_part.strip()}')
+
+    return float_rule, additional_rules
+
+
+def generate_gbnf_rule_for_type(model_name, field_name,
+                                field_type, is_optional, processed_models, created_rules,
+                                field_info=None) -> \
+    Tuple[str, list]:
+    """
+    Generate GBNF rule for a given field type.
+
+    :param model_name: Name of the model.
+
+    :param field_name: Name of the field.
+    :param field_type: Type of the field.
+    :param is_optional: Whether the field is optional.
+    :param processed_models: List of processed models.
+    :param created_rules: List of created rules.
+    :param field_info: Additional information about the field (optional).
+
+    :return: Tuple containing the GBNF type and a list of additional rules.
+    :rtype: Tuple[str, list]
+    """
+    rules = []
+
+    field_name = format_model_and_field_name(field_name)
+    gbnf_type = map_pydantic_type_to_gbnf(field_type)
+
+    if isclass(field_type) and issubclass(field_type, BaseModel):
+        nested_model_name = format_model_and_field_name(field_type.__name__)
+        nested_model_rules = generate_gbnf_grammar(field_type, processed_models, created_rules)
+        rules.extend(nested_model_rules)
+        gbnf_type, rules = nested_model_name, rules
+    elif isclass(field_type) and issubclass(field_type, Enum):
+        enum_values = [f'\"\\\"{e.value}\\\"\"' for e in field_type]  # Adding escaped quotes
+        enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}"
+        rules.append(enum_rule)
+        gbnf_type, rules = model_name + "-" + field_name, rules
+    elif get_origin(field_type) == list or field_type == list:  # Array
+        element_type = get_args(field_type)[0]
+        element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name,
+                                                                          f"{field_name}-element",
+                                                                          element_type, is_optional, processed_models,
+                                                                          created_rules)
+        rules.extend(additional_rules)
+        array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})*  "]" """
+        rules.append(array_rule)
+        gbnf_type, rules = model_name + "-" + field_name, rules
+
+    elif get_origin(field_type) == set or field_type == set:  # Array
+        element_type = get_args(field_type)[0]
+        element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name,
+                                                                          f"{field_name}-element",
+                                                                          element_type, is_optional, processed_models,
+                                                                          created_rules)
+        rules.extend(additional_rules)
+        array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})*  "]" """
+        rules.append(array_rule)
+        gbnf_type, rules = model_name + "-" + field_name, rules
+
+    elif gbnf_type.startswith("custom-class-"):
+        nested_model_rules, field_types = get_members_structure(field_type, gbnf_type)
+        rules.append(nested_model_rules)
+    elif gbnf_type.startswith("custom-dict-"):
+        key_type, value_type = get_args(field_type)
+
+        additional_key_type, additional_key_rules = generate_gbnf_rule_for_type(model_name,
+                                                                                f"{field_name}-key-type",
+                                                                                key_type, is_optional, processed_models,
+                                                                                created_rules)
+        additional_value_type, additional_value_rules = generate_gbnf_rule_for_type(model_name,
+                                                                                    f"{field_name}-value-type",
+                                                                                    value_type, is_optional,
+                                                                                    processed_models, created_rules)
+        gbnf_type = fr'{gbnf_type} ::= "{{"  ( {additional_key_type} ":"  {additional_value_type} (","  {additional_key_type} ":"  {additional_value_type})*  )? "}}" '
+
+        rules.extend(additional_key_rules)
+        rules.extend(additional_value_rules)
+    elif gbnf_type.startswith("union-"):
+        union_types = get_args(field_type)
+        union_rules = []
+
+        for union_type in union_types:
+            if isinstance(union_type, _GenericAlias):
+                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name,
+                                                                                field_name, union_type,
+                                                                                False,
+                                                                                processed_models, created_rules)
+                union_rules.append(union_gbnf_type)
+                rules.extend(union_rules_list)
+
+
+            elif not issubclass(union_type, NoneType):
+                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name,
+                                                                                field_name, union_type,
+                                                                                False,
+                                                                                processed_models, created_rules)
+                union_rules.append(union_gbnf_type)
+                rules.extend(union_rules_list)
+
+        # Defining the union grammar rule separately
+        if len(union_rules) == 1:
+            union_grammar_rule = f"{model_name}-{field_name}-optional ::= {' | '.join(union_rules)} | null"
+        else:
+            union_grammar_rule = f"{model_name}-{field_name}-union ::= {' | '.join(union_rules)}"
+        rules.append(union_grammar_rule)
+        if len(union_rules) == 1:
+            gbnf_type = f"{model_name}-{field_name}-optional"
+        else:
+            gbnf_type = f"{model_name}-{field_name}-union"
+    elif isclass(field_type) and issubclass(field_type, str):
+        if field_info and hasattr(field_info, 'json_schema_extra') and field_info.json_schema_extra is not None:
+
+            triple_quoted_string = field_info.json_schema_extra.get('triple_quoted_string', False)
+            markdown_string = field_info.json_schema_extra.get('markdown_string', False)
+
+            gbnf_type = PydanticDataType.TRIPLE_QUOTED_STRING.value if triple_quoted_string else PydanticDataType.STRING.value
+            gbnf_type = PydanticDataType.MARKDOWN_STRING.value if markdown_string else gbnf_type
+
+        elif field_info and hasattr(field_info, 'pattern'):
+            # Convert regex pattern to grammar rule
+            regex_pattern = field_info.regex.pattern
+            gbnf_type = f"pattern-{field_name} ::= {regex_to_gbnf(regex_pattern)}"
+        else:
+            gbnf_type = PydanticDataType.STRING.value
+
+    elif isclass(field_type) and issubclass(field_type, float) and field_info and hasattr(field_info,
+                                                                                          'json_schema_extra') and field_info.json_schema_extra is not None:
+        # Retrieve precision attributes for floats
+        max_precision = field_info.json_schema_extra.get('max_precision') if field_info and hasattr(field_info,
+                                                                                                    'json_schema_extra') else None
+        min_precision = field_info.json_schema_extra.get('min_precision') if field_info and hasattr(field_info,
+                                                                                                    'json_schema_extra') else None
+        max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info,
+                                                                                             'json_schema_extra') else None
+        min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info,
+                                                                                             'json_schema_extra') else None
+
+        # Generate GBNF rule for float with given attributes
+        gbnf_type, rules = generate_gbnf_float_rules(max_digit=max_digits, min_digit=min_digits,
+                                                     max_precision=max_precision,
+                                                     min_precision=min_precision)
+
+    elif isclass(field_type) and issubclass(field_type, int) and field_info and hasattr(field_info,
+                                                                                        'json_schema_extra') and field_info.json_schema_extra is not None:
+        # Retrieve digit attributes for integers
+        max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info,
+                                                                                             'json_schema_extra') else None
+        min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info,
+                                                                                             'json_schema_extra') else None
+
+        # Generate GBNF rule for integer with given attributes
+        gbnf_type, rules = generate_gbnf_integer_rules(max_digit=max_digits, min_digit=min_digits)
+    else:
+        gbnf_type, rules = gbnf_type, []
+
+    if gbnf_type not in created_rules:
+        return gbnf_type, rules
+    else:
+        if gbnf_type in created_rules:
+            return gbnf_type, rules
+
+
+def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool):
+    """
+
+    Generate GBnF Grammar
+
+    Generates a GBnF grammar for a given model.
+
+    :param model: A Pydantic model class to generate the grammar for. Must be a subclass of BaseModel.
+    :param processed_models: A set of already processed models to prevent infinite recursion.
+    :param created_rules: A dict containing already created rules to prevent duplicates.
+    :return: A list of GBnF grammar rules in string format. And two booleans indicating if an extra markdown or triple quoted string is in the grammar.
+    Example Usage:
+    ```
+    model = MyModel
+    processed_models = set()
+    created_rules = dict()
+
+    gbnf_grammar = generate_gbnf_grammar(model, processed_models, created_rules)
+    ```
+    """
+    if model in processed_models:
+        return []
+
+    processed_models.add(model)
+    model_name = format_model_and_field_name(model.__name__)
+
+    if not issubclass(model, BaseModel):
+        # For non-Pydantic classes, generate model_fields from __annotations__ or __init__
+        if hasattr(model, '__annotations__') and model.__annotations__:
+            model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()}
+        else:
+            init_signature = inspect.signature(model.__init__)
+            parameters = init_signature.parameters
+            model_fields = {name: (param.annotation, param.default) for name, param in parameters.items()
+                            if name != 'self'}
+    else:
+        # For Pydantic models, use model_fields and check for ellipsis (required fields)
+        model_fields = model.__annotations__
+
+    model_rule_parts = []
+    nested_rules = []
+    has_markdown_code_block = False
+    has_triple_quoted_string = False
+    look_for_markdown_code_block = False
+    look_for_triple_quoted_string = False
+    for field_name, field_info in model_fields.items():
+        if not issubclass(model, BaseModel):
+            field_type, default_value = field_info
+            # Check if the field is optional (not required)
+            is_optional = (default_value is not inspect.Parameter.empty) and (default_value is not Ellipsis)
+        else:
+            field_type = field_info
+            field_info = model.model_fields[field_name]
+            is_optional = field_info.is_required is False and get_origin(field_type) is Optional
+        rule_name, additional_rules = generate_gbnf_rule_for_type(model_name,
+                                                                  format_model_and_field_name(field_name),
+                                                                  field_type, is_optional,
+                                                                  processed_models, created_rules, field_info)
+        look_for_markdown_code_block = True if rule_name == "markdown_string" else False
+        look_for_triple_quoted_string = True if rule_name == "triple_quoted_string" else False
+        if not look_for_markdown_code_block and not look_for_triple_quoted_string:
+            if rule_name not in created_rules:
+                created_rules[rule_name] = additional_rules
+            model_rule_parts.append(f' ws \"\\\"{field_name}\\\"\" ": "  {rule_name}')  # Adding escaped quotes
+            nested_rules.extend(additional_rules)
+        else:
+            has_triple_quoted_string = look_for_markdown_code_block
+            has_markdown_code_block = look_for_triple_quoted_string
+
+    fields_joined = r' "," "\n" '.join(model_rule_parts)
+    model_rule = fr'{model_name} ::= "{{" "\n" {fields_joined} "\n" ws "}}"'
+
+    if look_for_markdown_code_block or look_for_triple_quoted_string:
+        model_rule += ' ws "}"'
+
+    if has_triple_quoted_string:
+        model_rule += '"\\n" triple-quoted-string'
+    if has_markdown_code_block:
+        model_rule += '"\\n" markdown-code-block'
+    all_rules = [model_rule] + nested_rules
+
+    return all_rules, has_markdown_code_block, has_triple_quoted_string
+
+
+def generate_gbnf_grammar_from_pydantic_models(models: List[Type[BaseModel]], outer_object_name: str = None,
+                                               outer_object_content: str = None, list_of_outputs: bool = False) -> str:
+    """
+    Generate GBNF Grammar from Pydantic Models.
+
+    This method takes a list of Pydantic models and uses them to generate a GBNF grammar string. The generated grammar string can be used for parsing and validating data using the generated
+    * grammar.
+
+    Parameters:
+    models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from.
+    outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
+    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
+    list_of_outputs (str, optional): Allows a list of output objects
+    Returns:
+    str: The generated GBNF grammar string.
+
+    Examples:
+        models = [UserModel, PostModel]
+        grammar = generate_gbnf_grammar_from_pydantic(models)
+        print(grammar)
+        # Output:
+        # root ::= UserModel | PostModel
+        # ...
+    """
+    processed_models = set()
+    all_rules = []
+    created_rules = {}
+    if outer_object_name is None:
+
+        for model in models:
+            model_rules, _, _ = generate_gbnf_grammar(model,
+                                                      processed_models, created_rules)
+            all_rules.extend(model_rules)
+
+        if list_of_outputs:
+            root_rule = r'root ::= ws "["  grammar-models (","  grammar-models)*  "]"' + "\n"
+        else:
+            root_rule = r'root ::= ws grammar-models' + "\n"
+        root_rule += "grammar-models ::= " + " | ".join(
+            [format_model_and_field_name(model.__name__) for model in models])
+        all_rules.insert(0, root_rule)
+        return "\n".join(all_rules)
+    elif outer_object_name is not None:
+        if list_of_outputs:
+            root_rule = fr'root ::= ws "["  {format_model_and_field_name(outer_object_name)} (","  {format_model_and_field_name(outer_object_name)})*  "]"' + "\n"
+        else:
+            root_rule = f"root ::= {format_model_and_field_name(outer_object_name)}\n"
+
+        model_rule = fr'{format_model_and_field_name(outer_object_name)} ::= ws "{{" ws "\"{outer_object_name}\""  ": "  grammar-models'
+
+        fields_joined = " | ".join(
+            [fr'{format_model_and_field_name(model.__name__)}-grammar-model' for model in models])
+
+        grammar_model_rules = f'\ngrammar-models ::= {fields_joined}'
+        mod_rules = []
+        for model in models:
+            mod_rule = fr'{format_model_and_field_name(model.__name__)}-grammar-model ::= ws'
+            mod_rule += fr'"\"{format_model_and_field_name(model.__name__)}\"" "," ws "\"{outer_object_content}\"" ws ":" ws {format_model_and_field_name(model.__name__)}' + '\n'
+            mod_rules.append(mod_rule)
+        grammar_model_rules += "\n" + "\n".join(mod_rules)
+        look_for_markdown_code_block = False
+        look_for_triple_quoted_string = False
+        for model in models:
+            model_rules, markdown_block, triple_quoted_string = generate_gbnf_grammar(model,
+                                                                                      processed_models, created_rules)
+            all_rules.extend(model_rules)
+            if markdown_block:
+                look_for_markdown_code_block = True
+
+            if triple_quoted_string:
+                look_for_triple_quoted_string = True
+
+        if not look_for_markdown_code_block and not look_for_triple_quoted_string:
+            model_rule += ' ws "}"'
+        all_rules.insert(0, root_rule + model_rule + grammar_model_rules)
+        return "\n".join(all_rules)
+
+
+def get_primitive_grammar(grammar):
+    """
+    Returns the needed GBNF primitive grammar for a given GBNF grammar string.
+
+    Args:
+    grammar (str): The string containing the GBNF grammar.
+
+    Returns:
+    str: GBNF primitive grammar string.
+    """
+    type_list = []
+    if "string-list" in grammar:
+        type_list.append(str)
+    if "boolean-list" in grammar:
+        type_list.append(bool)
+    if "integer-list" in grammar:
+        type_list.append(int)
+    if "float-list" in grammar:
+        type_list.append(float)
+    additional_grammar = [generate_list_rule(t) for t in type_list]
+    primitive_grammar = r"""
+boolean ::= "true" | "false"
+null ::= "null"
+string ::= "\"" (
+        [^"\\] |
+        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+      )* "\"" ws
+ws ::= ([ \t\n] ws)?
+float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+
+integer ::= [0-9]+"""
+
+    any_block = ""
+    if "custom-class-any" in grammar:
+        any_block = '''
+value ::= object | array | string | number | boolean | null
+
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+
+number ::= integer | float'''
+
+    markdown_code_block_grammar = ""
+    if "markdown-code-block" in grammar:
+        markdown_code_block_grammar = r'''
+markdown-code-block ::= opening-triple-ticks markdown-code-block-content closing-triple-ticks
+markdown-code-block-content ::= ( [^`] | "`" [^`] |  "`"  "`" [^`]  )*
+opening-triple-ticks ::= "```" "python" "\n" | "```" "c" "\n" | "```" "cpp" "\n" | "```" "txt" "\n" | "```" "text" "\n" | "```" "json" "\n" | "```" "javascript" "\n" | "```" "css" "\n" | "```" "html" "\n" | "```" "markdown" "\n"
+closing-triple-ticks ::= "```" "\n"'''
+
+    if "triple-quoted-string" in grammar:
+        markdown_code_block_grammar = r"""
+triple-quoted-string ::= triple-quotes triple-quoted-string-content triple-quotes
+triple-quoted-string-content ::= ( [^'] | "'" [^'] |  "'"  "'" [^']  )*
+triple-quotes ::= "'''" """
+    return "\n" + '\n'.join(additional_grammar) + any_block + primitive_grammar + markdown_code_block_grammar
+
+
+def generate_field_markdown(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1) -> str:
+    indent = '  ' * depth
+    field_markdown = f"{indent}- **{field_name}** (`{field_type.__name__}`): "
+
+    # Extracting field description from Pydantic Field using __model_fields__
+    field_info = model.model_fields.get(field_name)
+    field_description = field_info.description if field_info and field_info.description else "No description available."
+
+    field_markdown += field_description + '\n'
+
+    # Handling nested BaseModel fields
+    if isclass(field_type) and issubclass(field_type, BaseModel):
+        field_markdown += f"{indent}  - Details:\n"
+        for name, type_ in field_type.__annotations__.items():
+            field_markdown += generate_field_markdown(name, type_, field_type, depth + 2)
+
+    return field_markdown
+
+
+def generate_markdown_report(pydantic_models: List[Type[BaseModel]]) -> str:
+    markdown = ""
+    for model in pydantic_models:
+        markdown += f"### {format_model_and_field_name(model.__name__)}\n"
+
+        # Check if the model's docstring is different from BaseModel's docstring
+        class_doc = getdoc(model)
+        base_class_doc = getdoc(BaseModel)
+        class_description = class_doc if class_doc and class_doc != base_class_doc else "No specific description available."
+
+        markdown += f"{class_description}\n\n"
+        markdown += "#### Fields\n"
+
+        if isclass(model) and issubclass(model, BaseModel):
+            for name, field_type in model.__annotations__.items():
+                markdown += generate_field_markdown(format_model_and_field_name(name), field_type, model)
+        markdown += "\n"
+
+    return markdown
+
+
+def format_json_example(example: dict, depth: int) -> str:
+    """
+    Format a JSON example into a readable string with indentation.
+
+    Args:
+    example (dict): JSON example to be formatted.
+    depth (int): Indentation depth.
+
+    Returns:
+    str: Formatted JSON example string.
+    """
+    indent = '    ' * depth
+    formatted_example = '{\n'
+    for key, value in example.items():
+        value_text = f"'{value}'" if isinstance(value, str) else value
+        formatted_example += f"{indent}{key}: {value_text},\n"
+    formatted_example = formatted_example.rstrip(',\n') + '\n' + indent + '}'
+    return formatted_example
+
+
+def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_prefix="Model",
+                                fields_prefix="Fields", documentation_with_field_description=True) -> str:
+    """
+    Generate text documentation for a list of Pydantic models.
+
+    Args:
+    pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
+    model_prefix (str): Prefix for the model section.
+    fields_prefix (str): Prefix for the fields section.
+    documentation_with_field_description (bool): Include field descriptions in the documentation.
+
+    Returns:
+    str: Generated text documentation.
+    """
+    documentation = ""
+    pyd_models = [(model, True) for model in pydantic_models]
+    for model, add_prefix in pyd_models:
+        if add_prefix:
+            documentation += f"{model_prefix}: {format_model_and_field_name(model.__name__)}\n"
+        else:
+            documentation += f"Model: {format_model_and_field_name(model.__name__)}\n"
+
+        # Handling multi-line model description with proper indentation
+
+        class_doc = getdoc(model)
+        base_class_doc = getdoc(BaseModel)
+        class_description = class_doc if class_doc and class_doc != base_class_doc else ""
+        if class_description != "":
+            documentation += "  Description: "
+            documentation += "\n" + format_multiline_description(class_description, 2) + "\n"
+
+        if add_prefix:
+            # Indenting the fields section
+            documentation += f"  {fields_prefix}:\n"
+        else:
+            documentation += f"  Fields:\n"
+        if isclass(model) and issubclass(model, BaseModel):
+            for name, field_type in model.__annotations__.items():
+                # if name == "markdown_code_block":
+                #    continue
+                if get_origin(field_type) == list:
+                    element_type = get_args(field_type)[0]
+                    if isclass(element_type) and issubclass(element_type, BaseModel):
+                        pyd_models.append((element_type, False))
+                if get_origin(field_type) == Union:
+                    element_types = get_args(field_type)
+                    for element_type in element_types:
+                        if isclass(element_type) and issubclass(element_type, BaseModel):
+                            pyd_models.append((element_type, False))
+                documentation += generate_field_text(name, field_type, model,
+                                                     documentation_with_field_description=documentation_with_field_description)
+            documentation += "\n"
+
+        if hasattr(model, 'Config') and hasattr(model.Config,
+                                                'json_schema_extra') and 'example' in model.Config.json_schema_extra:
+            documentation += f"  Expected Example Output for {format_model_and_field_name(model.__name__)}:\n"
+            json_example = json.dumps(model.Config.json_schema_extra['example'])
+            documentation += format_multiline_description(json_example, 2) + "\n"
+
+    return documentation
+
+
+def generate_field_text(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
+                        documentation_with_field_description=True) -> str:
+    """
+    Generate text documentation for a Pydantic model field.
+
+    Args:
+    field_name (str): Name of the field.
+    field_type (Type[Any]): Type of the field.
+    model (Type[BaseModel]): Pydantic model class.
+    depth (int): Indentation depth in the documentation.
+    documentation_with_field_description (bool): Include field descriptions in the documentation.
+
+    Returns:
+    str: Generated text documentation for the field.
+    """
+    indent = '    ' * depth
+
+    field_info = model.model_fields.get(field_name)
+    field_description = field_info.description if field_info and field_info.description else ""
+
+    if get_origin(field_type) == list:
+        element_type = get_args(field_type)[0]
+        field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})"
+        if field_description != "":
+            field_text += ":\n"
+        else:
+            field_text += "\n"
+    elif get_origin(field_type) == Union:
+        element_types = get_args(field_type)
+        types = []
+        for element_type in element_types:
+            types.append(format_model_and_field_name(element_type.__name__))
+        field_text = f"{indent}{field_name} ({' or '.join(types)})"
+        if field_description != "":
+            field_text += ":\n"
+        else:
+            field_text += "\n"
+    else:
+        field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)})"
+        if field_description != "":
+            field_text += ":\n"
+        else:
+            field_text += "\n"
+
+    if not documentation_with_field_description:
+        return field_text
+
+    if field_description != "":
+        field_text += f"{indent}  Description: " + field_description + "\n"
+
+    # Check for and include field-specific examples if available
+    if hasattr(model, 'Config') and hasattr(model.Config,
+                                            'json_schema_extra') and 'example' in model.Config.json_schema_extra:
+        field_example = model.Config.json_schema_extra['example'].get(field_name)
+        if field_example is not None:
+            example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example
+            field_text += f"{indent}  Example: {example_text}\n"
+
+    if isclass(field_type) and issubclass(field_type, BaseModel):
+        field_text += f"{indent}  Details:\n"
+        for name, type_ in field_type.__annotations__.items():
+            field_text += generate_field_text(name, type_, field_type, depth + 2)
+
+    return field_text
+
+
+def format_multiline_description(description: str, indent_level: int) -> str:
+    """
+    Format a multiline description with proper indentation.
+
+    Args:
+    description (str): Multiline description.
+    indent_level (int): Indentation level.
+
+    Returns:
+    str: Formatted multiline description.
+    """
+    indent = '    ' * indent_level
+    return indent + description.replace('\n', '\n' + indent)
+
+
+def save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path="./grammar.gbnf",
+                                        documentation_file_path="./grammar_documentation.md"):
+    """
+    Save GBNF grammar and documentation to specified files.
+
+    Args:
+    grammar (str): GBNF grammar string.
+    documentation (str): Documentation string.
+    grammar_file_path (str): File path to save the GBNF grammar.
+    documentation_file_path (str): File path to save the documentation.
+
+    Returns:
+    None
+    """
+    try:
+        with open(grammar_file_path, 'w') as file:
+            file.write(grammar + get_primitive_grammar(grammar))
+        print(f"Grammar successfully saved to {grammar_file_path}")
+    except IOError as e:
+        print(f"An error occurred while saving the grammar file: {e}")
+
+    try:
+        with open(documentation_file_path, 'w') as file:
+            file.write(documentation)
+        print(f"Documentation successfully saved to {documentation_file_path}")
+    except IOError as e:
+        print(f"An error occurred while saving the documentation file: {e}")
+
+
+def remove_empty_lines(string):
+    """
+    Remove empty lines from a string.
+
+    Args:
+    string (str): Input string.
+
+    Returns:
+    str: String with empty lines removed.
+    """
+    lines = string.splitlines()
+    non_empty_lines = [line for line in lines if line.strip() != ""]
+    string_no_empty_lines = "\n".join(non_empty_lines)
+    return string_no_empty_lines
+
+
+def generate_and_save_gbnf_grammar_and_documentation(pydantic_model_list,
+                                                     grammar_file_path="./generated_grammar.gbnf",
+                                                     documentation_file_path="./generated_grammar_documentation.md",
+                                                     outer_object_name: str = None,
+                                                     outer_object_content: str = None,
+                                                     model_prefix: str = "Output Model",
+                                                     fields_prefix: str = "Output Fields",
+                                                     list_of_outputs: bool = False,
+                                                     documentation_with_field_description=True):
+    """
+    Generate GBNF grammar and documentation, and save them to specified files.
+
+    Args:
+    pydantic_model_list: List of Pydantic model classes.
+    grammar_file_path (str): File path to save the generated GBNF grammar.
+    documentation_file_path (str): File path to save the generated documentation.
+    outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
+    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
+    model_prefix (str): Prefix for the model section in the documentation.
+    fields_prefix (str): Prefix for the fields section in the documentation.
+    list_of_outputs (bool): Whether the output is a list of items.
+    documentation_with_field_description (bool): Include field descriptions in the documentation.
+
+    Returns:
+    None
+    """
+    documentation = generate_text_documentation(pydantic_model_list, model_prefix, fields_prefix,
+                                                documentation_with_field_description=documentation_with_field_description)
+    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name,
+                                                         outer_object_content, list_of_outputs)
+    grammar = remove_empty_lines(grammar)
+    save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path, documentation_file_path)
+
+
+def generate_gbnf_grammar_and_documentation(pydantic_model_list, outer_object_name: str = None,
+                                            outer_object_content: str = None,
+                                            model_prefix: str = "Output Model",
+                                            fields_prefix: str = "Output Fields", list_of_outputs: bool = False,
+                                            documentation_with_field_description=True):
+    """
+    Generate GBNF grammar and documentation for a list of Pydantic models.
+
+    Args:
+    pydantic_model_list: List of Pydantic model classes.
+    outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
+    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
+    model_prefix (str): Prefix for the model section in the documentation.
+    fields_prefix (str): Prefix for the fields section in the documentation.
+    list_of_outputs (bool): Whether the output is a list of items.
+    documentation_with_field_description (bool): Include field descriptions in the documentation.
+
+    Returns:
+    tuple: GBNF grammar string, documentation string.
+    """
+    documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix,
+                                                documentation_with_field_description=documentation_with_field_description)
+    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name,
+                                                         outer_object_content, list_of_outputs)
+    grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar))
+    return grammar, documentation
+
+
+def generate_gbnf_grammar_and_documentation_from_dictionaries(dictionaries: List[dict],
+                                                              outer_object_name: str = None,
+                                                              outer_object_content: str = None,
+                                                              model_prefix: str = "Output Model",
+                                                              fields_prefix: str = "Output Fields",
+                                                              list_of_outputs: bool = False,
+                                                              documentation_with_field_description=True):
+    """
+    Generate GBNF grammar and documentation from a list of dictionaries.
+
+    Args:
+    dictionaries (List[dict]): List of dictionaries representing Pydantic models.
+    outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
+    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
+    model_prefix (str): Prefix for the model section in the documentation.
+    fields_prefix (str): Prefix for the fields section in the documentation.
+    list_of_outputs (bool): Whether the output is a list of items.
+    documentation_with_field_description (bool): Include field descriptions in the documentation.
+
+    Returns:
+    tuple: GBNF grammar string, documentation string.
+    """
+    pydantic_model_list = create_dynamic_models_from_dictionaries(dictionaries)
+    documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix,
+                                                documentation_with_field_description=documentation_with_field_description)
+    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name,
+                                                         outer_object_content, list_of_outputs)
+    grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar))
+    return grammar, documentation
+
+
+def create_dynamic_model_from_function(func: Callable):
+    """
+    Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method.
+
+    Args:
+    func (Callable): A function with type hints from which to create the model.
+
+    Returns:
+    A dynamic Pydantic model class with the provided function as a 'run' method.
+    """
+    # Extracting type hints from the provided function
+    type_hints = get_type_hints(func)
+    type_hints.pop('return', None)
+
+    # Handling default values and annotations
+    dynamic_fields = {}
+    defaults = getattr(func, '__defaults__', ()) or ()
+    defaults_index = len(type_hints) - len(defaults)
+
+    for index, (name, typ) in enumerate(type_hints.items()):
+        if index >= defaults_index:
+            default_value = defaults[index - defaults_index]
+            dynamic_fields[name] = (typ, default_value)
+        else:
+            dynamic_fields[name] = (typ, ...)
+
+    # Creating the dynamic model
+    dynamicModel = create_model(f'{func.__name__}', **dynamic_fields)
+
+    dynamicModel.__doc__ = getdoc(func)
+
+    # Wrapping the original function to handle instance 'self'
+    def run_method_wrapper(self):
+        func_args = {name: getattr(self, name) for name in type_hints}
+        return func(**func_args)
+
+    # Adding the wrapped function as a 'run' method
+    setattr(dynamicModel, 'run', run_method_wrapper)
+
+    return dynamicModel
+
+
+def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
+    """
+    Add a 'run' method to a dynamic Pydantic model, using the provided function.
+
+    Args:
+    - model (Type[BaseModel]): Dynamic Pydantic model class.
+    - func (Callable): Function to be added as a 'run' method to the model.
+
+    Returns:
+    - Type[BaseModel]: Pydantic model class with the added 'run' method.
+    """
+
+    def run_method_wrapper(self):
+        func_args = {name: getattr(self, name) for name in model.model_fields}
+        return func(**func_args)
+
+    # Adding the wrapped function as a 'run' method
+    setattr(model, 'run', run_method_wrapper)
+
+    return model
+
+
+def create_dynamic_models_from_dictionaries(dictionaries: List[dict]):
+    """
+    Create a list of dynamic Pydantic model classes from a list of dictionaries.
+
+    Args:
+    - dictionaries (List[dict]): List of dictionaries representing model structures.
+
+    Returns:
+    - List[Type[BaseModel]]: List of generated dynamic Pydantic model classes.
+    """
+    dynamic_models = []
+    for func in dictionaries:
+        model_name = format_model_and_field_name(func.get("name", ""))
+        dyn_model = convert_dictionary_to_to_pydantic_model(func, model_name)
+        dynamic_models.append(dyn_model)
+    return dynamic_models
+
+
+def map_grammar_names_to_pydantic_model_class(pydantic_model_list):
+    output = {}
+    for model in pydantic_model_list:
+        output[format_model_and_field_name(model.__name__)] = model
+
+    return output
+
+
+from enum import Enum
+
+
+def json_schema_to_python_types(schema):
+    type_map = {
+        'any': Any,
+        'string': str,
+        'number': float,
+        'integer': int,
+        'boolean': bool,
+        'array': list,
+    }
+    return type_map[schema]
+
+
+def list_to_enum(enum_name, values):
+    return Enum(enum_name, {value: value for value in values})
+
+
+def convert_dictionary_to_to_pydantic_model(dictionary: dict, model_name: str = 'CustomModel') -> Type[BaseModel]:
+    """
+    Convert a dictionary to a Pydantic model class.
+
+    Args:
+    - dictionary (dict): Dictionary representing the model structure.
+    - model_name (str): Name of the generated Pydantic model.
+
+    Returns:
+    - Type[BaseModel]: Generated Pydantic model class.
+    """
+    fields = {}
+
+    if "properties" in dictionary:
+        for field_name, field_data in dictionary.get("properties", {}).items():
+            if field_data == 'object':
+                submodel = convert_dictionary_to_to_pydantic_model(dictionary, f'{model_name}_{field_name}')
+                fields[field_name] = (submodel, ...)
+            else:
+                field_type = field_data.get('type', 'str')
+
+                if field_data.get("enum", []):
+                    fields[field_name] = (list_to_enum(field_name, field_data.get("enum", [])), ...)
+                if field_type == "array":
+                    items = field_data.get("items", {})
+                    if items != {}:
+                        array = {"properties": items}
+                        array_type = convert_dictionary_to_to_pydantic_model(array, f'{model_name}_{field_name}_items')
+                        fields[field_name] = (List[array_type], ...)
+                    else:
+                        fields[field_name] = (list, ...)
+                elif field_type == 'object':
+                    submodel = convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}_{field_name}')
+                    fields[field_name] = (submodel, ...)
+                else:
+                    field_type = json_schema_to_python_types(field_type)
+                    fields[field_name] = (field_type, ...)
+    if "function" in dictionary:
+
+        for field_name, field_data in dictionary.get("function", {}).items():
+            if field_name == "name":
+                model_name = field_data
+            elif field_name == "description":
+                fields["__doc__"] = field_data
+            elif field_name == "parameters":
+                return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}')
+    if "parameters" in dictionary:
+        field_data = {"function": dictionary}
+        return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}')
+
+    custom_model = create_model(model_name, **fields)
+    return custom_model
+
+
+

From fa5c1fb44a2724292da545d6b7cf2a1ac0e0b989 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Fri, 12 Jan 2024 20:38:34 +0100
Subject: [PATCH 003/138] backend_sched : fix assignments

ggml-ci
---
 ggml-backend.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/ggml-backend.c b/ggml-backend.c
index 4c2d8b0b2..505dbba47 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1087,6 +1087,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             }
         }
     }
+
+    // pass 2.4 expand rest down
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = 0; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                cur_allocr = node_allocr;
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.4");
+            }
+        }
+    }
 #ifdef DEBUG_PASS2
     fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
 #endif
@@ -1146,6 +1164,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
 
             ggml_tallocr_t node_allocr = node_allocr(node);
 
+            GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
+
             if (node_allocr != cur_allocr) {
                 sched->splits[cur_split].i_end = i;
                 cur_split++;

From f238461236f4e0e18cac1a554af23c7deadc9b01 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 12 Jan 2024 14:02:30 +0200
Subject: [PATCH 004/138] ggml : fix 32-bit ARM compat for IQ2_XS
 (whisper/1758)

* ggml : fix 32-bit ARM compat

* ggml : fix fix

* ggml : fix fix fix
---
 ggml-quants.c | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index a24b4b244..601d155d7 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -272,10 +272,13 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 
 // vaddvq_s16
 // vpaddq_s16
+// vpaddq_s32
 // vaddvq_s32
 // vaddvq_f32
 // vmaxvq_f32
 // vcvtnq_s32_f32
+// vzip1_u8
+// vzip2_u8
 
 inline static int32_t vaddvq_s16(int16x8_t v) {
     return
@@ -291,6 +294,12 @@ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
     return vcombine_s16(a0, b0);
 }
 
+inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
+    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+    return vcombine_s32(a0, b0);
+}
+
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -316,6 +325,28 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
     return res;
 }
 
+inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
+}
+
+inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
+}
+
 // vld1q_s16_x2
 // vld1q_u8_x2
 // vld1q_u8_x4
@@ -7554,9 +7585,9 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
 
     const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
 
-    int8x16x4_t q2u;
-    int8x16x4_t q2s;
-    int8x16x4_t q8b;
+    ggml_int8x16x4_t q2u;
+    ggml_int8x16x4_t q2s;
+    ggml_int8x16x4_t q8b;
 
     int32x4x4_t scales32;
 
@@ -7578,7 +7609,7 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
         scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
         int32x4_t sumi = vdupq_n_s32(0);
         for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
-            q8b = vld1q_s8_x4(q8); q8 += 64;
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
             q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
             q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
             q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));

From de473f5f8e19ba5e659cdf5af65fb9251dce16c5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 12 Jan 2024 22:02:43 +0200
Subject: [PATCH 005/138] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 3e2c579d5..edcdb530a 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-979cc23b345006504cfc1f67c0fdf627805e3319
+400c07f00508e6f60fb25405444b5669c365b0a9

From 15ebe59210e7fd9817ff67f51fa1a5ee2d004294 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 13 Jan 2024 13:44:37 +0200
Subject: [PATCH 006/138] convert : update phi-2 to latest HF repo (#4903)

* convert : update phi-2 to latest HF repo

ggml-ci

* py : try to fix flake stuff
---
 convert-hf-to-gguf.py          | 39 +++++++++++++++++++++----------
 gguf-py/gguf/constants.py      |  3 +++
 gguf-py/gguf/tensor_mapping.py |  2 ++
 llama.cpp                      | 42 ++++++++++++++++++++++++++--------
 4 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index a1c79fd47..b133f3b49 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -23,6 +23,15 @@ if 'NO_LOCAL_GGUF' not in os.environ:
 import gguf
 
 
+# check for any of the given keys in the dictionary and return the value of the first key found
+def get_key_opts(d, keys):
+    for k in keys:
+        if k in d:
+            return d[k]
+    print(f"Could not find any of {keys}")
+    sys.exit()
+
+
 ###### MODEL DEFINITIONS ######
 
 class SentencePieceTokenTypes(IntEnum):
@@ -257,10 +266,11 @@ class Model:
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
+                if hasattr(tokenizer, "added_tokens_decoder"):
+                    if tokenizer.added_tokens_decoder[i].special:
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
             else:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
@@ -1068,17 +1078,22 @@ class GPT2Model(Model):
 
 class Phi2Model(Model):
     def set_gguf_parameters(self):
-        block_count = self.hparams["n_layer"]
+        block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
+
+        rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
+        n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
+        n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])
 
         self.gguf_writer.add_name("Phi2")
-        self.gguf_writer.add_context_length(self.hparams["n_positions"])
-        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
-        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
+        self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
+
+        self.gguf_writer.add_embedding_length(n_embd)
+        self.gguf_writer.add_feed_forward_length(4 * n_embd)
         self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_head_count(self.hparams["n_head"])
-        self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head)
+        self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_add_bos_token(False)
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index f0a1c51f8..972b4e9a7 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -389,6 +389,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
         MODEL_TENSOR.ATTN_OUT,
         MODEL_TENSOR.FFN_NORM,
         MODEL_TENSOR.FFN_DOWN,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 24a089037..e5b146106 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -191,6 +191,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.w1",                             # qwen
             "h.{bid}.mlp.c_fc",                                       # gpt2
             "transformer.h.{bid}.mlp.fc1",                            # phi2
+            "model.layers.{bid}.mlp.fc1",                             # phi2
             "model.layers.layers.{bid}.mlp.up_proj",                  # plamo
         ),
 
@@ -232,6 +233,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.dense_4h_to_h",                   # persimmon
             "h.{bid}.mlp.c_proj",                                     # gpt2
             "transformer.h.{bid}.mlp.fc2",                            # phi2
+            "model.layers.{bid}.mlp.fc2",                             # phi2
             "model.layers.layers.{bid}.mlp.down_proj",                # plamo
         ),
 
diff --git a/llama.cpp b/llama.cpp
index fe1d8947c..1d2eb569f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -574,6 +574,9 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_OUTPUT,          "output" },
             { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
             { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
@@ -3676,8 +3679,19 @@ static bool llm_load_tensors(
                         layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                         layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
 
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
+                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, false);
+
+                        if (layer.wqkv == nullptr) {
+                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                            layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd});
+
+                            layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                            layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa});
+
+                            layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                            layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa});
+                        }
 
                         layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
                         layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
@@ -5637,15 +5651,25 @@ struct llm_build_context {
 
             // self-attention
             {
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
-                cb(cur, "wqkv", il);
+                struct ggml_tensor * Qcur = nullptr;
+                struct ggml_tensor * Kcur = nullptr;
+                struct ggml_tensor * Vcur = nullptr;
 
-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
+                if (model.layers[il].wqkv) {
+                    cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+                    cb(cur, "wqkv", il);
 
-                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+
+                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                } else {
+                    Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+                    Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+                    Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+                }
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);

From ee8243adaa9a9f51ff449213383874e49efe368f Mon Sep 17 00:00:00 2001
From: makomk <makosoft@googlemail.com>
Date: Sat, 13 Jan 2024 14:16:11 +0000
Subject: [PATCH 007/138] server : fix crash with multimodal models without BOS
 token (#4904)

---
 examples/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c1ab8f9dc..7b33aea1f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1835,7 +1835,7 @@ struct llama_server_context
 
                     slot.cache_tokens = prompt_tokens;
 
-                    if (slot.n_past == slot.num_prompt_tokens)
+                    if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
                     {
                         // we have to evaluate at least 1 token to generate logits.
                         LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);

From 356327feb3f66980ab687040495d722696d98970 Mon Sep 17 00:00:00 2001
From: Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
Date: Sat, 13 Jan 2024 09:20:46 -0500
Subject: [PATCH 008/138] server : fix deadlock that occurs in multi-prompt
 scenarios (#4905)

* * fix deadlock

* * dont ruint all whitespace
---
 examples/server/server.cpp | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7b33aea1f..79eacf828 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1350,14 +1350,17 @@ struct llama_server_context
             res.result_json["model"] = slot.oaicompat_model;
         }
 
+        queue_results.push_back(res);
+        condition_results.notify_all();
+
+        // done with results, unlock
+        lock.unlock();
+
         // parent multitask, if any, needs to be updated
         if (slot.multitask_id != -1)
         {
             update_multi_task(slot.multitask_id, slot.task_id, res);
         }
-
-        queue_results.push_back(res);
-        condition_results.notify_all();
     }
 
     void send_embedding(llama_client_slot &slot)
@@ -1603,6 +1606,7 @@ struct llama_server_context
         }
 
         // remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
+        std::vector<task_result> agg_results;
         auto queue_iterator = queue_multitasks.begin();
         while (queue_iterator != queue_multitasks.end())
         {
@@ -1623,8 +1627,9 @@ struct llama_server_context
                 }
                 aggregate_result.result_json = json{ "results", result_jsons };
 
-                std::lock_guard<std::mutex> lock(mutex_results);
-                queue_results.push_back(aggregate_result);
+
+                agg_results.push_back(aggregate_result);
+
                 condition_results.notify_all();
 
                 queue_iterator = queue_multitasks.erase(queue_iterator);
@@ -1634,6 +1639,13 @@ struct llama_server_context
                 ++queue_iterator;
             }
         }
+
+        // done with tasks, unlock
+        lock.unlock();
+
+        // copy aggregate results of complete multi-tasks to the results queue
+        std::lock_guard<std::mutex> lock_results(mutex_results);
+        queue_results.insert(queue_results.end(), agg_results.begin(), agg_results.end());
     }
 
     bool update_slots() {

From 7dc78764e2ff86512e6e31cb0fcb8087df4b4708 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sat, 13 Jan 2024 15:52:53 +0100
Subject: [PATCH 009/138] compare-llama-bench: tweak output format (#4910)

---
 scripts/compare-llama-bench.py | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index bc1714487..70737f976 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -10,15 +10,15 @@ import sqlite3
 try:
     import git
     from tabulate import tabulate
-except ImportError:
+except ImportError as e:
     print("ERROR: the following Python libraries are required: GitPython, tabulate.")
-    sys.exit(1)
+    raise e
 
 # Properties by which to differentiate results per commit:
 KEY_PROPERTIES = [
-    "cuda", "opencl", "metal", "gpu_blas", "blas", "cpu_info", "gpu_info", "model_filename",
-    "model_type", "model_size", "model_n_params", "n_batch", "n_threads", "type_k", "type_v",
-    "n_gpu_layers", "main_gpu", "no_kv_offload", "mul_mat_q", "tensor_split", "n_prompt", "n_gen"
+    "cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas",
+    "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads",
+    "type_k", "type_v", "no_kv_offload", "mul_mat_q", "tensor_split", "n_prompt", "n_gen"
 ]
 
 # Properties that are boolean and are converted to Yes/No for the table:
@@ -37,6 +37,7 @@ PRETTY_NAMES = {
 DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.
 DEFAULT_HIDE = ["model_filename"]  # Always hide these properties by default.
 GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "]  # Strip prefixes for smaller tables.
+MODEL_SUFFIX_REPLACE = {" - Small": "_S", " - Medium": "_M", " - Large": "_L"}
 
 DESCRIPTION = """Creates tables from llama-bench data written to an SQLite database. Example usage (Linux):
 
@@ -308,8 +309,13 @@ else:
         if gpu_blas and "gpu_info" not in properties_different:
             show.append("gpu_info")
 
-    show += DEFAULT_SHOW
     show += properties_different
+
+    index_default = 0
+    for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]:
+        if prop in show:
+            index_default += 1
+    show = show[:index_default] + DEFAULT_SHOW + show[index_default:]
     for prop in DEFAULT_HIDE:
         try:
             show.remove(prop)
@@ -334,6 +340,12 @@ for bool_property in BOOL_PROPERTIES:
         for row_table in table:
             row_table[ip] = "Yes" if int(row_table[ip]) == 1 else "No"
 
+if "model_type" in show:
+    ip = show.index("model_type")
+    for (old, new) in MODEL_SUFFIX_REPLACE.items():
+        for row_table in table:
+            row_table[ip] = row_table[ip].replace(old, new)
+
 if "model_size" in show:
     ip = show.index("model_size")
     for row_table in table:
@@ -341,10 +353,16 @@ if "model_size" in show:
 
 if "gpu_info" in show:
     ip = show.index("gpu_info")
-    for gns in GPU_NAME_STRIP:
-        for row_table in table:
+    for row_table in table:
+        for gns in GPU_NAME_STRIP:
             row_table[ip] = row_table[ip].replace(gns, "")
 
+        gpu_names = row_table[ip].split("/")
+        num_gpus = len(gpu_names)
+        all_names_the_same = len(set(gpu_names)) == 1
+        if len(gpu_names) >= 2 and all_names_the_same:
+            row_table[ip] = f"{num_gpus}x {gpu_names[0]}"
+
 headers  = [PRETTY_NAMES[p] for p in show]
 headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
 

From b38b5e93ae31019e87f692b69d27124eae6aac02 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 13 Jan 2024 18:03:45 +0200
Subject: [PATCH 010/138] metal : refactor kernel loading code (#4794)

* metal : detect more GPU families

* metal : refactor kernel loading

* metal : set kernel family requirements

* metal : fix kernel init + fix compile options

* metal : take into account simdgroup reduction support

* metal : print only skipped kernels

* metal : fix check for simdgroup reduction support

* metal : check for Metal 3

* metal : free allocations

* metal : normalize encoder:setComputePipelineStatus calls

ggml-ci

* metal : fix Metal3 family check

ggml-ci

* metal : check for simdgroup matrix mul. feature

ggml-ci
---
 ggml-metal.m | 1048 +++++++++++++++++++++++++-------------------------
 1 file changed, 530 insertions(+), 518 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index c03624073..6c28a7ee3 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -26,6 +26,8 @@
 
 #define GGML_MAX_CONCUR (2*GGML_DEFAULT_GRAPH_SIZE)
 
+#define GGML_METAL_MAX_KERNELS 256
+
 struct ggml_metal_buffer {
     const char * name;
 
@@ -35,6 +37,134 @@ struct ggml_metal_buffer {
     id<MTLBuffer> metal;
 };
 
+struct ggml_metal_kernel {
+    id<MTLFunction>             function;
+    id<MTLComputePipelineState> pipeline;
+};
+
+enum ggml_metal_kernel_type {
+    GGML_METAL_KERNEL_TYPE_ADD,
+    GGML_METAL_KERNEL_TYPE_ADD_ROW,
+    GGML_METAL_KERNEL_TYPE_MUL,
+    GGML_METAL_KERNEL_TYPE_MUL_ROW,
+    GGML_METAL_KERNEL_TYPE_DIV,
+    GGML_METAL_KERNEL_TYPE_DIV_ROW,
+    GGML_METAL_KERNEL_TYPE_SCALE,
+    GGML_METAL_KERNEL_TYPE_SCALE_4,
+    GGML_METAL_KERNEL_TYPE_TANH,
+    GGML_METAL_KERNEL_TYPE_RELU,
+    GGML_METAL_KERNEL_TYPE_GELU,
+    GGML_METAL_KERNEL_TYPE_GELU_QUICK,
+    GGML_METAL_KERNEL_TYPE_SILU,
+    GGML_METAL_KERNEL_TYPE_SOFT_MAX,
+    GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,
+    GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,
+    GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
+    GGML_METAL_KERNEL_TYPE_RMS_NORM,
+    GGML_METAL_KERNEL_TYPE_GROUP_NORM,
+    GGML_METAL_KERNEL_TYPE_NORM,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
+  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
+  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,
+  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
+    GGML_METAL_KERNEL_TYPE_ROPE_F32,
+    GGML_METAL_KERNEL_TYPE_ROPE_F16,
+    GGML_METAL_KERNEL_TYPE_ALIBI_F32,
+    GGML_METAL_KERNEL_TYPE_IM2COL_F16,
+    GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
+    GGML_METAL_KERNEL_TYPE_PAD_F32,
+    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
+    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
+    GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,
+  //GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,
+  //GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,
+    GGML_METAL_KERNEL_TYPE_CPY_F16_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_F16_F32,
+    GGML_METAL_KERNEL_TYPE_CONCAT,
+    GGML_METAL_KERNEL_TYPE_SQR,
+    GGML_METAL_KERNEL_TYPE_SUM_ROWS,
+
+    GGML_METAL_KERNEL_TYPE_COUNT
+};
+
 struct ggml_metal_context {
     int n_cb;
 
@@ -50,134 +180,13 @@ struct ggml_metal_context {
     int n_buffers;
     struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
 
+    struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
+
     int concur_list[GGML_MAX_CONCUR];
     int concur_list_len;
 
-    // custom kernels
-#define GGML_METAL_DECL_KERNEL(name) \
-    id<MTLFunction>             function_##name; \
-    id<MTLComputePipelineState> pipeline_##name
-
-    GGML_METAL_DECL_KERNEL(add);
-    GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast
-    GGML_METAL_DECL_KERNEL(mul);
-    GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
-    GGML_METAL_DECL_KERNEL(div);
-    GGML_METAL_DECL_KERNEL(div_row);
-    GGML_METAL_DECL_KERNEL(scale);
-    GGML_METAL_DECL_KERNEL(scale_4);
-    GGML_METAL_DECL_KERNEL(tanh);
-    GGML_METAL_DECL_KERNEL(relu);
-    GGML_METAL_DECL_KERNEL(gelu);
-    GGML_METAL_DECL_KERNEL(gelu_quick);
-    GGML_METAL_DECL_KERNEL(silu);
-    GGML_METAL_DECL_KERNEL(soft_max);
-    GGML_METAL_DECL_KERNEL(soft_max_4);
-    GGML_METAL_DECL_KERNEL(diag_mask_inf);
-    GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
-    GGML_METAL_DECL_KERNEL(get_rows_f32);
-    GGML_METAL_DECL_KERNEL(get_rows_f16);
-    GGML_METAL_DECL_KERNEL(get_rows_q4_0);
-    GGML_METAL_DECL_KERNEL(get_rows_q4_1);
-    GGML_METAL_DECL_KERNEL(get_rows_q5_0);
-    GGML_METAL_DECL_KERNEL(get_rows_q5_1);
-    GGML_METAL_DECL_KERNEL(get_rows_q8_0);
-    GGML_METAL_DECL_KERNEL(get_rows_q2_K);
-    GGML_METAL_DECL_KERNEL(get_rows_q3_K);
-    GGML_METAL_DECL_KERNEL(get_rows_q4_K);
-    GGML_METAL_DECL_KERNEL(get_rows_q5_K);
-    GGML_METAL_DECL_KERNEL(get_rows_q6_K);
-    GGML_METAL_DECL_KERNEL(get_rows_i32);
-    GGML_METAL_DECL_KERNEL(get_rows_iq2_xxs);
-    GGML_METAL_DECL_KERNEL(get_rows_iq2_xs);
-    GGML_METAL_DECL_KERNEL(rms_norm);
-    GGML_METAL_DECL_KERNEL(group_norm);
-    GGML_METAL_DECL_KERNEL(norm);
-    GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_f16_f16);
-    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row);
-    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
-    GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_q5_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_q5_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_iq2_xxs_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_iq2_xs_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32);
-    //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32);
-    //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_1row);
-    //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_l4);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_q4_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_q4_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_q5_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_q5_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_q8_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_q2_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_q3_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_q4_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xxs_f32);
-    GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xs_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_q5_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_q5_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_iq2_xxs_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_iq2_xs_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_q4_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_q5_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_q5_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_q8_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_q2_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_q3_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_q4_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xxs_f32);
-    GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xs_f32);
-    GGML_METAL_DECL_KERNEL(rope_f32);
-    GGML_METAL_DECL_KERNEL(rope_f16);
-    GGML_METAL_DECL_KERNEL(alibi_f32);
-    GGML_METAL_DECL_KERNEL(im2col_f16);
-    GGML_METAL_DECL_KERNEL(upscale_f32);
-    GGML_METAL_DECL_KERNEL(pad_f32);
-    GGML_METAL_DECL_KERNEL(argsort_f32_i32_asc);
-    GGML_METAL_DECL_KERNEL(argsort_f32_i32_desc);
-    GGML_METAL_DECL_KERNEL(leaky_relu_f32);
-    GGML_METAL_DECL_KERNEL(cpy_f32_f16);
-    GGML_METAL_DECL_KERNEL(cpy_f32_f32);
-    GGML_METAL_DECL_KERNEL(cpy_f32_q8_0);
-    GGML_METAL_DECL_KERNEL(cpy_f32_q4_0);
-    GGML_METAL_DECL_KERNEL(cpy_f32_q4_1);
-    //GGML_METAL_DECL_KERNEL(cpy_f32_q5_0);
-    //GGML_METAL_DECL_KERNEL(cpy_f32_q5_1);
-    GGML_METAL_DECL_KERNEL(cpy_f16_f16);
-    GGML_METAL_DECL_KERNEL(cpy_f16_f32);
-    GGML_METAL_DECL_KERNEL(concat);
-    GGML_METAL_DECL_KERNEL(sqr);
-    GGML_METAL_DECL_KERNEL(sum_rows);
-
-#undef GGML_METAL_DECL_KERNEL
+    bool support_simdgroup_reduction;
+    bool support_simdgroup_mm;
 };
 
 // MSL code
@@ -298,19 +307,22 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
                 return NULL;
             }
 
-            MTLCompileOptions* options = nil;
+            // dictionary of preprocessor macros
+            NSMutableDictionary * prep = [NSMutableDictionary dictionary];
+
 #ifdef GGML_QKK_64
-            options = [MTLCompileOptions new];
-            options.preprocessorMacros = @{ @"QK_K" : @(64) };
+            prep[@"QK_K"] = @(64);
 #endif
-            // try to disable fast-math
-            // NOTE: this seems to have no effect whatsoever
-            //       instead, in order to disable fast-math, we have to build default.metallib from the command line
-            //       using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
-            //       and go through the "pre-compiled library found" path above
+
+            MTLCompileOptions* options = [MTLCompileOptions new];
+            options.preprocessorMacros = prep;
+
             //[options setFastMathEnabled:false];
 
             ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+
+            [options release];
+            [prep release];
         }
 
         if (error) {
@@ -323,16 +335,41 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     // print MTL GPU family:
     GGML_METAL_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
 
+    const NSInteger MTLGPUFamilyMetal3 = 5001;
+
     // determine max supported GPU family
     // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
     // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-    for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
-        if ([ctx->device supportsFamily:i]) {
-            GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
-            break;
+    {
+        for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
+            if ([ctx->device supportsFamily:i]) {
+                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d  (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
+                break;
+            }
+        }
+
+        for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
+            if ([ctx->device supportsFamily:i]) {
+                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
+                break;
+            }
+        }
+
+        for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) {
+            if ([ctx->device supportsFamily:i]) {
+                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d  (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
+                break;
+            }
         }
     }
 
+    ctx->support_simdgroup_reduction  = [ctx->device supportsFamily:MTLGPUFamilyApple7];
+    ctx->support_simdgroup_reduction |= [ctx->device supportsFamily:MTLGPUFamilyMetal3];
+
+    ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7];
+
+    GGML_METAL_LOG_INFO("%s: simdgroup reduction support   = %s\n",       __func__, ctx->support_simdgroup_reduction ? "true" : "false");
+    GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n",       __func__, ctx->support_simdgroup_mm ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
     if (ctx->device.maxTransferRate != 0) {
@@ -346,141 +383,152 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     {
         NSError * error = nil;
 
+        for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
+            ctx->kernels[i].function = nil;
+            ctx->kernels[i].pipeline = nil;
+        }
+
         /*
-        GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
-                (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
-                (int) ctx->pipeline_##name.threadExecutionWidth); \
+            GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
+                    (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
+                    (int) kernel->pipeline.threadExecutionWidth); \
         */
-#define GGML_METAL_ADD_KERNEL(name) \
-        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
-        if (error) { \
-            GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
-            return NULL; \
+#define GGML_METAL_ADD_KERNEL(e, name, supported) \
+        if (supported) { \
+            struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
+            kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \
+            kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \
+            GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
+                    (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
+                    (int) kernel->pipeline.threadExecutionWidth); \
+            if (error) { \
+                GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+                return NULL; \
+            } \
+        } else { \
+            GGML_METAL_LOG_WARN("%s: skipping %-32s (not supported)\n", __func__, "kernel_"#name); \
         }
 
-        GGML_METAL_ADD_KERNEL(add);
-        GGML_METAL_ADD_KERNEL(add_row);
-        GGML_METAL_ADD_KERNEL(mul);
-        GGML_METAL_ADD_KERNEL(mul_row);
-        GGML_METAL_ADD_KERNEL(div);
-        GGML_METAL_ADD_KERNEL(div_row);
-        GGML_METAL_ADD_KERNEL(scale);
-        GGML_METAL_ADD_KERNEL(scale_4);
-        GGML_METAL_ADD_KERNEL(tanh);
-        GGML_METAL_ADD_KERNEL(relu);
-        GGML_METAL_ADD_KERNEL(gelu);
-        GGML_METAL_ADD_KERNEL(gelu_quick);
-        GGML_METAL_ADD_KERNEL(silu);
-        GGML_METAL_ADD_KERNEL(soft_max);
-        GGML_METAL_ADD_KERNEL(soft_max_4);
-        GGML_METAL_ADD_KERNEL(diag_mask_inf);
-        GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
-        GGML_METAL_ADD_KERNEL(get_rows_f32);
-        GGML_METAL_ADD_KERNEL(get_rows_f16);
-        GGML_METAL_ADD_KERNEL(get_rows_q4_0);
-        GGML_METAL_ADD_KERNEL(get_rows_q4_1);
-        GGML_METAL_ADD_KERNEL(get_rows_q5_0);
-        GGML_METAL_ADD_KERNEL(get_rows_q5_1);
-        GGML_METAL_ADD_KERNEL(get_rows_q8_0);
-        GGML_METAL_ADD_KERNEL(get_rows_q2_K);
-        GGML_METAL_ADD_KERNEL(get_rows_q3_K);
-        GGML_METAL_ADD_KERNEL(get_rows_q4_K);
-        GGML_METAL_ADD_KERNEL(get_rows_q5_K);
-        GGML_METAL_ADD_KERNEL(get_rows_q6_K);
-        GGML_METAL_ADD_KERNEL(get_rows_i32);
-        GGML_METAL_ADD_KERNEL(get_rows_iq2_xxs);
-        GGML_METAL_ADD_KERNEL(get_rows_iq2_xs);
-        GGML_METAL_ADD_KERNEL(rms_norm);
-        GGML_METAL_ADD_KERNEL(group_norm);
-        GGML_METAL_ADD_KERNEL(norm);
-        GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_f16_f16);
-        GGML_METAL_ADD_KERNEL(mul_mv_f16_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row);
-        GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
-        GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_q5_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_q5_1_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_iq2_xxs_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_iq2_xs_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32);
-        //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32);
-        //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_1row);
-        //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_l4);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_q4_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_q4_1_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_q5_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_q5_1_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_q8_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_q2_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_q3_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_q4_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xxs_f32);
-        GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xs_f32);
-        if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
-            GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q5_0_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q5_1_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_iq2_xxs_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_iq2_xs_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_q4_1_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_q5_0_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_q5_1_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_q8_0_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_q2_K_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_q3_K_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_q4_K_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xxs_f32);
-            GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xs_f32);
-        }
-        GGML_METAL_ADD_KERNEL(rope_f32);
-        GGML_METAL_ADD_KERNEL(rope_f16);
-        GGML_METAL_ADD_KERNEL(alibi_f32);
-        GGML_METAL_ADD_KERNEL(im2col_f16);
-        GGML_METAL_ADD_KERNEL(upscale_f32);
-        GGML_METAL_ADD_KERNEL(pad_f32);
-        GGML_METAL_ADD_KERNEL(argsort_f32_i32_asc);
-        GGML_METAL_ADD_KERNEL(argsort_f32_i32_desc);
-        GGML_METAL_ADD_KERNEL(leaky_relu_f32);
-        GGML_METAL_ADD_KERNEL(cpy_f32_f16);
-        GGML_METAL_ADD_KERNEL(cpy_f32_f32);
-        GGML_METAL_ADD_KERNEL(cpy_f32_q8_0);
-        GGML_METAL_ADD_KERNEL(cpy_f32_q4_0);
-        GGML_METAL_ADD_KERNEL(cpy_f32_q4_1);
-        //GGML_METAL_ADD_KERNEL(cpy_f32_q5_0);
-        //GGML_METAL_ADD_KERNEL(cpy_f32_q5_1);
-        GGML_METAL_ADD_KERNEL(cpy_f16_f16);
-        GGML_METAL_ADD_KERNEL(cpy_f16_f32);
-        GGML_METAL_ADD_KERNEL(concat);
-        GGML_METAL_ADD_KERNEL(sqr);
-        GGML_METAL_ADD_KERNEL(sum_rows);
+        // simd_sum and simd_max requires MTLGPUFamilyApple7
 
-#undef GGML_METAL_ADD_KERNEL
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD,                       add,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW,                   add_row,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL,                       mul,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW,                   mul_row,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV,                       div,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW,                   div_row,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE,                     scale,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4,                   scale_4,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH,                      tanh,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU,                      relu,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU,                      gelu,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK,                gelu_quick,             true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,                      silu,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX,                  soft_max,               ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,                soft_max_4,             ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,             diag_mask_inf,          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,           diag_mask_inf_8,        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,              get_rows_f32,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,              get_rows_f16,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,             get_rows_q4_0,          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,             get_rows_q4_1,          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,             get_rows_q5_0,          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1,             get_rows_q5_1,          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0,             get_rows_q8_0,          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K,             get_rows_q2_K,          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K,             get_rows_q3_K,          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K,             get_rows_q4_K,          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K,             get_rows_q5_K,          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,             get_rows_q6_K,          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,          get_rows_iq2_xxs,       true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,           get_rows_iq2_xs,        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,              get_rows_i32,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                  rms_norm,               ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                group_norm,             ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM,                      norm,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,            mul_mv_f32_f32,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,            mul_mv_f16_f16,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,            mul_mv_f16_f32,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,       mul_mv_f16_f32_1row,    ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,         mul_mv_f16_f32_l4,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,           mul_mv_q4_0_f32,        ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,           mul_mv_q4_1_f32,        ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,           mul_mv_q5_0_f32,        ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,           mul_mv_q5_1_f32,        ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,           mul_mv_q8_0_f32,        ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,           mul_mv_q2_K_f32,        ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,           mul_mv_q3_K_f32,        ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,           mul_mv_q4_K_f32,        ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,           mul_mv_q5_K_f32,        ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,           mul_mv_q6_K_f32,        ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,        mul_mv_iq2_xxs_f32,     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,         mul_mv_iq2_xs_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,         mul_mv_id_f32_f32,      ctx->support_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,         mul_mv_id_f16_f16,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,         mul_mv_id_f16_f32,      ctx->support_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,    mul_mv_id_f16_f32_1row, ctx->support_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,      mul_mv_id_f16_f32_l4,   ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,        mul_mv_id_q4_0_f32,     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,        mul_mv_id_q4_1_f32,     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,        mul_mv_id_q5_0_f32,     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,        mul_mv_id_q5_1_f32,     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,        mul_mv_id_q8_0_f32,     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,        mul_mv_id_q2_K_f32,     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,        mul_mv_id_q3_K_f32,     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,        mul_mv_id_q4_K_f32,     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,        mul_mv_id_q5_K_f32,     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,        mul_mv_id_q6_K_f32,     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,     mul_mv_id_iq2_xxs_f32,  ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,      mul_mv_id_iq2_xs_f32,   ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,            mul_mm_f32_f32,         ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,            mul_mm_f16_f32,         ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,           mul_mm_q4_0_f32,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,           mul_mm_q4_1_f32,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,           mul_mm_q5_0_f32,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,           mul_mm_q5_1_f32,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,           mul_mm_q8_0_f32,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,           mul_mm_q2_K_f32,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,           mul_mm_q3_K_f32,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,           mul_mm_q4_K_f32,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,           mul_mm_q5_K_f32,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,           mul_mm_q6_K_f32,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,        mul_mm_iq2_xxs_f32,     ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,         mul_mm_iq2_xs_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,         mul_mm_id_f32_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,         mul_mm_id_f16_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,        mul_mm_id_q4_0_f32,     ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,        mul_mm_id_q4_1_f32,     ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,        mul_mm_id_q5_0_f32,     ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,        mul_mm_id_q5_1_f32,     ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,        mul_mm_id_q8_0_f32,     ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,        mul_mm_id_q2_K_f32,     ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,        mul_mm_id_q3_K_f32,     ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,        mul_mm_id_q4_K_f32,     ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,        mul_mm_id_q5_K_f32,     ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,        mul_mm_id_q6_K_f32,     ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,     mul_mm_id_iq2_xxs_f32,  ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,      mul_mm_id_iq2_xs_f32,   ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32,                  rope_f32,               true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16,                  rope_f16,               true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32,                 alibi_f32,              true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                im2col_f16,             true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,               upscale_f32,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                   pad_f32,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,       argsort_f32_i32_asc,    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,      argsort_f32_i32_desc,   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,            leaky_relu_f32,         true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,               cpy_f32_f16,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,               cpy_f32_f32,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,              cpy_f32_q8_0,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,              cpy_f32_q4_0,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,              cpy_f32_q4_1,           true);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,              cpy_f32_q5_0,           true);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,              cpy_f32_q5_1,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16,               cpy_f16_f16,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32,               cpy_f16_f32,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT,                    concat,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR,                       sqr,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                  sum_rows,               true);
     }
 
     return ctx;
@@ -488,137 +536,21 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 
 void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
-#define GGML_METAL_DEL_KERNEL(name) \
-    [ctx->function_##name release]; \
-    [ctx->pipeline_##name release];
-
-    GGML_METAL_DEL_KERNEL(add);
-    GGML_METAL_DEL_KERNEL(add_row);
-    GGML_METAL_DEL_KERNEL(mul);
-    GGML_METAL_DEL_KERNEL(mul_row);
-    GGML_METAL_DEL_KERNEL(div);
-    GGML_METAL_DEL_KERNEL(div_row);
-    GGML_METAL_DEL_KERNEL(scale);
-    GGML_METAL_DEL_KERNEL(scale_4);
-    GGML_METAL_DEL_KERNEL(tanh);
-    GGML_METAL_DEL_KERNEL(relu);
-    GGML_METAL_DEL_KERNEL(gelu);
-    GGML_METAL_DEL_KERNEL(gelu_quick);
-    GGML_METAL_DEL_KERNEL(silu);
-    GGML_METAL_DEL_KERNEL(soft_max);
-    GGML_METAL_DEL_KERNEL(soft_max_4);
-    GGML_METAL_DEL_KERNEL(diag_mask_inf);
-    GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
-    GGML_METAL_DEL_KERNEL(get_rows_f32);
-    GGML_METAL_DEL_KERNEL(get_rows_f16);
-    GGML_METAL_DEL_KERNEL(get_rows_q4_0);
-    GGML_METAL_DEL_KERNEL(get_rows_q4_1);
-    GGML_METAL_DEL_KERNEL(get_rows_q5_0);
-    GGML_METAL_DEL_KERNEL(get_rows_q5_1);
-    GGML_METAL_DEL_KERNEL(get_rows_q8_0);
-    GGML_METAL_DEL_KERNEL(get_rows_q2_K);
-    GGML_METAL_DEL_KERNEL(get_rows_q3_K);
-    GGML_METAL_DEL_KERNEL(get_rows_q4_K);
-    GGML_METAL_DEL_KERNEL(get_rows_q5_K);
-    GGML_METAL_DEL_KERNEL(get_rows_q6_K);
-    GGML_METAL_DEL_KERNEL(get_rows_i32);
-    GGML_METAL_DEL_KERNEL(get_rows_iq2_xxs);
-    GGML_METAL_DEL_KERNEL(get_rows_iq2_xs);
-    GGML_METAL_DEL_KERNEL(rms_norm);
-    GGML_METAL_DEL_KERNEL(group_norm);
-    GGML_METAL_DEL_KERNEL(norm);
-    GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_f16_f16);
-    GGML_METAL_DEL_KERNEL(mul_mv_f16_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row);
-    GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
-    GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_q5_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_q5_1_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_iq2_xxs_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_iq2_xs_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32);
-    //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32);
-    //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_1row);
-    //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_l4);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_q4_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_q4_1_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_q5_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_q5_1_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_q8_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_q2_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_q3_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_q4_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xxs_f32);
-    GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xs_f32);
-    if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
-        GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q5_0_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q5_1_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_iq2_xxs_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_iq2_xs_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_q4_1_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_q5_0_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_q5_1_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_q8_0_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_q2_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_q3_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_q4_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xxs_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xs_f32);
-    }
-    GGML_METAL_DEL_KERNEL(rope_f32);
-    GGML_METAL_DEL_KERNEL(rope_f16);
-    GGML_METAL_DEL_KERNEL(alibi_f32);
-    GGML_METAL_DEL_KERNEL(im2col_f16);
-    GGML_METAL_DEL_KERNEL(upscale_f32);
-    GGML_METAL_DEL_KERNEL(pad_f32);
-    GGML_METAL_DEL_KERNEL(argsort_f32_i32_asc);
-    GGML_METAL_DEL_KERNEL(argsort_f32_i32_desc);
-    GGML_METAL_DEL_KERNEL(leaky_relu_f32);
-    GGML_METAL_DEL_KERNEL(cpy_f32_f16);
-    GGML_METAL_DEL_KERNEL(cpy_f32_f32);
-    GGML_METAL_DEL_KERNEL(cpy_f32_q8_0);
-    GGML_METAL_DEL_KERNEL(cpy_f32_q4_0);
-    GGML_METAL_DEL_KERNEL(cpy_f32_q4_1);
-    //GGML_METAL_DEL_KERNEL(cpy_f32_q5_0);
-    //GGML_METAL_DEL_KERNEL(cpy_f32_q5_1);
-    GGML_METAL_DEL_KERNEL(cpy_f16_f16);
-    GGML_METAL_DEL_KERNEL(cpy_f16_f32);
-    GGML_METAL_DEL_KERNEL(concat);
-    GGML_METAL_DEL_KERNEL(sqr);
-    GGML_METAL_DEL_KERNEL(sum_rows);
-
-#undef GGML_METAL_DEL_KERNEL
 
     for (int i = 0; i < ctx->n_buffers; ++i) {
         [ctx->buffers[i].metal release];
     }
 
+    for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
+        if (ctx->kernels[i].pipeline) {
+            [ctx->kernels[i].pipeline release];
+        }
+
+        if (ctx->kernels[i].function) {
+            [ctx->kernels[i].function release];
+        }
+    }
+
     [ctx->library release];
     [ctx->queue release];
     [ctx->device release];
@@ -930,7 +862,7 @@ void ggml_metal_graph_find_concurrency(
     }
 }
 
-static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
+static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -956,9 +888,11 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_SUM_ROWS:
+            return true;
         case GGML_OP_SOFT_MAX:
         case GGML_OP_RMS_NORM:
         case GGML_OP_GROUP_NORM:
+            return ctx->support_simdgroup_reduction;
         case GGML_OP_NORM:
         case GGML_OP_ALIBI:
         case GGML_OP_ROPE:
@@ -967,9 +901,10 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
         case GGML_OP_PAD:
         case GGML_OP_ARGSORT:
         case GGML_OP_LEAKY_RELU:
+            return true;
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
-            return true;
+            return ctx->support_simdgroup_reduction;
         case GGML_OP_CPY:
         case GGML_OP_DUP:
         case GGML_OP_CONT:
@@ -1007,6 +942,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
             return false;
     }
 }
+
 bool ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {
@@ -1077,7 +1013,7 @@ bool ggml_metal_graph_compute(
                         } break;
                 }
 
-                if (!ggml_metal_supports_op(dst)) {
+                if (!ggml_metal_supports_op(ctx, dst)) {
                     GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
                     GGML_ASSERT(!"unsupported op");
                 }
@@ -1143,7 +1079,9 @@ bool ggml_metal_graph_compute(
                         {
                             const int64_t nb = ne00;
 
-                            [encoder setComputePipelineState:ctx->pipeline_concat];
+                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
+
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
@@ -1197,18 +1135,18 @@ bool ggml_metal_graph_compute(
 
                                 nb = ne00 / 4;
                                 switch (dst->op) {
-                                    case GGML_OP_ADD: pipeline = ctx->pipeline_add_row; break;
-                                    case GGML_OP_MUL: pipeline = ctx->pipeline_mul_row; break;
-                                    case GGML_OP_DIV: pipeline = ctx->pipeline_div_row; break;
+                                    case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
+                                    case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
+                                    case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
                                     default: GGML_ASSERT(false);
                                 }
 
                                 bcast_row = true;
                             } else {
                                 switch (dst->op) {
-                                    case GGML_OP_ADD: pipeline = ctx->pipeline_add; break;
-                                    case GGML_OP_MUL: pipeline = ctx->pipeline_mul; break;
-                                    case GGML_OP_DIV: pipeline = ctx->pipeline_div; break;
+                                    case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
+                                    case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
+                                    case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
                                     default: GGML_ASSERT(false);
                                 }
                             }
@@ -1275,9 +1213,9 @@ bool ggml_metal_graph_compute(
                                 // not sure how to avoid this
                                 // TODO: make a simpler cpy_bytes kernel
 
-                                const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00);
+                                const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
 
-                                [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
+                                [encoder setComputePipelineState:pipeline];
                                 [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
                                 [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
                                 [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
@@ -1297,10 +1235,14 @@ bool ggml_metal_graph_compute(
                                 [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
                                 [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
 
+                                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
+
                                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                             }
 
-                            [encoder setComputePipelineState:ctx->pipeline_add];
+                            const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
+
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
@@ -1330,7 +1272,7 @@ bool ggml_metal_graph_compute(
                             [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
                             [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
 
-                            const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00);
+                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
 
                             [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                         } break;
@@ -1342,13 +1284,16 @@ bool ggml_metal_graph_compute(
 
                             int64_t n = ggml_nelements(dst);
 
+                            id<MTLComputePipelineState> pipeline = nil;
+
                             if (n % 4 == 0) {
                                 n /= 4;
-                                [encoder setComputePipelineState:ctx->pipeline_scale_4];
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline;
                             } else {
-                                [encoder setComputePipelineState:ctx->pipeline_scale];
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline;
                             }
 
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
                             [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
@@ -1359,7 +1304,9 @@ bool ggml_metal_graph_compute(
                         switch (ggml_get_unary_op(gf->nodes[i])) {
                             case GGML_UNARY_OP_TANH:
                                 {
-                                    [encoder setComputePipelineState:ctx->pipeline_tanh];
+                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
+
+                                    [encoder setComputePipelineState:pipeline];
                                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                     [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
@@ -1369,7 +1316,9 @@ bool ggml_metal_graph_compute(
                                 } break;
                             case GGML_UNARY_OP_RELU:
                                 {
-                                    [encoder setComputePipelineState:ctx->pipeline_relu];
+                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline;
+
+                                    [encoder setComputePipelineState:pipeline];
                                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                     [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
@@ -1379,7 +1328,9 @@ bool ggml_metal_graph_compute(
                                 } break;
                             case GGML_UNARY_OP_GELU:
                                 {
-                                    [encoder setComputePipelineState:ctx->pipeline_gelu];
+                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
+
+                                    [encoder setComputePipelineState:pipeline];
                                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                     [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
@@ -1390,7 +1341,9 @@ bool ggml_metal_graph_compute(
                                 } break;
                             case GGML_UNARY_OP_GELU_QUICK:
                                 {
-                                    [encoder setComputePipelineState:ctx->pipeline_gelu_quick];
+                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
+
+                                    [encoder setComputePipelineState:pipeline];
                                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                     [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
@@ -1401,7 +1354,9 @@ bool ggml_metal_graph_compute(
                                 } break;
                             case GGML_UNARY_OP_SILU:
                                 {
-                                    [encoder setComputePipelineState:ctx->pipeline_silu];
+                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
+
+                                    [encoder setComputePipelineState:pipeline];
                                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                     [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
@@ -1420,18 +1375,23 @@ bool ggml_metal_graph_compute(
                         {
                             GGML_ASSERT(ggml_is_contiguous(src0));
 
-                            [encoder setComputePipelineState:ctx->pipeline_sqr];
+                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline;
+
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
 
                             const int64_t n = ggml_nelements(dst);
+
                             [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                         } break;
                     case GGML_OP_SUM_ROWS:
                         {
                             GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
 
-                            [encoder setComputePipelineState:ctx->pipeline_sum_rows];
+                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
+
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                             [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
@@ -1465,20 +1425,23 @@ bool ggml_metal_graph_compute(
                         {
                             int nth = 32; // SIMD width
 
+                            id<MTLComputePipelineState> pipeline = nil;
+
                             if (ne00%4 == 0) {
                                 while (nth < ne00/4 && nth < 256) {
                                     nth *= 2;
                                 }
-                                [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline;
                             } else {
                                 while (nth < ne00 && nth < 1024) {
                                     nth *= 2;
                                 }
-                                [encoder setComputePipelineState:ctx->pipeline_soft_max];
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
                             }
 
                             const float scale = ((float *) dst->op_params)[0];
 
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
                             if (id_src1) {
                                 [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
@@ -1498,11 +1461,15 @@ bool ggml_metal_graph_compute(
                         {
                             const int n_past = ((int32_t *)(dst->op_params))[0];
 
+                            id<MTLComputePipelineState> pipeline = nil;
+
                             if (ne00%8 == 0) {
-                                [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline;
                             } else {
-                                [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
                             }
+
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                             [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
@@ -1562,23 +1529,28 @@ bool ggml_metal_graph_compute(
                                 ne00 % 32 == 0 && ne00 >= 64 &&
                                 (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
                                 //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+
+                                id<MTLComputePipelineState> pipeline = nil;
+
                                 switch (src0->type) {
-                                    case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32];  break;
-                                    case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32];  break;
-                                    case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
-                                    case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
-                                    case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_0_f32]; break;
-                                    case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_1_f32]; break;
-                                    case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break;
-                                    case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
-                                    case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
-                                    case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
-                                    case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
-                                    case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
-                                    case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xxs_f32]; break;
-                                    case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xs_f32]; break;
+                                    case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
+                                    case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
+                                    case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
+                                    case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
+                                    case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
                                     default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
                                 }
+
+                                [encoder setComputePipelineState:pipeline];
                                 [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
                                 [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
                                 [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
@@ -1602,12 +1574,14 @@ bool ggml_metal_graph_compute(
                                 int nrows = 1;
                                 //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
 
+                                id<MTLComputePipelineState> pipeline = nil;
+
                                 // use custom matrix x vector kernel
                                 switch (src0t) {
                                     case GGML_TYPE_F32:
                                         {
                                             GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
                                             nrows = 4;
                                         } break;
                                     case GGML_TYPE_F16:
@@ -1616,16 +1590,16 @@ bool ggml_metal_graph_compute(
                                             nth1 = 1;
                                             if (src1t == GGML_TYPE_F32) {
                                                 if (ne11 * ne12 < 4) {
-                                                    [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row];
+                                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
                                                 } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                                    [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4];
+                                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
                                                     nrows = ne11;
                                                 } else {
-                                                    [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
+                                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
                                                     nrows = 4;
                                                 }
                                             } else {
-                                                [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f16];
+                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
                                                 nrows = 4;
                                             }
                                         } break;
@@ -1633,73 +1607,73 @@ bool ggml_metal_graph_compute(
                                         {
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q4_1:
                                         {
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q5_0:
                                         {
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q5_1:
                                         {
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q8_0:
                                         {
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q2_K:
                                         {
                                             nth0 = 2;
                                             nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q3_K:
                                         {
                                             nth0 = 2;
                                             nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q4_K:
                                         {
                                             nth0 = 4; //1;
                                             nth1 = 8; //32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q5_K:
                                         {
                                             nth0 = 2;
                                             nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q6_K:
                                         {
                                             nth0 = 2;
                                             nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_IQ2_XXS:
                                         {
                                             nth0 = 4;
                                             nth1 = 16;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xxs_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_IQ2_XS:
                                         {
                                             nth0 = 4;
                                             nth1 = 16;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xs_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
                                         } break;
                                     default:
                                         {
@@ -1712,6 +1686,7 @@ bool ggml_metal_graph_compute(
                                     GGML_ASSERT(ne00 >= nth0*nth1);
                                 }
 
+                                [encoder setComputePipelineState:pipeline];
                                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                 [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
@@ -1818,23 +1793,28 @@ bool ggml_metal_graph_compute(
                             if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
                                 ne20 % 32 == 0 && ne20 >= 64 &&
                                 ne11 > ne11_mm_min) {
+
+                                id<MTLComputePipelineState> pipeline = nil;
+
                                 switch (src2->type) {
-                                    case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32];  break;
-                                    case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32];  break;
-                                    case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_0_f32]; break;
-                                    case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_1_f32]; break;
-                                    case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_0_f32]; break;
-                                    case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_1_f32]; break;
-                                    case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q8_0_f32]; break;
-                                    case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q2_K_f32]; break;
-                                    case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q3_K_f32]; break;
-                                    case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_K_f32]; break;
-                                    case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break;
-                                    case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break;
-                                    case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xxs_f32]; break;
-                                    case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xs_f32]; break;
+                                    case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
+                                    case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
+                                    case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32   ].pipeline; break;
+                                    case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32   ].pipeline; break;
+                                    case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
+                                    case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
                                     default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
                                 }
+
+                                [encoder setComputePipelineState:pipeline];
                                 [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
                                 [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
                                 [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
@@ -1874,91 +1854,93 @@ bool ggml_metal_graph_compute(
                                 int nrows = 1;
                                 //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
 
+                                id<MTLComputePipelineState> pipeline = nil;
+
                                 // use custom matrix x vector kernel
                                 switch (src2t) {
                                     case GGML_TYPE_F32:
                                         {
                                             GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f32_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_F16:
                                         {
                                             GGML_ASSERT(src1t == GGML_TYPE_F32);
                                             nth0 = 32;
                                             nth1 = 1;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f16_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q4_0:
                                         {
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_0_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q4_1:
                                         {
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_1_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q5_0:
                                         {
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_0_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q5_1:
                                         {
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_1_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q8_0:
                                         {
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q8_0_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q2_K:
                                         {
                                             nth0 = 2;
                                             nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q2_K_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q3_K:
                                         {
                                             nth0 = 2;
                                             nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q3_K_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q4_K:
                                         {
                                             nth0 = 4; //1;
                                             nth1 = 8; //32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_K_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q5_K:
                                         {
                                             nth0 = 2;
                                             nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_K_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_Q6_K:
                                         {
                                             nth0 = 2;
                                             nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q6_K_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_IQ2_XXS:
                                         {
                                             nth0 = 4;
                                             nth1 = 16;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xxs_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline;
                                         } break;
                                     case GGML_TYPE_IQ2_XS:
                                         {
                                             nth0 = 4;
                                             nth1 = 16;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xs_f32];
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
                                         } break;
                                     default:
                                         {
@@ -1973,6 +1955,7 @@ bool ggml_metal_graph_compute(
 
                                 const int64_t _ne1 = 1; // kernels needs a reference in constant memory
 
+                                [encoder setComputePipelineState:pipeline];
                                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                 [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
@@ -2040,25 +2023,28 @@ bool ggml_metal_graph_compute(
                         } break;
                     case GGML_OP_GET_ROWS:
                         {
+                            id<MTLComputePipelineState> pipeline = nil;
+
                             switch (src0->type) {
-                                case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f32];  break;
-                                case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16];  break;
-                                case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
-                                case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
-                                case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_0]; break;
-                                case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_1]; break;
-                                case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break;
-                                case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
-                                case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
-                                case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
-                                case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
-                                case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
-                                case GGML_TYPE_I32:  [encoder setComputePipelineState:ctx->pipeline_get_rows_i32];  break;
-                                case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xxs]; break;
-                                case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xs]; break;
+                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32    ].pipeline; break;
+                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16    ].pipeline; break;
+                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0   ].pipeline; break;
+                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1   ].pipeline; break;
+                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0   ].pipeline; break;
+                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1   ].pipeline; break;
+                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0   ].pipeline; break;
+                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K   ].pipeline; break;
+                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K   ].pipeline; break;
+                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K   ].pipeline; break;
+                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K   ].pipeline; break;
+                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K   ].pipeline; break;
+                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
+                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
+                                case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
                                 default: GGML_ASSERT(false && "not implemented");
                             }
 
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
                             [encoder setBuffer:id_dst      offset:offs_dst  atIndex:2];
@@ -2086,7 +2072,9 @@ bool ggml_metal_graph_compute(
                                 nth *= 2;
                             }
 
-                            [encoder setComputePipelineState:ctx->pipeline_rms_norm];
+                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
+
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
                             [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
@@ -2115,7 +2103,9 @@ bool ggml_metal_graph_compute(
                             //    nth *= 2;
                             //}
 
-                            [encoder setComputePipelineState:ctx->pipeline_group_norm];
+                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
+
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
                             [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
                             [encoder setBytes:&ne00     length:sizeof( int64_t) atIndex:2];
@@ -2137,7 +2127,9 @@ bool ggml_metal_graph_compute(
 
                             const int nth = MIN(256, ne00);
 
-                            [encoder setComputePipelineState:ctx->pipeline_norm];
+                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline;
+
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
                             [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
@@ -2164,7 +2156,9 @@ bool ggml_metal_graph_compute(
                             const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
                             const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
-                            [encoder setComputePipelineState:ctx->pipeline_alibi_f32];
+                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline;
+
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                             [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
@@ -2209,12 +2203,15 @@ bool ggml_metal_graph_compute(
                             memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
                             memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
 
+                            id<MTLComputePipelineState> pipeline = nil;
+
                             switch (src0->type) {
-                                case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;
-                                case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_rope_f16]; break;
+                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F32].pipeline; break;
+                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F16].pipeline; break;
                                 default: GGML_ASSERT(false);
                             };
 
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
                             [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
                             [encoder setBuffer:id_dst      offset:offs_dst         atIndex:2];
@@ -2277,12 +2274,15 @@ bool ggml_metal_graph_compute(
                             const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
                             const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
 
+                            id<MTLComputePipelineState> pipeline = nil;
+
                             switch (src0->type) {
                                 case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break;
-                                case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_im2col_f16]; break;
+                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
                                 default: GGML_ASSERT(false);
                             };
 
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src1 offset:offs_src1        atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
                             [encoder setBytes:&ofs0    length:sizeof( int32_t) atIndex:2];
@@ -2305,7 +2305,9 @@ bool ggml_metal_graph_compute(
 
                             const int sf = dst->op_params[0];
 
-                            [encoder setComputePipelineState:ctx->pipeline_upscale_f32];
+                            const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
+
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                             [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
@@ -2326,7 +2328,7 @@ bool ggml_metal_graph_compute(
                             [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
                             [encoder setBytes:&sf   length:sizeof(sf)   atIndex:18];
 
-                            const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0);
+                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
 
                             [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                         } break;
@@ -2334,7 +2336,9 @@ bool ggml_metal_graph_compute(
                         {
                             GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
-                            [encoder setComputePipelineState:ctx->pipeline_pad_f32];
+                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
+
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                             [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
@@ -2367,12 +2371,15 @@ bool ggml_metal_graph_compute(
 
                             enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
 
+                            id<MTLComputePipelineState> pipeline = nil;
+
                             switch (order) {
-                                case GGML_SORT_ASC:  [encoder setComputePipelineState:ctx->pipeline_argsort_f32_i32_asc];  break;
-                                case GGML_SORT_DESC: [encoder setComputePipelineState:ctx->pipeline_argsort_f32_i32_desc]; break;
+                                case GGML_SORT_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
+                                case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
                                 default: GGML_ASSERT(false);
                             };
 
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
                             [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
@@ -2386,7 +2393,9 @@ bool ggml_metal_graph_compute(
                             float slope;
                             memcpy(&slope, dst->op_params, sizeof(float));
 
-                            [encoder setComputePipelineState:ctx->pipeline_leaky_relu_f32];
+                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
+
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
                             [encoder setBytes:&slope length:sizeof(slope) atIndex:2];
@@ -2403,33 +2412,36 @@ bool ggml_metal_graph_compute(
 
                             int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
 
+                            id<MTLComputePipelineState> pipeline = nil;
+
                             switch (src0t) {
                                 case GGML_TYPE_F32:
                                     {
                                         GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0);
 
                                         switch (dstt) {
-                                            case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16];  break;
-                                            case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];  break;
-                                            case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q8_0]; break;
-                                            case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_0]; break;
-                                            case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_1]; break;
-                                            //case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_0]; break;
-                                            //case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_1]; break;
+                                            case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline;  break;
+                                            case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;  break;
+                                            case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break;
+                                            case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break;
+                                            case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break;
+                                          //case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
+                                          //case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
                                             default: GGML_ASSERT(false && "not implemented");
                                         };
                                     } break;
                                 case GGML_TYPE_F16:
                                     {
                                         switch (dstt) {
-                                            case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
-                                            case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f32]; break;
+                                            case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
+                                            case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
                                             default: GGML_ASSERT(false && "not implemented");
                                         };
                                     } break;
                                 default: GGML_ASSERT(false && "not implemented");
                             }
 
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
                             [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
@@ -2794,9 +2806,9 @@ static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml
 }
 
 static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    return ggml_metal_supports_op(op);
+    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
 
-    UNUSED(backend);
+    return ggml_metal_supports_op(metal_ctx, op);
 }
 
 static struct ggml_backend_i ggml_backend_metal_i = {

From c30b1ef39aeba497a943416d2897d69fee055b96 Mon Sep 17 00:00:00 2001
From: texmex76 <40733439+texmex76@users.noreply.github.com>
Date: Sat, 13 Jan 2024 17:06:20 +0100
Subject: [PATCH 011/138] gguf : fix potential infinite for-loop (#4600)

Co-authored-by: Bernhard Gstrein <gstrein@informatik.uni-freiburg.de>
---
 ggml.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml.c b/ggml.c
index 6dbd7626c..de6ef34bd 100644
--- a/ggml.c
+++ b/ggml.c
@@ -19184,7 +19184,7 @@ void gguf_free(struct gguf_context * ctx) {
 
     if (ctx->kv) {
         // free string memory - not great..
-        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             if (kv->key.data) {
@@ -19200,7 +19200,7 @@ void gguf_free(struct gguf_context * ctx) {
             if (kv->type == GGUF_TYPE_ARRAY) {
                 if (kv->value.arr.data) {
                     if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                             struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
                             if (str->data) {
                                 free(str->data);
@@ -19216,7 +19216,7 @@ void gguf_free(struct gguf_context * ctx) {
     }
 
     if (ctx->infos) {
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
 
             if (info->name.data) {

From 722d33f34ec74c6f7046109f936d0928ffe171bc Mon Sep 17 00:00:00 2001
From: Yann Follet <131855179+YannFollet@users.noreply.github.com>
Date: Sun, 14 Jan 2024 00:09:08 +0800
Subject: [PATCH 012/138] main : add parameter --no-display-prompt (#4541)

* add the parameter : --no-display-prompt , combine with --log-disable it will display only the generated tokens

* remove empty line

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 common/common.cpp      | 6 +++++-
 common/common.h        | 1 +
 examples/main/main.cpp | 7 ++++++-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 322b9f91e..c11006bcb 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -617,6 +617,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.numa = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
+        } else if (arg == "--no-display-prompt") {
+            params.display_prompt = false;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -936,11 +938,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
     printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
 #endif
+    printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
+    printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf("  -gan N, --grp-attn-n N\n");
     printf("                        group-attention factor (default: %d)\n", params.grp_attn_n);
     printf("  -gaw N, --grp-attn-w N\n");
     printf("                        group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
-    printf("  --verbose-prompt      print prompt before generation\n");
     printf("  -dkvc, --dump-kv-cache\n");
     printf("                        verbose print of the KV cache\n");
     printf("  -nkvo, --no-kv-offload\n");
@@ -1582,6 +1585,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
     fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
+    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
 }
 
 //
diff --git a/common/common.h b/common/common.h
index f29be5b5a..096468243 100644
--- a/common/common.h
+++ b/common/common.h
@@ -126,6 +126,7 @@ struct gpt_params {
     bool use_mlock         = false; // use mlock to keep model in memory
     bool numa              = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt    = false; // print prompt tokens before generation
+    bool display_prompt    = true;  // print prompt before generation
     bool infill            = false; // use infill mode
     bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload     = false; // disable KV offloading
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index c53b29978..58b7f807a 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -477,6 +477,7 @@ int main(int argc, char ** argv) {
 
     bool is_antiprompt        = false;
     bool input_echo           = true;
+    bool display              = true;
     bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();
 
     int n_past             = 0;
@@ -491,6 +492,7 @@ int main(int argc, char ** argv) {
 
     // the first thing we will do is to output the prompt, so set color accordingly
     console::set_display(console::prompt);
+    display = params.display_prompt;
 
     std::vector<llama_token> embd;
     std::vector<llama_token> embd_guidance;
@@ -707,7 +709,7 @@ int main(int argc, char ** argv) {
         }
 
         // display text
-        if (input_echo) {
+        if (input_echo && display) {
             for (auto id : embd) {
                 const std::string token_str = llama_token_to_piece(ctx, id);
                 printf("%s", token_str.c_str());
@@ -724,6 +726,7 @@ int main(int argc, char ** argv) {
         // reset color to default if there is no pending user input
         if (input_echo && (int) embd_inp.size() == n_consumed) {
             console::set_display(console::reset);
+            display = true;
         }
 
         // if not currently processing queued inputs;
@@ -796,6 +799,7 @@ int main(int argc, char ** argv) {
 
                 // color user input only
                 console::set_display(console::user_input);
+                display = params.display_prompt;
 
                 std::string line;
                 bool another_line = true;
@@ -806,6 +810,7 @@ int main(int argc, char ** argv) {
 
                 // done taking input, reset color
                 console::set_display(console::reset);
+                display = true;
 
                 // Add tokens to embd only if the input buffer is non-empty
                 // Entering a empty line lets the user pass control back

From 6b48ed089377330cdb362970a51c1c89b6d857a8 Mon Sep 17 00:00:00 2001
From: Someone <sergei.kozlukov@aalto.fi>
Date: Sat, 13 Jan 2024 16:29:16 +0000
Subject: [PATCH 013/138] workflows: unbreak nix-build-aarch64, and split it
 out (#4915)

The fix should be just the `sudo apt-get update`
---
 .github/workflows/nix-ci-aarch64.yml | 55 ++++++++++++++++++++++++++++
 .github/workflows/nix-ci.yml         | 41 ---------------------
 2 files changed, 55 insertions(+), 41 deletions(-)
 create mode 100644 .github/workflows/nix-ci-aarch64.yml

diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml
new file mode 100644
index 000000000..be7c26d40
--- /dev/null
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -0,0 +1,55 @@
+name: Nix aarch64 builds
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+
+jobs:
+  nix-build-aarch64:
+    if: ${{ vars.CACHIX_NAME != '' }}
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+    - name: Install QEMU
+      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y qemu-user-static qemu-system-aarch64
+        sudo usermod -a -G kvm $USER
+    - name: Install Nix
+      uses: DeterminateSystems/nix-installer-action@v9
+      with:
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+        extra-conf: |
+          extra-platforms = aarch64-linux
+          extra-system-features = nixos-test kvm
+          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+    - uses: DeterminateSystems/magic-nix-cache-action@v2
+      with:
+        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
+    - name: Set-up cachix to push the results to
+      uses: cachix/cachix-action@v13
+      with:
+        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
+        name: ${{ vars.CACHIX_NAME }}
+    - name: Show all output paths
+      run: >
+          nix run github:nix-community/nix-eval-jobs
+          -- --gc-roots-dir gcroot
+          --flake
+          ".#packages.aarch64-linux"
+    - name: Build
+      run: >
+          nix run github:Mic92/nix-fast-build
+          -- --skip-cached --no-nom
+          --systems aarch64-linux
+          --flake
+          ".#checks.aarch64-linux"
diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
index a38c6ead4..845b93bfb 100644
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -69,44 +69,3 @@ jobs:
           -- --skip-cached --no-nom
           --flake
           ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
-  nix-build-aarch64:
-    if: ${{ vars.CACHIX_NAME != '' }}
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install QEMU
-      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
-      run: |
-        sudo apt-get install -y qemu-user-static qemu-system-aarch64
-        sudo usermod -a -G kvm $USER
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-platforms = aarch64-linux
-          extra-system-features = nixos-test kvm
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: ${{ vars.CACHIX_NAME }}
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.aarch64-linux"
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --systems aarch64-linux
-          --flake
-          ".#checks.aarch64-linux"

From df845cc982e7e2ea7b9900e29d55b15338faa78d Mon Sep 17 00:00:00 2001
From: David Friehs <david@friehs.info>
Date: Sat, 13 Jan 2024 17:29:43 +0100
Subject: [PATCH 014/138] llama : minimize size used for state save/load
 (#4820)

* examples : save-load-state: save only required state

* llama : only reserve n_vocab * n_batch at most for logits

llama_decode asserts that only n_batch tokens are passed each call, and
n_ctx is expected to be bigger than n_batch.

* llama : always reserve n_vocab * n_batch for logits

llama_context de-serialization breaks if the contexts have differing
capacity for logits and llama_decode will at maximum resize to
n_vocab * n_batch.

* llama : only save and restore used logits

for batch sizes of 512 this reduces save state in the best case by
around 62 MB, which can be a lot if planning to save on each message
to allow regenerating messages.

* llama : use ostringstream and istringstream for save and load

* llama : serialize rng into minimum amount of space required

* llama : break session version due to serialization changes
---
 examples/save-load-state/save-load-state.cpp | 21 ++++----
 llama.cpp                                    | 53 +++++++-------------
 llama.h                                      |  2 +-
 3 files changed, 29 insertions(+), 47 deletions(-)

diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 48d801110..ef952e2bd 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -45,13 +45,13 @@ int main(int argc, char ** argv) {
     // save state (rng, logits, embedding and kv_cache) to file
     {
         std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
+        const size_t written = llama_copy_state_data(ctx, state_mem.data());
 
-        {
-            FILE *fp_write = fopen("dump_state.bin", "wb");
-            llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
-            fwrite(state_mem.data(), 1, state_mem.size(), fp_write);
-            fclose(fp_write);
-        }
+        FILE *fp_write = fopen("dump_state.bin", "wb");
+        fwrite(state_mem.data(), 1, written, fp_write);
+        fclose(fp_write);
+
+        fprintf(stderr, "%s : serialized state into %zd out of a maximum of %zd bytes\n", __func__, written, state_mem.size());
     }
 
     // save state (last tokens)
@@ -100,18 +100,17 @@ int main(int argc, char ** argv) {
         std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
 
         FILE * fp_read = fopen("dump_state.bin", "rb");
+        const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+        fclose(fp_read);
 
-        const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read);
-        if (ret != state_mem.size()) {
+        if (read != llama_set_state_data(ctx2, state_mem.data())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
             llama_free(ctx2);
             llama_free_model(model);
             return 1;
         }
 
-        llama_set_state_data(ctx2, state_mem.data());
-
-        fclose(fp_read);
+        fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
     }
 
     // restore state (last tokens)
diff --git a/llama.cpp b/llama.cpp
index 1d2eb569f..275456088 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9379,12 +9379,8 @@ struct llama_context * llama_new_context_with_model(
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
         }
 
-        // resized during inference
-        if (params.logits_all) {
-            ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
-        } else {
-            ctx->logits.reserve(hparams.n_vocab);
-        }
+        // resized during inference, reserve maximum
+        ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
 
         if (params.embedding){
             ctx->embedding.resize(hparams.n_embd);
@@ -9731,8 +9727,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
     const size_t s_rng_size        = sizeof(size_t);
     const size_t s_rng             = LLAMA_MAX_RNG_STATE;
-    const size_t s_logits_capacity = sizeof(size_t);
     const size_t s_logits_size     = sizeof(size_t);
+    // assume worst case for logits although only currently set ones are serialized
     const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
     const size_t s_embedding_size  = sizeof(size_t);
     const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
@@ -9743,7 +9739,6 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_total = (
         + s_rng_size
         + s_rng
-        + s_logits_capacity
         + s_logits_size
         + s_logits
         + s_embedding_size
@@ -9812,37 +9807,27 @@ struct llama_data_file_context : llama_data_context {
 static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
-        std::stringstream rng_ss;
+        std::ostringstream rng_ss;
         rng_ss << ctx->rng;
 
-        const size_t rng_size = rng_ss.str().size();
-        char rng_buf[LLAMA_MAX_RNG_STATE];
+        const std::string & rng_str = rng_ss.str();
+        const size_t        rng_size = rng_str.size();
 
-        memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
-        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
 
-        data_ctx->write(&rng_size,   sizeof(rng_size));
-        data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
+        data_ctx->write(&rng_size,      sizeof(rng_size));
+        data_ctx->write(rng_str.data(), rng_size);
     }
 
     // copy logits
     {
-        const size_t logits_cap  = ctx->logits.capacity();
         const size_t logits_size = ctx->logits.size();
 
-        data_ctx->write(&logits_cap,  sizeof(logits_cap));
         data_ctx->write(&logits_size, sizeof(logits_size));
 
         if (logits_size) {
             data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
         }
-
-        // If there is a gap between the size and the capacity, write padding
-        size_t padding_size = (logits_cap - logits_size) * sizeof(float);
-        if (padding_size > 0) {
-            std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
-            data_ctx->write(padding.data(), padding_size);
-        }
     }
 
     // copy embeddings
@@ -9925,13 +9910,13 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     // set rng
     {
         size_t rng_size;
-        char   rng_buf[LLAMA_MAX_RNG_STATE];
+        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
 
-        memcpy(&rng_size,   inp, sizeof(rng_size));    inp += sizeof(rng_size);
-        memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
+        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
 
-        std::stringstream rng_ss;
-        rng_ss.str(std::string(&rng_buf[0], rng_size));
+        std::string rng_str((char *)inp, rng_size); inp += rng_size;
+
+        std::istringstream rng_ss(rng_str);
         rng_ss >> ctx->rng;
 
         GGML_ASSERT(!rng_ss.fail());
@@ -9939,20 +9924,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     // set logits
     {
-        size_t logits_cap;
         size_t logits_size;
 
-        memcpy(&logits_cap,  inp, sizeof(logits_cap));  inp += sizeof(logits_cap);
         memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
 
-        GGML_ASSERT(ctx->logits.capacity() == logits_cap);
+        GGML_ASSERT(ctx->logits.capacity() >= logits_size);
 
         if (logits_size) {
             ctx->logits.resize(logits_size);
-            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
-        }
 
-        inp += logits_cap * sizeof(float);
+            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
+            inp += logits_size * sizeof(float);
+        }
     }
 
     // set embeddings
diff --git a/llama.h b/llama.h
index 689e12d7c..01d6fafaa 100644
--- a/llama.h
+++ b/llama.h
@@ -43,7 +43,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 3
+#define LLAMA_SESSION_VERSION 4
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.

From 2d57de525541247132e354f561ff48775fba5d85 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 13 Jan 2024 18:46:37 +0200
Subject: [PATCH 015/138] metal : disable log for loaded kernels (#4794)

---
 ggml-metal.m | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 6c28a7ee3..57e444827 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -398,9 +398,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
             struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
             kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \
             kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \
-            GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
-                    (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
-                    (int) kernel->pipeline.threadExecutionWidth); \
             if (error) { \
                 GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
                 return NULL; \

From f172de03f11465dc6c5a0fc3a22f8ec254c6832c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 13 Jan 2024 18:47:38 +0200
Subject: [PATCH 016/138] llama : fix detokenization of non-special
 added-tokens (#4916)

Co-authored-by: goerch <jhr.walter@t-online.de>
---
 llama.cpp | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 275456088..2190ea7aa 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10305,6 +10305,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
+            // NOTE: we accept all unsupported token types,
+            // suppressing them like CONTROL tokens.
             if (llama_is_normal_token(model->vocab, token)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);
@@ -10313,6 +10315,13 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
+            } else if (llama_is_user_defined_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
             } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
                 if (length < 3) {
                     return -3;
@@ -10327,14 +10336,12 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 buf[0] = llama_token_to_byte(model->vocab, token);
                 return 1;
-            } else {
-                // TODO: for now we accept all unsupported token types,
-                // suppressing them like CONTROL tokens.
-                // GGML_ASSERT(false);
             }
             break;
         }
         case LLAMA_VOCAB_TYPE_BPE: {
+            // NOTE: we accept all unsupported token types,
+            // suppressing them like CONTROL tokens.
             if (llama_is_normal_token(model->vocab, token)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 result = llama_decode_text(result);
@@ -10343,12 +10350,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
+            } else if (llama_is_user_defined_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
             } else if (llama_is_control_token(model->vocab, token)) {
                 ;
-            } else {
-                // TODO: for now we accept all unsupported token types,
-                // suppressing them like CONTROL tokens.
-                // GGML_ASSERT(false);
             }
             break;
         }

From 0ea069b87bd296c556824e57455433b6c0357340 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 13 Jan 2024 19:31:26 +0200
Subject: [PATCH 017/138] server : fix prompt caching with system prompt
 (#4914)

---
 examples/server/server.cpp | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 79eacf828..93f999298 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1180,8 +1180,9 @@ struct llama_server_context
         return slot.images.size() > 0;
     }
 
-    void send_error(task_server& task, std::string error)
+    void send_error(task_server& task, const std::string &error)
     {
+        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
         std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = task.id;
@@ -1570,12 +1571,22 @@ struct llama_server_context
                         LOG_TEE("slot unavailable\n");
                         // send error result
                         send_error(task, "slot unavailable");
-                        return;
+                        break;
                     }
 
                     if (task.data.contains("system_prompt"))
                     {
+                        if (!all_slots_are_idle) {
+                            send_error(task, "system prompt can only be updated when all slots are idle");
+                            break;
+                        }
                         process_system_prompt_data(task.data["system_prompt"]);
+
+                        // reset cache_tokens for all slots
+                        for (llama_client_slot &slot : slots)
+                        {
+                            slot.cache_tokens.clear();
+                        }
                     }
 
                     slot->reset();
@@ -1652,8 +1663,7 @@ struct llama_server_context
         // attend tasks
         process_tasks();
 
-        // update the system prompt wait until all slots are idle state
-        if (system_need_update && all_slots_are_idle)
+        if (system_need_update)
         {
             LOG_TEE("updating system prompt\n");
             update_system_prompt();

From 4be5ef556de830c5c4f6e45c05ef4427823fe607 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 13 Jan 2024 20:45:45 +0200
Subject: [PATCH 018/138] metal : remove old API (#4919)

ggml-ci
---
 Makefile                      |   9 --
 examples/CMakeLists.txt       |   3 -
 examples/metal/CMakeLists.txt |   4 -
 examples/metal/metal.cpp      | 103 -------------
 ggml-metal.h                  |  55 +------
 ggml-metal.m                  | 276 +++-------------------------------
 llama.cpp                     |   4 +-
 7 files changed, 27 insertions(+), 427 deletions(-)
 delete mode 100644 examples/metal/CMakeLists.txt
 delete mode 100644 examples/metal/metal.cpp

diff --git a/Makefile b/Makefile
index 05fe9a0f6..995b89f7a 100644
--- a/Makefile
+++ b/Makefile
@@ -43,10 +43,6 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif
 
-ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
-BUILD_TARGETS += metal
-endif
-
 default: $(BUILD_TARGETS)
 
 test: $(TEST_TARGETS)
@@ -671,11 +667,6 @@ lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-ifdef LLAMA_METAL
-metal: examples/metal/metal.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-endif
-
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index fa127a3aa..f67d74c55 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -37,9 +37,6 @@ else()
     add_subdirectory(lookup)
     add_subdirectory(train-text-from-scratch)
     add_subdirectory(imatrix)
-    if (LLAMA_METAL)
-        add_subdirectory(metal)
-    endif()
     if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
     endif()
diff --git a/examples/metal/CMakeLists.txt b/examples/metal/CMakeLists.txt
deleted file mode 100644
index f16d49165..000000000
--- a/examples/metal/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-set(TEST_TARGET metal)
-add_executable(${TEST_TARGET} metal.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp
deleted file mode 100644
index 16c1146f9..000000000
--- a/examples/metal/metal.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-// Evaluate a statically exported ggml computation graph with Metal
-//
-// - First, export a LLaMA graph:
-//
-//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
-//
-// - Run this tool to evaluate the exported graph:
-//
-//  $ ./bin/metal llama.ggml
-//
-// The purpose of this tool is mostly for debugging and demonstration purposes.
-// The main limitation of exporting computation graphs is that their sizes are static which often
-// can be a problem for real-world applications.
-//
-
-#include "ggml.h"
-#include "ggml-metal.h"
-
-#include <cstdio>
-#include <cstring>
-#include <cstdlib>
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    if (argc != 2) {
-        fprintf(stderr, "Usage: %s llama.ggml\n", argv[0]);
-        return -1;
-    }
-
-    const char * fname_cgraph = argv[1];
-
-    // load the compute graph
-    struct ggml_context * ctx_data = NULL;
-    struct ggml_context * ctx_eval = NULL;
-
-    struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
-
-    // this allocates all Metal resources and memory buffers
-    auto * ctx_metal = ggml_metal_init(1);
-
-    const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
-    const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
-    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data);
-    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval);
-
-    // main
-    {
-        struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
-        *(int32_t *) input->data = 1; // BOS
-
-        ggml_metal_set_tensor(ctx_metal, input);
-
-        // warmup
-        ggml_metal_graph_compute(ctx_metal, gf);
-
-        const int n_iter = 16;
-
-        const int64_t t0 = ggml_time_us();
-
-        // the actual inference happens here
-        for (int i = 0; i < n_iter; ++i) {
-            ggml_metal_graph_compute(ctx_metal, gf);
-        }
-
-        const int64_t t1 = ggml_time_us();
-
-        printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
-    }
-
-    // debug output
-    {
-        struct ggml_tensor * logits = gf->nodes[gf->n_nodes - 1];
-        ggml_metal_get_tensor(ctx_metal, logits);
-
-        float * ptr = (float *) ggml_get_data(logits);
-
-        printf("logits: ");
-        for (int i = 0; i < 10; i++) {
-            printf("%8.4f ", ptr[i]);
-        }
-        printf("\n");
-        int imax = 0;
-        double sum = 0.0;
-        double vmax = -1e9;
-        for (int i = 0; i < 32000; i++) {
-            sum += (double) ptr[i];
-            if (ptr[i] > vmax) {
-                vmax = ptr[i];
-                imax = i;
-            }
-        }
-        printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
-    }
-
-    ggml_metal_free(ctx_metal);
-
-    ggml_free(ctx_data);
-    ggml_free(ctx_eval);
-
-    return 0;
-}
-
diff --git a/ggml-metal.h b/ggml-metal.h
index c4b7325da..cd5e2995f 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -36,64 +36,13 @@ struct ggml_cgraph;
 extern "C" {
 #endif
 
-//
-// internal API
-// temporary exposed to user-code
-//
-
-struct ggml_metal_context;
-
-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
-// number of command buffers to use
-struct ggml_metal_context * ggml_metal_init(int n_cb);
-void ggml_metal_free(struct ggml_metal_context * ctx);
-
-void * ggml_metal_host_malloc(size_t n);
-void   ggml_metal_host_free  (void * data);
-
-// set the number of command buffers to use
-void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
-
-// creates a mapping between a host memory buffer and a device memory buffer
-// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
-// - the mapping is used during computation to determine the arguments of the compute kernels
-// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
-// - max_size specifies the maximum size of a tensor and is used to create shared views such
-//   that it is guaranteed that the tensor will fit in at least one of the views
-//
-bool ggml_metal_add_buffer(
-        struct ggml_metal_context * ctx,
-                       const char * name,
-                             void * data,
-                           size_t   size,
-                           size_t   max_size);
-
-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// try to find operations that can be run concurrently in the graph
-// you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
-
-// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
-int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
-
-// output the concur_list for ggml_alloc
-int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
-
-// same as ggml_graph_compute but uses Metal
-// creates gf->n_threads command buffers in parallel
-bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
-
 //
 // backend API
 // user-code should use only these functions
 //
 
+GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
 GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
diff --git a/ggml-metal.m b/ggml-metal.m
index 57e444827..cae52c983 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -24,8 +24,6 @@
 
 #define UNUSED(x) (void)(x)
 
-#define GGML_MAX_CONCUR (2*GGML_DEFAULT_GRAPH_SIZE)
-
 #define GGML_METAL_MAX_KERNELS 256
 
 struct ggml_metal_buffer {
@@ -182,9 +180,6 @@ struct ggml_metal_context {
 
     struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
 
-    int concur_list[GGML_MAX_CONCUR];
-    int concur_list_len;
-
     bool support_simdgroup_reduction;
     bool support_simdgroup_mm;
 };
@@ -200,7 +195,6 @@ struct ggml_metal_context {
 @implementation GGMLMetalClass
 @end
 
-
 static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
     fprintf(stderr, "%s", msg);
 
@@ -211,11 +205,6 @@ static void ggml_metal_default_log_callback(enum ggml_log_level level, const cha
 ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
 void * ggml_metal_log_user_data = NULL;
 
-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
-    ggml_metal_log_callback  = log_callback;
-    ggml_metal_log_user_data = user_data;
-}
-
 GGML_ATTRIBUTE_FORMAT(2, 3)
 static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
     if (ggml_metal_log_callback != NULL) {
@@ -238,7 +227,18 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
     }
 }
 
-struct ggml_metal_context * ggml_metal_init(int n_cb) {
+static void * ggml_metal_host_malloc(size_t n) {
+    void * data = NULL;
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
+    if (result != 0) {
+        GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
+        return NULL;
+    }
+
+    return data;
+}
+
+static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
     id<MTLDevice> device;
@@ -264,7 +264,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
     ctx->queue  = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
-    ctx->concur_list_len = 0;
 
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
@@ -531,7 +530,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     return ctx;
 }
 
-void ggml_metal_free(struct ggml_metal_context * ctx) {
+static void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 
     for (int i = 0; i < ctx->n_buffers; ++i) {
@@ -557,33 +556,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     free(ctx);
 }
 
-void * ggml_metal_host_malloc(size_t n) {
-    void * data = NULL;
-    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
-    if (result != 0) {
-        GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
-        return NULL;
-    }
-
-    return data;
-}
-
-void ggml_metal_host_free(void * data) {
-    free(data);
-}
-
-void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
-    ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-}
-
-int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
-    return ctx->concur_list_len;
-}
-
-int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
-    return ctx->concur_list;
-}
-
 // temporarily defined here for compatibility between ggml-backend and the old API
 
 struct ggml_backend_metal_buffer {
@@ -656,209 +628,6 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
     return nil;
 }
 
-bool ggml_metal_add_buffer(
-        struct ggml_metal_context * ctx,
-                     const char * name,
-                           void * data,
-                         size_t   size,
-                         size_t   max_size) {
-    if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
-        GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__);
-        return false;
-    }
-
-    if (data) {
-        // verify that the buffer does not overlap with any of the existing buffers
-        for (int i = 0; i < ctx->n_buffers; ++i) {
-            const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
-
-            if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
-                GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
-                return false;
-            }
-        }
-
-        const size_t size_page = sysconf(_SC_PAGESIZE);
-
-        size_t size_aligned = size;
-        if ((size_aligned % size_page) != 0) {
-            size_aligned += (size_page - (size_aligned % size_page));
-        }
-
-        // the buffer fits into the max buffer size allowed by the device
-        if (size_aligned <= ctx->device.maxBufferLength) {
-            ctx->buffers[ctx->n_buffers].name = name;
-            ctx->buffers[ctx->n_buffers].data = data;
-            ctx->buffers[ctx->n_buffers].size = size;
-
-            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
-
-            if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
-                return false;
-            }
-
-            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0);
-
-            ++ctx->n_buffers;
-        } else {
-            // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
-            // one of the views
-            const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
-            const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
-            const size_t size_view = ctx->device.maxBufferLength;
-
-            for (size_t i = 0; i < size; i += size_step) {
-                const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
-
-                ctx->buffers[ctx->n_buffers].name = name;
-                ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
-                ctx->buffers[ctx->n_buffers].size = size_step_aligned;
-
-                ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
-
-                if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                    GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
-                    return false;
-                }
-
-                GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
-                if (i + size_step < size) {
-                    GGML_METAL_LOG_INFO("\n");
-                }
-
-                ++ctx->n_buffers;
-            }
-        }
-
-#if TARGET_OS_OSX
-        GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-                ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
-                ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-
-        if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
-            GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
-        } else {
-            GGML_METAL_LOG_INFO("\n");
-        }
-#else
-        GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
-#endif
-    }
-
-    return true;
-}
-
-void ggml_metal_set_tensor(
-        struct ggml_metal_context * ctx,
-        struct ggml_tensor * t) {
-    size_t offs;
-    id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
-
-    memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
-}
-
-void ggml_metal_get_tensor(
-        struct ggml_metal_context * ctx,
-        struct ggml_tensor * t) {
-    size_t offs;
-    id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
-
-    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
-}
-
-void ggml_metal_graph_find_concurrency(
-        struct ggml_metal_context * ctx,
-        struct ggml_cgraph * gf, bool check_mem) {
-    int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
-    int nodes_unused[GGML_MAX_CONCUR];
-
-    for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
-    for (int i = 0; i < gf->n_nodes;     i++) { nodes_unused[i]     = 1; }
-    ctx->concur_list_len = 0;
-
-    int n_left    = gf->n_nodes;
-    int n_start   = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
-    int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
-
-    while (n_left > 0) {
-        // number of nodes at a layer (that can be issued concurrently)
-        int concurrency = 0;
-        for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
-            if (nodes_unused[i]) {
-                // if the requirements for gf->nodes[i] are satisfied
-                int exe_flag = 1;
-
-                // scan all srcs
-                for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
-                    struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
-                    if (src_cur) {
-                        // if is leaf nodes it's satisfied.
-                        // TODO: ggml_is_leaf()
-                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
-                            continue;
-                        }
-
-                        // otherwise this src should be the output from previous nodes.
-                        int is_found = 0;
-
-                        // scan 2*search_depth back because we inserted barrier.
-                        //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
-                        for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
-                            if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
-                                is_found = 1;
-                                break;
-                            }
-                        }
-                        if (is_found == 0) {
-                            exe_flag = 0;
-                            break;
-                        }
-                    }
-                }
-                if (exe_flag && check_mem) {
-                    // check if nodes[i]'s data will be overwritten by a node before nodes[i].
-                    // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
-                    int64_t data_start = (int64_t) gf->nodes[i]->data;
-                    int64_t length     = (int64_t) ggml_nbytes(gf->nodes[i]);
-                    for (int j = n_start; j < i; j++) {
-                        if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
-                                            && gf->nodes[j]->op != GGML_OP_VIEW \
-                                            && gf->nodes[j]->op != GGML_OP_TRANSPOSE \
-                                            && gf->nodes[j]->op != GGML_OP_PERMUTE) {
-                            if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
-                                ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
-                                continue;
-                            }
-
-                            exe_flag = 0;
-                        }
-                    }
-                }
-                if (exe_flag) {
-                    ctx->concur_list[level_pos + concurrency] = i;
-                    nodes_unused[i] = 0;
-                    concurrency++;
-                    ctx->concur_list_len++;
-                }
-            }
-        }
-        n_left -= concurrency;
-        // adding a barrier different layer
-        ctx->concur_list[level_pos + concurrency] = -1;
-        ctx->concur_list_len++;
-        // jump all sorted nodes at nodes_bak
-        while (!nodes_unused[n_start]) {
-            n_start++;
-        }
-        level_pos += concurrency + 1;
-    }
-
-    if (ctx->concur_list_len > GGML_MAX_CONCUR) {
-        GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__);
-    }
-}
-
 static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_UNARY:
@@ -940,19 +709,15 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
     }
 }
 
-bool ggml_metal_graph_compute(
+static bool ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {
     @autoreleasepool {
 
-    // if there is ctx->concur_list, dispatch concurrently
-    // else fallback to serial dispatch
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
 
-    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
-
-    const int n_nodes  = has_concur ? ctx->concur_list_len      : gf->n_nodes;
-    edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
+    const int n_nodes  = gf->n_nodes;
+    edesc.dispatchType = MTLDispatchTypeSerial;
 
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel
@@ -983,7 +748,7 @@ bool ggml_metal_graph_compute(
             const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
 
             for (int ind = node_start; ind < node_end; ++ind) {
-                const int i = has_concur ? ctx->concur_list[ind] : ind;
+                const int i = ind;
 
                 if (i == -1) {
                     [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
@@ -2823,6 +2588,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .supports_op             = */ ggml_backend_metal_supports_op,
 };
 
+void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_metal_log_callback  = log_callback;
+    ggml_metal_log_user_data = user_data;
+}
+
 ggml_backend_t ggml_backend_metal_init(void) {
     struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
 
@@ -2849,7 +2619,7 @@ void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
 
     struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
 
-    ggml_metal_set_n_cb(ctx, n_cb);
+    ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
 }
 
 bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
diff --git a/llama.cpp b/llama.cpp
index 2190ea7aa..66494974a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1266,7 +1266,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 struct llama_state {
     llama_state() {
 #ifdef GGML_USE_METAL
-        ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+        ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
 #endif
     }
 
@@ -10470,7 +10470,7 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
-    ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }
 

From c71d608ce7a1584bf5072f197919dd24f3a6163f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sat, 13 Jan 2024 21:41:37 +0100
Subject: [PATCH 019/138] ggml: cache sin/cos for RoPE (#4908)

---
 ggml.c | 46 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/ggml.c b/ggml.c
index de6ef34bd..bcfb6652c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -11638,6 +11638,21 @@ static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, fl
     return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
 }
 
+static void ggml_rope_cache_init(
+     float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+     float * cache, float sin_sign, float theta_scale
+) {
+    float theta = theta_base;
+    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        rope_yarn(
+            theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+        );
+        cache[i0 + 1] *= sin_sign;
+
+        theta *= theta_scale;
+    }
+}
+
 void ggml_rope_yarn_corr_dims(
     int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
 ) {
@@ -11720,6 +11735,12 @@ static void ggml_compute_forward_rope_f32(
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             const int64_t p = pos[i2];
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
+                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
@@ -11753,18 +11774,13 @@ static void ggml_compute_forward_rope_f32(
                     }
                 } else if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        float cos_theta, sin_theta;
-                        rope_yarn(
-                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
-                        );
-                        sin_theta *= sin_sign;
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
 
                         // zeta scaling for xPos only:
                         float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
                         if (xpos_down) zeta = 1.0f / zeta;
 
-                        theta_base *= theta_scale;
-
                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                               float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
 
@@ -11888,6 +11904,12 @@ static void ggml_compute_forward_rope_f16(
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             const int64_t p = pos[i2];
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
+                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
@@ -11921,13 +11943,8 @@ static void ggml_compute_forward_rope_f16(
                     }
                 } else if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        float cos_theta, sin_theta;
-                        rope_yarn(
-                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
-                        );
-                        sin_theta *= sin_sign;
-
-                        theta_base *= theta_scale;
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
 
                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                               ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
@@ -16722,6 +16739,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     }
                 } break;
             case GGML_OP_SOFT_MAX:
+            case GGML_OP_ROPE:
                 {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 } break;

From 76484fbfd355df388f71d6edaa98e1692a74de7e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 14 Jan 2024 00:14:46 +0200
Subject: [PATCH 020/138] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index edcdb530a..753d227a7 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-400c07f00508e6f60fb25405444b5669c365b0a9
+1890780da4ea10db88736fcde85f285abf6c64b0

From 807179ec583dcb882f97d9704577c06beb2c5ec9 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Sun, 14 Jan 2024 09:44:30 +0200
Subject: [PATCH 021/138] Make Q3_K_S be the same as olf Q3_K_L for
 Mixtral-8x7B (#4906)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 llama.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 66494974a..8e20e72a2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8489,9 +8489,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
-            if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            if (qs.model.hparams.n_expert == 8) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                    new_type = GGML_TYPE_Q5_K;
+                }
+            } else {
+                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            }
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
         }

From 147b17ac94a24d524e367cda26a9ff6245689f34 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Sun, 14 Jan 2024 09:45:56 +0200
Subject: [PATCH 022/138] 2-bit quantizations (#4897)

* imatrix: load

* imatrix: WIP

* imatrix: Add Q2_K quantization

* imatrix: also guard against Q2_K_S quantization without importance matrix

* imatrix: guard even more against low-bit quantization misuse

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 examples/benchmark/benchmark-matmult.cpp |   4 +-
 examples/quantize/quantize.cpp           | 133 +++-
 ggml-quants.c                            | 950 +++++++++++++++++++++--
 ggml-quants.h                            |  12 +-
 ggml.c                                   |  36 +-
 ggml.h                                   |   9 +-
 llama.cpp                                |  84 +-
 llama.h                                  |   1 +
 tests/test-backend-ops.cpp               |   2 +-
 9 files changed, 1149 insertions(+), 82 deletions(-)

diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 434e1d6bd..e89f3de2f 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -194,7 +194,7 @@ int main(int argc, char ** argv)  {
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
     struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
 
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -207,7 +207,7 @@ int main(int argc, char ** argv)  {
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
     struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index f878f6911..f4e2175f1 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -5,6 +5,10 @@
 #include <cstring>
 #include <vector>
 #include <string>
+#include <unordered_map>
+#include <fstream>
+#include <cmath>
+#include <algorithm>
 
 struct quant_option {
     std::string name;
@@ -17,6 +21,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
     { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
     { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization",            },
+    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization",            },
     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
@@ -72,10 +78,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
+    printf("  --imatrixfile_name: use data in file_name as importance matrix for quant optimizations\n");
+    printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
         if (it.name != "COPY") {
@@ -83,11 +93,93 @@ static void usage(const char * executable) {
         } else {
             printf("          ");
         }
-        printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
+        printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
     }
     exit(1);
 }
 
+static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
+    if (!in) {
+        printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
+        return;
+    }
+    int n_entries;
+    in.read((char*)&n_entries, sizeof(n_entries));
+    if (in.fail() || n_entries < 1) {
+        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        return;
+    }
+    for (int i = 0; i < n_entries; ++i) {
+        int len; in.read((char *)&len, sizeof(len));
+        std::vector<char> name_as_vec(len+1);
+        in.read((char *)name_as_vec.data(), len);
+        if (in.fail()) {
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
+            return;
+        }
+        name_as_vec[len] = 0;
+        std::string name{name_as_vec.data()};
+        auto& e = imatrix_data[std::move(name)];
+        int ncall;
+        in.read((char*)&ncall, sizeof(ncall));
+        int nval;
+        in.read((char *)&nval, sizeof(nval));
+        if (in.fail() || nval < 1) {
+            printf("%s: failed reading number of values for entry %d\n",__func__,i);
+            imatrix_data = {};
+            return;
+        }
+        e.resize(nval);
+        in.read((char*)e.data(), nval*sizeof(float));
+        if (in.fail()) {
+            printf("%s: failed reading data for entry %d\n",__func__,i);
+            imatrix_data = {};
+            return;
+        }
+        if (ncall > 0) {
+            for (auto& v : e) v /= ncall;
+        }
+    }
+    printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
+}
+
+static void prepare_imatrix(const std::string& imatrix_file,
+        const std::vector<std::string>& included_weights,
+        const std::vector<std::string>& excluded_weights,
+        std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+    if (!imatrix_file.empty()) {
+        load_imatrix(imatrix_file, imatrix_data);
+    }
+    if (imatrix_data.empty()) {
+        return;
+    }
+    if (!excluded_weights.empty()) {
+        for (auto& name : excluded_weights) {
+            for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
+                auto pos = it->first.find(name);
+                if (pos != std::string::npos) it = imatrix_data.erase(it);
+                else ++it;
+            }
+        }
+    }
+    if (!included_weights.empty()) {
+        std::unordered_map<std::string, std::vector<float>> tmp;
+        for (auto& name : included_weights) {
+            for (auto& e : imatrix_data) {
+                auto pos = e.first.find(name);
+                if (pos != std::string::npos) {
+                    tmp.emplace(std::move(e));
+                }
+            }
+        }
+        imatrix_data = std::move(tmp);
+    }
+    if (!imatrix_data.empty()) {
+        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
+    }
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -96,6 +188,8 @@ int main(int argc, char ** argv) {
     llama_model_quantize_params params = llama_model_quantize_default_params();
 
     int arg_idx = 1;
+    std::string imatrix_file;
+    std::vector<std::string> included_weights, excluded_weights;
 
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -104,14 +198,42 @@ int main(int argc, char ** argv) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
             params.pure = true;
+        } else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
+            if (arg_idx < argc-1) {
+                imatrix_file = argv[++arg_idx];
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
+            if (arg_idx < argc-1) {
+                included_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
+            if (arg_idx < argc-1) {
+                excluded_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
         } else {
             usage(argv[0]);
         }
     }
 
     if (argc - arg_idx < 2) {
+        printf("%s: bad arguments\n", argv[0]);
         usage(argv[0]);
     }
+    if (!included_weights.empty() && !excluded_weights.empty()) {
+        usage(argv[0]);
+    }
+
+    std::unordered_map<std::string, std::vector<float>> imatrix_data;
+    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    if (!imatrix_data.empty()) {
+        params.imatrix = &imatrix_data;
+    }
 
     llama_backend_init(false);
 
@@ -163,6 +285,13 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
+        fprintf(stderr, "\n===============================================================================================\n");
+        fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "===============================================================================================\n\n\n");
+        return 1;
+    }
+
     print_build_info();
 
     fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
diff --git a/ggml-quants.c b/ggml-quants.c
index 601d155d7..9290d54cf 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -5,6 +5,8 @@
 #include <string.h>
 #include <assert.h>
 #include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
 
 #ifdef __ARM_NEON
 
@@ -1639,6 +1641,241 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n
     return (n/QK_K*sizeof(block_q2_K));
 }
 
+static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
+        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+        float rmin, float rdelta, int nstep, bool use_mad) {
+    float min = x[0];
+    float max = x[0];
+    float sum_w = weights ? weights[0] : x[0]*x[0];
+    float sum_x = sum_w * x[0];
+    for (int i = 1; i < n; ++i) {
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+        float w = weights ? weights[i] : x[i]*x[i];
+        sum_w += w;
+        sum_x += w * x[i];
+    }
+    if (min > 0) {
+        min = 0;
+    }
+    if (max <= min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = -min;
+        return 0.f;
+    }
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    float best_mad = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale*(x[i] - min));
+        L[i] = MAX(0, MIN(nmax, l));
+        float diff = scale * L[i] + min - x[i];
+        diff = use_mad ? fabsf(diff) : diff*diff;
+        float w = weights ? weights[i] : x[i]*x[i];
+        best_mad += w * diff;
+    }
+    if (nstep < 1) {
+        *the_min = -min;
+        return scale;
+    }
+    for (int is = 0; is <= nstep; ++is) {
+        iscale = (rmin + rdelta*is + nmax)/(max - min);
+        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            Laux[i] = l;
+            float w = weights ? weights[i] : x[i]*x[i];
+            sum_l  += w*l;
+            sum_l2 += w*l*l;
+            sum_xl += w*l*x[i];
+        }
+        float D = sum_w * sum_l2 - sum_l * sum_l;
+        if (D > 0) {
+            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
+            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
+            if (this_min > 0) {
+                this_min = 0;
+                this_scale = sum_xl / sum_l2;
+            }
+            float mad = 0;
+            for (int i = 0; i < n; ++i) {
+                float diff = this_scale * Laux[i] + this_min - x[i];
+                diff = use_mad ? fabsf(diff) : diff*diff;
+                float w = weights ? weights[i] : x[i]*x[i];
+                mad += w * diff;
+            }
+            if (mad < best_mad) {
+                for (int i = 0; i < n; ++i) {
+                    L[i] = Laux[i];
+                }
+                best_mad = mad;
+                scale = this_scale;
+                min = this_min;
+            }
+        }
+    }
+    *the_min = -min;
+    return scale;
+}
+
+static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
+    float max = 0;
+    for (int i = 0; i < n; ++i) {
+        max = MAX(max, x[i]);
+    }
+    if (!max) { // all zero
+        for (int i = 0; i < n; ++i) { L[i] = 0; }
+        return 0.f;
+    }
+    float iscale = nmax / max;
+    for (int i = 0; i < n; ++i) {
+        L[i] = nearest_int(iscale * x[i]);
+    }
+    float scale = 1/iscale;
+    float best_mse = 0;
+    for (int i = 0; i < n; ++i) {
+        float diff = x[i] - scale*L[i];
+        float w = quant_weights[i];
+        best_mse += w*diff*diff;
+    }
+    for (int is = -4; is <= 4; ++is) {
+        if (is == 0) continue;
+        float iscale_is = (0.1f*is + nmax)/max;
+        float scale_is = 1/iscale_is;
+        float mse = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale_is*x[i]);
+            l = MIN(nmax, l);
+            float diff = x[i] - scale_is*l;
+            float w = quant_weights[i];
+            mse += w*diff*diff;
+        }
+        if (mse < best_mse) {
+            best_mse = mse;
+            iscale = iscale_is;
+        }
+    }
+    float sumlx = 0;
+    float suml2 = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale * x[i]);
+        l = MIN(nmax, l);
+        L[i] = l;
+        float w = quant_weights[i];
+        sumlx += w*x[i]*l;
+        suml2 += w*l*l;
+    }
+    for (int itry = 0; itry < 5; ++itry) {
+        int n_changed = 0;
+        for (int i = 0; i < n; ++i) {
+            float w = quant_weights[i];
+            float slx = sumlx - w*x[i]*L[i];
+            float sl2 = suml2 - w*L[i]*L[i];
+            if (slx > 0 && sl2 > 0) {
+                int new_l = nearest_int(x[i] * sl2 / slx);
+                new_l = MIN(nmax, new_l);
+                if (new_l != L[i]) {
+                    slx += w*x[i]*new_l;
+                    sl2 += w*new_l*new_l;
+                    if (slx*slx*suml2 > sumlx*sumlx*sl2) {
+                        L[i] = new_l; sumlx = slx; suml2 = sl2;
+                        ++n_changed;
+                    }
+                }
+            }
+        }
+        if (!n_changed) {
+            break;
+        }
+    }
+    return sumlx / suml2;
+}
+
+static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
+    GGML_ASSERT(quant_weights);
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+    const bool requantize = true;
+
+    uint8_t L[QK_K];
+    uint8_t Laux[16];
+    float mins[QK_K/16];
+    float scales[QK_K/16];
+    float sw[QK_K/16];
+    float weight[QK_K/16];
+    uint8_t Ls[QK_K/16], Lm[QK_K/16];
+
+    for (int i = 0; i < nb; i++) {
+        memset(sw, 0, QK_K/16*sizeof(float));
+        float sumx2 = 0;
+        for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
+        float sigma2 = sumx2/QK_K;
+        for (int j = 0; j < QK_K/16; ++j) {
+            const float * restrict qw = quant_weights + QK_K * i + 16*j;
+            for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
+            for (int l = 0; l < 16; ++l) sw[j] += weight[l];
+            scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
+        }
+
+        float dm  = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
+        float mm  = make_qp_quants(QK_K/16, 15, mins,   Lm, sw);
+        y[i].d    = GGML_FP32_TO_FP16(dm);
+        y[i].dmin = GGML_FP32_TO_FP16(mm);
+        dm        = GGML_FP16_TO_FP32(y[i].d);
+        mm        = GGML_FP16_TO_FP32(y[i].dmin);
+
+        for (int j = 0; j < QK_K/16; ++j) {
+            y[i].scales[j] = Ls[j] | (Lm[j] << 4);
+        }
+
+        if (requantize) {
+            for (int j = 0; j < QK_K/16; ++j) {
+                const float d = dm * (y[i].scales[j] & 0xF);
+                if (!d) continue;
+                const float m = mm * (y[i].scales[j] >> 4);
+                for (int ii = 0; ii < 16; ++ii) {
+                    int l = nearest_int((x[16*j + ii] + m)/d);
+                    l = MAX(0, MIN(3, l));
+                    L[16*j + ii] = l;
+                }
+            }
+        }
+
+#if QK_K == 256
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
+            }
+        }
+#else
+        for (int l = 0; l < 16; ++l) {
+            y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
+        }
+#endif
+
+        x += QK_K;
+
+    }
+}
+
+size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    (void)hist;
+    int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
+    if (!quant_weights) {
+        quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
+    }
+    else {
+        char * qrow = (char *)dst;
+        for (int row = 0; row < nrow; ++row) {
+            quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
+            src += n_per_row;
+            qrow += row_size;
+        }
+    }
+    return nrow * row_size;
+}
+
 //========================= 3-bit (de)-quantization
 
 void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
@@ -2584,14 +2821,6 @@ static const uint8_t ksigns_iq2xs[128] = {
 
 static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
 
-void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
-    (void)x;
-    (void)y;
-    (void)k;
-    assert(k % QK_K == 0);
-    //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
-}
-
 void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
@@ -2618,33 +2847,8 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
     }
 }
 
-void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_iq2_xxs * restrict y = vy;
-    quantize_row_iq2_xxs_reference(x, y, k);
-}
-
-size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK_K == 0);
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
-        quantize_row_iq2_xxs_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_iq2_xxs));
-}
-
 // ====================== 2.3125 bpw (de)-quantization
 
-void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
-    (void)x;
-    (void)y;
-    (void)k;
-    assert(k % QK_K == 0);
-    //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
-}
-
 void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
@@ -2670,23 +2874,6 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
     }
 }
 
-void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_iq2_xs * restrict y = vy;
-    quantize_row_iq2_xs_reference(x, y, k);
-}
-
-size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK_K == 0);
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
-        quantize_row_iq2_xs_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_iq2_xs));
-}
-
 //===================================== Q8_K ==============================================
 
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@@ -7730,3 +7917,666 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
     *s = 0.125f * sumf;
 #endif
 }
+
+// ================================ IQ2 quantization =============================================
+
+typedef struct {
+    uint64_t * grid;
+    int      * map;
+    uint16_t * neighbours;
+} iq2_entry_t;
+
+static iq2_entry_t iq2_data[2] = {
+    {NULL, NULL, NULL},
+    {NULL, NULL, NULL},
+};
+
+static inline int iq2_data_index(int grid_size) {
+    GGML_ASSERT(grid_size == 256 || grid_size == 512);
+    return grid_size == 256 ? 0 : 1;
+}
+
+static int iq2_compare_func(const void * left, const void * right) {
+    const int * l = (const int *)left;
+    const int * r = (const int *)right;
+    return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
+}
+
+static void q2xs_init_impl(int grid_size) {
+    const int gindex = iq2_data_index(grid_size);
+    if (iq2_data[gindex].grid) {
+        return;
+    }
+    static const uint16_t kgrid_256[256] = {
+            0,     2,     5,     8,    10,    17,    20,    32,    34,    40,    42,    65,    68,    80,    88,    97,
+          100,   128,   130,   138,   162,   257,   260,   272,   277,   320,   388,   408,   512,   514,   546,   642,
+         1025,  1028,  1040,  1057,  1060,  1088,  1090,  1096,  1120,  1153,  1156,  1168,  1188,  1280,  1282,  1288,
+         1312,  1350,  1385,  1408,  1425,  1545,  1552,  1600,  1668,  1700,  2048,  2053,  2056,  2068,  2088,  2113,
+         2116,  2128,  2130,  2184,  2308,  2368,  2562,  2580,  4097,  4100,  4112,  4129,  4160,  4192,  4228,  4240,
+         4245,  4352,  4360,  4384,  4432,  4442,  4480,  4644,  4677,  5120,  5128,  5152,  5157,  5193,  5248,  5400,
+         5474,  5632,  5654,  6145,  6148,  6160,  6208,  6273,  6400,  6405,  6560,  6737,  8192,  8194,  8202,  8260,
+         8289,  8320,  8322,  8489,  8520,  8704,  8706,  9217,  9220,  9232,  9280,  9302,  9472,  9537,  9572,  9872,
+        10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
+        16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
+        17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
+        20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
+        22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
+        25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
+        33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
+        37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
+    };
+    static const uint16_t kgrid_512[512] = {
+            0,     2,     5,     8,    10,    17,    20,    22,    25,    32,    34,    37,    40,    65,    68,    70,
+           73,    80,    82,    85,    88,    97,   100,   128,   130,   133,   136,   145,   148,   153,   160,   257,
+          260,   262,   265,   272,   274,   277,   280,   282,   289,   292,   320,   322,   325,   328,   337,   340,
+          352,   360,   385,   388,   400,   512,   514,   517,   520,   529,   532,   544,   577,   580,   592,   597,
+          640,   650,  1025,  1028,  1030,  1033,  1040,  1042,  1045,  1048,  1057,  1060,  1088,  1090,  1093,  1096,
+         1105,  1108,  1110,  1120,  1153,  1156,  1168,  1280,  1282,  1285,  1288,  1297,  1300,  1312,  1345,  1348,
+         1360,  1377,  1408,  1537,  1540,  1552,  1574,  1600,  1602,  1668,  2048,  2050,  2053,  2056,  2058,  2065,
+         2068,  2080,  2085,  2113,  2116,  2128,  2136,  2176,  2208,  2218,  2305,  2308,  2320,  2368,  2433,  2441,
+         2560,  2592,  2600,  2710,  2720,  4097,  4100,  4102,  4105,  4112,  4114,  4117,  4120,  4129,  4132,  4160,
+         4162,  4165,  4168,  4177,  4180,  4192,  4202,  4225,  4228,  4240,  4352,  4354,  4357,  4360,  4369,  4372,
+         4384,  4417,  4420,  4432,  4480,  4500,  4502,  4609,  4612,  4614,  4624,  4672,  4704,  5120,  5122,  5125,
+         5128,  5137,  5140,  5152,  5185,  5188,  5193,  5200,  5220,  5248,  5377,  5380,  5392,  5440,  5632,  5652,
+         5705,  6145,  6148,  6160,  6162,  6208,  6228,  6278,  6400,  6405,  6502,  6737,  6825,  8192,  8194,  8197,
+         8200,  8202,  8209,  8212,  8224,  8257,  8260,  8272,  8320,  8352,  8449,  8452,  8464,  8512,  8520,  8549,
+         8704,  8738,  8832,  8872,  9217,  9220,  9232,  9257,  9280,  9472,  9537,  9554,  9625,  9729,  9754,  9894,
+        10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
+        16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
+        16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
+        16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
+        17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
+        18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
+        20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
+        21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
+        22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
+        24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
+        32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
+        33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
+        33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
+        35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
+        37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
+        40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
+        42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
+    };
+    const int kmap_size = 43692;
+    const int nwant = 2;
+    const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
+    uint64_t * kgrid_q2xs;
+    int      * kmap_q2xs;
+    uint16_t * kneighbors_q2xs;
+
+    printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
+    uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
+    for (int k = 0; k < grid_size; ++k) {
+        int8_t * pos = (int8_t *)(the_grid + k);
+        for (int i = 0; i < 8; ++i) {
+            int l = (kgrid[k] >> 2*i) & 0x3;
+            pos[i] = 2*l + 1;
+        }
+    }
+    kgrid_q2xs = the_grid;
+    iq2_data[gindex].grid = the_grid;
+    kmap_q2xs = (int *)malloc(kmap_size*sizeof(int));
+    iq2_data[gindex].map = kmap_q2xs;
+    for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1;
+    uint64_t aux64;
+    uint8_t * aux8 = (uint8_t *)&aux64;
+    for (int i = 0; i < grid_size; ++i) {
+        aux64 = kgrid_q2xs[i];
+        uint16_t index = 0;
+        for (int k=0; k<8; ++k) {
+            uint16_t q = (aux8[k] - 1)/2;
+            index |= (q << 2*k);
+        }
+        kmap_q2xs[index] = i;
+    }
+    int8_t pos[8];
+    int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+    int num_neighbors = 0, num_not_in_map = 0;
+    for (int i = 0; i < kmap_size; ++i) {
+        if (kmap_q2xs[i] >= 0) continue;
+        ++num_not_in_map;
+        for (int k = 0; k < 8; ++k) {
+            int l = (i >> 2*k) & 0x3;
+            pos[k] = 2*l + 1;
+        }
+        for (int j = 0; j < grid_size; ++j) {
+            const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
+            int d2 = 0;
+            for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+            dist2[2*j+0] = d2;
+            dist2[2*j+1] = j;
+        }
+        qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
+        int n = 0; int d2 = dist2[0];
+        int nhave = 1;
+        for (int j = 0; j < grid_size; ++j) {
+            if (dist2[2*j] > d2) {
+                if (nhave == nwant) break;
+                d2 = dist2[2*j];
+                ++nhave;
+            }
+            ++n;
+        }
+        num_neighbors += n;
+    }
+    printf("%s: %d neighbours in total\n", __func__, num_neighbors);
+    kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
+    iq2_data[gindex].neighbours = kneighbors_q2xs;
+    int counter = 0;
+    for (int i = 0; i < kmap_size; ++i) {
+        if (kmap_q2xs[i] >= 0) continue;
+        for (int k = 0; k < 8; ++k) {
+            int l = (i >> 2*k) & 0x3;
+            pos[k] = 2*l + 1;
+        }
+        for (int j = 0; j < grid_size; ++j) {
+            const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
+            int d2 = 0;
+            for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+            dist2[2*j+0] = d2;
+            dist2[2*j+1] = j;
+        }
+        qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
+        kmap_q2xs[i] = -(counter + 1);
+        int d2 = dist2[0];
+        uint16_t * start = &kneighbors_q2xs[counter++];
+        int n = 0, nhave = 1;
+        for (int j = 0; j < grid_size; ++j) {
+            if (dist2[2*j] > d2) {
+                if (nhave == nwant) break;
+                d2 = dist2[2*j];
+                ++nhave;
+            }
+            kneighbors_q2xs[counter++] = dist2[2*j+1];
+            ++n;
+        }
+        *start = n;
+    }
+    free(dist2);
+}
+
+void ggml_init_iq2_quantization(enum ggml_type type) {
+    if (type == GGML_TYPE_IQ2_XXS) {
+        q2xs_init_impl(256);
+    }
+    else if (type == GGML_TYPE_IQ2_XS) {
+        q2xs_init_impl(512);
+    }
+    else {
+        fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
+    }
+}
+
+static void q2xs_deinit_impl(int grid_size) {
+    GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
+    const int gindex = iq2_data_index(grid_size);
+    if (iq2_data[gindex].grid) {
+        free(iq2_data[gindex].grid);       iq2_data[gindex].grid = NULL;
+        free(iq2_data[gindex].map);        iq2_data[gindex].map  = NULL;
+        free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
+    }
+}
+
+void ggml_deinit_iq2_quantization(enum ggml_type type) {
+    if (type == GGML_TYPE_IQ2_XXS) {
+        q2xs_deinit_impl(256);
+    }
+    else if (type == GGML_TYPE_IQ2_XS) {
+        q2xs_deinit_impl(512);
+    }
+    else {
+        fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
+    }
+}
+
+static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
+        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
+    int num_neighbors = neighbours[0];
+    GGML_ASSERT(num_neighbors > 0);
+    float best_d2 = FLT_MAX;
+    int grid_index = -1;
+    for (int j = 1; j <= num_neighbors; ++j) {
+        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
+        float d2 = 0;
+        for (int i = 0; i < 8; ++i) {
+            float q = pg[i];
+            float diff = scale*q - xval[i];
+            d2 += weight[i]*diff*diff;
+        }
+        if (d2 < best_d2) {
+            best_d2 = d2; grid_index = neighbours[j];
+        }
+    }
+    GGML_ASSERT(grid_index >= 0);
+    const int8_t * pg = (const int8_t *)(grid + grid_index);
+    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
+    return grid_index;
+}
+
+static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+
+    const int gindex = iq2_data_index(256);
+
+    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
+    const int      * kmap_q2xs       = iq2_data[gindex].map;
+    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+    GGML_ASSERT(quant_weights);
+    GGML_ASSERT(kgrid_q2xs);
+    GGML_ASSERT(kmap_q2xs);
+    GGML_ASSERT(kneighbors_q2xs);
+    GGML_ASSERT(n%QK_K == 0);
+
+    const int kMaxQ = 3;
+
+    const int nbl = n/256;
+
+    block_iq2_xxs * y = vy;
+
+    float scales[QK_K/32];
+    float weight[32];
+    float xval[32];
+    int8_t L[32];
+    int8_t Laux[32];
+    float  waux[32];
+    bool   is_on_grid[4];
+    bool   is_on_grid_aux[4];
+    uint8_t block_signs[4];
+    uint32_t q2[2*(QK_K/32)];
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+        memset(q2, 0, QK_K/4);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const float * xb = xbl + 32*ib;
+            const float * qw = quant_weights + QK_K*ibl + 32*ib;
+            for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
+            for (int k = 0; k < 4; ++k) {
+                int nflip = 0;
+                uint8_t s = 0;
+                for (int i = 0; i < 8; ++i) {
+                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
+                    else {
+                        xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
+                    }
+                }
+                if (nflip%2) {
+                    int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
+                    for (int i = 1; i < 8; ++i) {
+                        float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
+                        if (ax < min) {
+                            min = ax; imin = i;
+                        }
+                    }
+                    xval[8*k+imin] = -xval[8*k+imin];
+                    s ^= (1 << imin);
+                }
+                block_signs[k] = s & 127;
+            }
+            float max = xval[0];
+            for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
+            if (!max) {
+                scales[ib] = 0;
+                memset(L, 0, 32);
+                continue;
+            }
+            float best = 0;
+            float scale = max/(2*kMaxQ-1);
+            for (int is = -9; is <= 9; ++is) {
+                float id = (2*kMaxQ-1+is*0.1f)/max;
+                float this_scale = 1/id;
+                for (int k = 0; k < 4; ++k) {
+                    for (int i = 0; i < 8; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
+                        Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
+                    }
+                    uint16_t u = 0;
+                    for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
+                    int grid_index = kmap_q2xs[u];
+                    is_on_grid_aux[k] = true;
+                    if (grid_index < 0) {
+                        is_on_grid_aux[k] = false;
+                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
+                    }
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 32; ++i) {
+                    float w = weight[i];
+                    float q = 2*Laux[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                    scale = sumqx/sumq2; best = scale*sumqx;
+                    for (int i = 0; i < 32; ++i) L[i] = Laux[i];
+                    for (int k = 0; k <  4; ++k) is_on_grid[k] = is_on_grid_aux[k];
+                }
+            }
+            int n_not_ongrid = 0;
+            for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
+            if (n_not_ongrid > 0 && scale > 0) {
+                float id = 1/scale;
+                for (int k = 0; k < 4; ++k) {
+                    if (is_on_grid[k]) continue;
+                    uint16_t u = 0;
+                    for (int i = 0; i < 8; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
+                        l = MAX(0, MIN(kMaxQ-1, l));
+                        u |= (l << 2*i);
+                    }
+                    int grid_index = kmap_q2xs[u];
+                    if (grid_index < 0) {
+                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
+                    }
+                    const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
+                    for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 32; ++i) {
+                    float w = weight[i];
+                    float q = 2*L[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0) scale = sumqx/sumq2;
+            }
+            if (scale < 0) {
+                // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
+                // and correspondingly flip quant signs.
+                scale = -scale;
+                for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
+            }
+            for (int k = 0; k < 4; ++k) {
+                uint16_t u = 0;
+                for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
+                int grid_index = kmap_q2xs[u];
+                if (grid_index < 0) {
+                    printf("Oops: found point %u not on grid:", u);
+                    for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
+                    printf("\n");
+                    GGML_ASSERT(false);
+                }
+                q2[2*ib+0] |= (grid_index << 8*k);
+                q2[2*ib+1] |= (block_signs[k] << 7*k);
+            }
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            memset(y[ibl].qs, 0, QK_K/4);
+            continue;
+        }
+
+        float d = max_scale/31;
+        y[ibl].d = GGML_FP32_TO_FP16(d);
+        float id = 1/d;
+        float sumqx = 0, sumq2 = 0;
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib]-1));
+            l = MAX(0, MIN(15, l));
+            q2[2*ib+1] |= ((uint32_t)l << 28);
+            const float * xb = xbl + 32*ib;
+            const float * qw = quant_weights + QK_K*ibl + 32*ib;
+            for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib);
+            const float db = d * (1 + 2*l);
+            uint32_t u = 0;
+            for (int k = 0; k < 4; ++k) {
+                const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127);
+                const float * xk = xb + 8*k;
+                const float * wk = weight + 8*k;
+                const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
+                float best_mse = 0; int best_index = aux8[k];
+                for (int j = 0; j < 8; ++j) {
+                    float diff = db * grid[j] * signs[j] - xk[j];
+                    best_mse += wk[j] * diff * diff;
+                }
+                for (int idx = 0; idx < 256; ++idx) {
+                    grid = (const uint8_t *)(kgrid_q2xs + idx);
+                    float mse = 0;
+                    for (int j = 0; j < 8; ++j) {
+                        float diff = db * grid[j] * signs[j] - xk[j];
+                        mse += wk[j] * diff * diff;
+                    }
+                    if (mse < best_mse) {
+                        best_mse = mse; best_index = idx;
+                    }
+                }
+                u |= (best_index << 8*k);
+                grid = (const uint8_t *)(kgrid_q2xs + best_index);
+                //grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
+                for (int j = 0; j < 8; ++j) {
+                    float q = db * grid[j] * signs[j];
+                    sumqx += wk[j] * q * xk[j];
+                    sumq2 += wk[j] * q * q;
+                }
+            }
+            q2[2*ib] = u;
+            if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
+        }
+        memcpy(y[ibl].qs, q2, QK_K/4);
+    }
+}
+
+static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+
+    const int gindex = iq2_data_index(512);
+
+    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
+    const int      * kmap_q2xs       = iq2_data[gindex].map;
+    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+    GGML_ASSERT(quant_weights);
+    GGML_ASSERT(kmap_q2xs);
+    GGML_ASSERT(kgrid_q2xs);
+    GGML_ASSERT(kneighbors_q2xs);
+    GGML_ASSERT(n%QK_K == 0);
+
+    const int kMaxQ = 3;
+
+    const int nbl = n/256;
+
+    block_iq2_xs * y = vy;
+
+    float scales[QK_K/16];
+    float weight[16];
+    float xval[16];
+    int8_t L[16];
+    int8_t Laux[16];
+    float  waux[16];
+    bool   is_on_grid[2];
+    bool   is_on_grid_aux[2];
+    uint8_t block_signs[2];
+    uint16_t q2[2*(QK_K/16)];
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+        memset(q2, 0, QK_K/4);
+        memset(y[ibl].scales, 0, QK_K/32);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+            const float * xb = xbl + 16*ib;
+            const float * qw = quant_weights + QK_K*ibl + 16*ib;
+            for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
+            for (int k = 0; k < 2; ++k) {
+                int nflip = 0;
+                uint8_t s = 0;
+                for (int i = 0; i < 8; ++i) {
+                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
+                    else {
+                        xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
+                    }
+                }
+                if (nflip%2) {
+                    int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
+                    for (int i = 1; i < 8; ++i) {
+                        float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
+                        if (ax < min) {
+                            min = ax; imin = i;
+                        }
+                    }
+                    xval[8*k+imin] = -xval[8*k+imin];
+                    s ^= (1 << imin);
+                }
+                block_signs[k] = s & 127;
+            }
+            float max = xval[0];
+            for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
+            if (!max) {
+                scales[ib] = 0;
+                memset(L, 0, 16);
+                continue;
+            }
+            float best = 0;
+            float scale = max/(2*kMaxQ-1);
+            is_on_grid[0] = is_on_grid[1] = true;
+            for (int is = -9; is <= 9; ++is) {
+                float id = (2*kMaxQ-1+is*0.1f)/max;
+                float this_scale = 1/id;
+                for (int k = 0; k < 2; ++k) {
+                    for (int i = 0; i < 8; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
+                        Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
+                    }
+                    uint16_t u = 0;
+                    for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
+                    int grid_index = kmap_q2xs[u];
+                    is_on_grid_aux[k] = true;
+                    if (grid_index < 0) {
+                        is_on_grid_aux[k] = false;
+                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
+                    }
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 16; ++i) {
+                    float w = weight[i];
+                    float q = 2*Laux[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                    scale = sumqx/sumq2; best = scale*sumqx;
+                    for (int i = 0; i < 16; ++i) L[i] = Laux[i];
+                    for (int k = 0; k <  2; ++k) is_on_grid[k] = is_on_grid_aux[k];
+                }
+            }
+            int n_not_ongrid = 0;
+            for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
+            if (n_not_ongrid > 0 && scale > 0) {
+                float id = 1/scale;
+                for (int k = 0; k < 2; ++k) {
+                    if (is_on_grid[k]) continue;
+                    uint16_t u = 0;
+                    for (int i = 0; i < 8; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
+                        l = MAX(0, MIN(kMaxQ-1, l));
+                        u |= (l << 2*i);
+                        L[8*k + i] = l;
+                    }
+                    int grid_index = kmap_q2xs[u];
+                    if (grid_index < 0) {
+                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
+                    }
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 16; ++i) {
+                    float w = weight[i];
+                    float q = 2*L[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0) scale = sumqx/sumq2;
+            }
+            if (scale < 0) {
+                scale = -scale;
+                for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
+            }
+            for (int k = 0; k < 2; ++k) {
+                uint16_t u = 0;
+                for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
+                int grid_index = kmap_q2xs[u];
+                if (grid_index < 0) {
+                    printf("Oops: found point %u not on grid:", u);
+                    for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
+                    printf("\n");
+                    GGML_ASSERT(false);
+                }
+                q2[2*ib+k] = grid_index | (block_signs[k] << 9);
+            }
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            memset(y[ibl].qs, 0, QK_K/4);
+            continue;
+        }
+
+        float d = max_scale/31;
+        y[ibl].d = GGML_FP32_TO_FP16(d);
+        float id = 1/d;
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib]-1));
+            l = MAX(0, MIN(15, l));
+            if (ib%2 == 0) y[ibl].scales[ib/2] = l;
+            else y[ibl].scales[ib/2] |= (l << 4);
+        }
+        memcpy(y[ibl].qs, q2, QK_K/4);
+
+    }
+}
+
+size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    (void)hist;
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    int nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq2_xxs);
+    }
+    return nrow * nblock * sizeof(block_iq2_xxs);
+}
+
+size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    (void)hist;
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    int nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq2_xs);
+    }
+    return nrow * nblock * sizeof(block_iq2_xs);
+}
+
diff --git a/ggml-quants.h b/ggml-quants.h
index df5e7ae80..e5d110230 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -196,8 +196,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
 void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
-void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
-void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs  * restrict y, int k);
 
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
 void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -212,8 +210,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
-void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
 
 // Dequantization
 void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -246,3 +242,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx,
 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+
+//
+// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+//
+size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q2_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+
diff --git a/ggml.c b/ggml.c
index bcfb6652c..52467475a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq2_xxs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
-        .from_float               = quantize_row_iq2_xxs,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
         .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
@@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq2_xs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
-        .from_float               = quantize_row_iq2_xs,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xs_reference,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
         .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
@@ -18665,8 +18665,11 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK8_0*sizeof(block_q8_0));
 }
 
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
+        int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
+    (void)imatrix;
     size_t result = 0;
+    int n = nrows * n_per_row;
     switch (type) {
         case GGML_TYPE_Q4_0:
             {
@@ -18701,8 +18704,11 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
         case GGML_TYPE_Q2_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q2_K * block = (block_q2_K*)dst + start / QK_K;
-                result = ggml_quantize_q2_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q3_K:
             {
@@ -18731,14 +18737,22 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
         case GGML_TYPE_IQ2_XXS:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
-                result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                GGML_ASSERT(imatrix);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_IQ2_XS:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
-                result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                GGML_ASSERT(imatrix);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_F16:
             {
diff --git a/ggml.h b/ggml.h
index b18ba7812..1187074f7 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2067,10 +2067,13 @@ extern "C" {
     GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);
 
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
+            int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+
+    // These are needed for IQ2_XS and IQ2_XXS quantizations
+    GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
+    GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
 
     //
     // Importance matrix
diff --git a/llama.cpp b/llama.cpp
index 8e20e72a2..107b05114 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8429,9 +8429,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+            new_type = GGML_TYPE_Q5_K;
+        }
         else if (new_type != GGML_TYPE_Q8_0) {
             new_type = GGML_TYPE_Q6_K;
         }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        if (name.find("attn_v.weight") != std::string::npos) {
+            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
+            else new_type = GGML_TYPE_Q2_K;
+            ++qs.i_attention_wv;
+        }
+        else if (name.find("ffn_down") != std::string::npos) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
+            ++qs.i_feed_forward_w2;
+        }
+        else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8601,6 +8615,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     if (params->only_copy) {
         ftype = model.ftype;
     }
+    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
+    if (params->imatrix) {
+        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
+        if (imatrix_data) {
+            printf("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+        }
+    }
 
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
@@ -8658,6 +8679,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
+    std::set<ggml_type> used_iq2;
+
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * tensor = ml.get_tensor_meta(i);
 
@@ -8710,6 +8733,35 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             const size_t nelements = ggml_nelements(tensor);
 
+            if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
+                ggml_init_iq2_quantization(new_type);
+                used_iq2.insert(new_type);
+            }
+
+            const float * imatrix = nullptr;
+            if (imatrix_data) {
+                auto it = imatrix_data->find(tensor->name);
+                if (it == imatrix_data->end()) {
+                    printf("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
+                } else {
+                    if (it->second.size() == (size_t)tensor->ne[0]) {
+                        imatrix = it->second.data();
+                    } else {
+                        printf("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
+                                int(it->second.size()), int(tensor->ne[0]), tensor->name);
+                    }
+                }
+            }
+            if ((new_type == GGML_TYPE_IQ2_XXS ||
+                 new_type == GGML_TYPE_IQ2_XS  ||
+                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
+                fprintf(stderr, "\n\n============================================================\n");
+                fprintf(stderr, "Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
+                fprintf(stderr, "The result will be garbage, so bailing out\n");
+                fprintf(stderr, "============================================================\n\n");
+                throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
+            }
+
             float * f32_data;
 
             if (tensor->type == GGML_TYPE_F32) {
@@ -8730,21 +8782,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.data();
             std::array<int64_t, 1 << 4> hist_cur = {};
 
-            static const int chunk_size = 32 * 512;
+            const int n_per_row = tensor->ne[0];
+            const int nrows = nelements / n_per_row;
+
+            static const int min_chunk_size = 32 * 512;
+            const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
+
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
             const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
             if (nthread_use < 2) {
-                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
             } else {
-                size_t counter = 0;
+                int counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
+                     nrows, n_per_row, imatrix]() {
                     std::array<int64_t, 1 << 4> local_hist = {};
+                    const int nrows_per_chunk = chunk_size / n_per_row;
                     size_t local_size = 0;
                     while (true) {
                         std::unique_lock<std::mutex> lock(mutex);
-                        size_t first = counter; counter += chunk_size;
-                        if (first >= nelements) {
+                        int first_row = counter; counter += nrows_per_chunk;
+                        if (first_row >= nrows) {
                             if (local_size > 0) {
                                 for (int j=0; j<int(local_hist.size()); ++j) {
                                     hist_cur[j] += local_hist[j];
@@ -8754,8 +8813,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                             break;
                         }
                         lock.unlock();
-                        size_t last = std::min(nelements, first + chunk_size);
-                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                        const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
+                                first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
                     }
                 };
                 for (int it = 0; it < nthread_use - 1; ++it) {
@@ -8766,7 +8826,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 workers.clear();
             }
 
-            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -8774,6 +8834,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 
             if (tot_count > 0) {
+                LLAMA_LOG_INFO(" | hist: ");
                 for (size_t i = 0; i < hist_cur.size(); i++) {
                     LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
                 }
@@ -8802,6 +8863,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     fout.close();
 
+    for (auto type : used_iq2) {
+        ggml_deinit_iq2_quantization(type);
+    }
+
     gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9166,6 +9231,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.quantize_output_tensor      =*/ true,
         /*.only_copy                   =*/ false,
         /*.pure                        =*/ false,
+        /*.imatrix                     =*/ nullptr,
     };
 
     return result;
diff --git a/llama.h b/llama.h
index 01d6fafaa..79c8335b6 100644
--- a/llama.h
+++ b/llama.h
@@ -249,6 +249,7 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+        void * imatrix;              // pointer to importance matrix data
     } llama_model_quantize_params;
 
     // grammar types
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index d9b8b106a..22a7856d4 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -56,7 +56,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
         GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
         std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
         int64_t hist[16];
-        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist);
+        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, nullptr);
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.

From ac32902a87147f78d63c931aa8a23dee762660e7 Mon Sep 17 00:00:00 2001
From: Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
Date: Sun, 14 Jan 2024 00:41:44 -0800
Subject: [PATCH 023/138] llama : support WinXP build with MinGW 8.1.0 (#3419)

---
 CMakeLists.txt | 8 ++++++--
 llama.cpp      | 4 ++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 668669c6d..2741568ed 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.13)  # for add_link_options
+cmake_minimum_required(VERSION 3.14)  # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -76,6 +76,10 @@ if (NOT MSVC)
     option(LLAMA_F16C                        "llama: enable F16C"                               ${INS_ENB})
 endif()
 
+if (WIN32)
+    option(LLAMA_WIN_VER                     "llama: Windows Version"                           0x602)
+endif()
+
 # 3rd party libs
 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
 option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
@@ -686,7 +690,7 @@ endif()
 
 if (MINGW)
     # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=0x602)
+    add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
 endif()
 
 #
diff --git a/llama.cpp b/llama.cpp
index 107b05114..51e9bdaed 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -987,6 +987,7 @@ struct llama_mmap {
         }
 
         if (prefetch > 0) {
+#if _WIN32_WINNT >= 0x602
             // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
             BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
             HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
@@ -1004,6 +1005,9 @@ struct llama_mmap {
                             llama_format_win_err(GetLastError()).c_str());
                 }
             }
+#else
+            throw std::runtime_error("PrefetchVirtualMemory unavailable");
+#endif
         }
     }
 

From 5f5fe1bd608fa2ed42af97b5f2ea31be6625fc48 Mon Sep 17 00:00:00 2001
From: Alex Azarov <alex@azarov.by>
Date: Sun, 14 Jan 2024 09:44:39 +0100
Subject: [PATCH 024/138] metal : correctly set SIMD support flags on iOS
 (#4923)

* Correctly set support_simdgroup_reduction and support_simdgroup_mm on iPhone/iPad

* log a little bit more info on iOS
---
 ggml-metal.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index cae52c983..2ca726055 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -330,7 +330,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         }
     }
 
-#if TARGET_OS_OSX
     // print MTL GPU family:
     GGML_METAL_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
 
@@ -370,6 +369,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     GGML_METAL_LOG_INFO("%s: simdgroup reduction support   = %s\n",       __func__, ctx->support_simdgroup_reduction ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n",       __func__, ctx->support_simdgroup_mm ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+#if TARGET_OS_OSX
     GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
     if (ctx->device.maxTransferRate != 0) {
         GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);

From a128c38de862431f1aae9ccc40b792fbc1b8b682 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Sun, 14 Jan 2024 10:53:39 +0200
Subject: [PATCH 025/138] Fix ffn_down quantization mix for MoE models (#4927)

* Fix ffn_down quantization mix for MoE models

In #4872 I did not consider the part where every third
tensor is quantized with more bits. Fir MoE this leads to tensors
of the same layer being quantized with different number of bits,
which is not considered as a possibility in the inference implementation
(it is assumed all experts use the same quantization).

* Fix the fix

* Review suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 llama.cpp | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 51e9bdaed..b1d6015e2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8480,13 +8480,31 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = GGML_TYPE_Q8_0;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
+        const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+        int i_layer, n_layer;
+        if (n_expert == 1) {
+            i_layer = qs.i_feed_forward_w2;
+            n_layer = qs.n_feed_forward_w2;
+        } else {
+            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
+            // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+            // for getting the current layer as I initially thought, and we need to resort to parsing the
+            // tensor name.
+            n_layer = qs.n_feed_forward_w2 / n_expert;
+            if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
+                throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
+            }
+            if (i_layer < 0 || i_layer >= n_layer) {
+                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
+            }
+        }
         if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
-                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -8494,14 +8512,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
-                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;

From 03c526749041c863b0cd842b26b8907e1ea0e0b1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 14 Jan 2024 11:03:19 +0200
Subject: [PATCH 026/138] llama : use LLAMA_LOG_ macros for logging

---
 llama.cpp | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index b1d6015e2..51821965e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1114,7 +1114,7 @@ struct llama_mlock {
             suggest = false;
         }
 
-        fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+        LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
                 size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
         return false;
     }
@@ -1123,7 +1123,7 @@ struct llama_mlock {
 
     static void raw_unlock(void * addr, size_t size) {
         if (munlock(addr, size)) {
-            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+            LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
         }
     }
 #elif defined(_WIN32)
@@ -1141,7 +1141,7 @@ struct llama_mlock {
                 return true;
             }
             if (tries == 2) {
-                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+                LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
                     len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
@@ -1150,7 +1150,7 @@ struct llama_mlock {
             // set size and try again.
             SIZE_T min_ws_size, max_ws_size;
             if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+                LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
@@ -1163,7 +1163,7 @@ struct llama_mlock {
             min_ws_size += increment;
             max_ws_size += increment;
             if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+                LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
@@ -1172,7 +1172,7 @@ struct llama_mlock {
 
     static void raw_unlock(void * ptr, size_t len) {
         if (!VirtualUnlock(ptr, len)) {
-            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+            LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
     }
@@ -1184,7 +1184,7 @@ struct llama_mlock {
     }
 
     bool raw_lock(const void * addr, size_t len) const {
-        fprintf(stderr, "warning: mlock not supported on this system\n");
+        LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
         return false;
     }
 
@@ -2085,13 +2085,13 @@ namespace GGUFMeta {
                     __func__, override_type_to_str(override->tag), override->key);
                 switch (override->tag) {
                     case LLAMA_KV_OVERRIDE_BOOL:  {
-                        printf("%s\n", override->bool_value ? "true" : "false");
+                        LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
                     } break;
                     case LLAMA_KV_OVERRIDE_INT:   {
-                        printf("%" PRId64 "\n", override->int_value);
+                        LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
                     } break;
                     case LLAMA_KV_OVERRIDE_FLOAT: {
-                        printf("%.6f\n", override->float_value);
+                        LLAMA_LOG_INFO("%.6f\n", override->float_value);
                     } break;
                     default:
                         // Shouldn't be possible to end up here, but just in case...
@@ -6993,7 +6993,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                     if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
 
 #ifdef PRETOKENIZERDEBUG
-                    fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+                    LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
                     auto source = std::distance(buffer.begin(), it);
 
@@ -7006,7 +7006,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                         buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
 
 #ifdef PRETOKENIZERDEBUG
-                        fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
+                        LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
                         it++;
                     }
@@ -7022,7 +7022,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                         buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
 
 #ifdef PRETOKENIZERDEBUG
-                        fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
+                        LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
 #endif
 
                         it++;
@@ -7038,7 +7038,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                         raw_text_base_length = right_reminder_length;
 
 #ifdef PRETOKENIZERDEBUG
-                        fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+                        LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
                     } else {
                         if (source == 0) {
@@ -7095,7 +7095,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         }
 
 #ifdef PRETOKENIZERDEBUG
-                        fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN(TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                         llm_tokenizer_spm tokenizer(vocab);
                         llama_escape_whitespace(raw_text);
@@ -7116,7 +7116,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-                        fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN(TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                         llm_tokenizer_bpe tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
@@ -8641,7 +8641,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     if (params->imatrix) {
         imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
         if (imatrix_data) {
-            printf("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
         }
     }
 
@@ -8764,12 +8764,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (imatrix_data) {
                 auto it = imatrix_data->find(tensor->name);
                 if (it == imatrix_data->end()) {
-                    printf("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
+                    LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
                 } else {
                     if (it->second.size() == (size_t)tensor->ne[0]) {
                         imatrix = it->second.data();
                     } else {
-                        printf("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
+                        LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
                                 int(it->second.size()), int(tensor->ne[0]), tensor->name);
                     }
                 }
@@ -8777,10 +8777,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if ((new_type == GGML_TYPE_IQ2_XXS ||
                  new_type == GGML_TYPE_IQ2_XS  ||
                 (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
-                fprintf(stderr, "\n\n============================================================\n");
-                fprintf(stderr, "Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
-                fprintf(stderr, "The result will be garbage, so bailing out\n");
-                fprintf(stderr, "============================================================\n\n");
+                LLAMA_LOG_ERROR("\n\n============================================================\n");
+                LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
+                LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
+                LLAMA_LOG_ERROR("============================================================\n\n");
                 throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
             }
 

From 9408cfdad6b1c090a7e1419d4434edc260b7e47e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 14 Jan 2024 11:08:09 +0200
Subject: [PATCH 027/138] scripts : sync-ggml-am.sh option to skip commits

---
 scripts/sync-ggml-am.sh | 14 +++++++++++++-
 scripts/sync-ggml.last  |  2 +-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index 248cf1023..6b2514a11 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -5,7 +5,7 @@
 # Usage:
 #
 #   $ cd /path/to/llama.cpp
-#   $ ./scripts/sync-ggml-am.sh
+#   $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2...
 #
 
 set -e
@@ -24,6 +24,11 @@ fi
 lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last)
 echo "Syncing ggml changes since commit $lc"
 
+to_skip=""
+if [ "$1" == "-skip" ]; then
+    to_skip=$2
+fi
+
 cd $SRC_GGML
 
 git log --oneline $lc..HEAD
@@ -40,6 +45,13 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
 fi
 
 while read c; do
+    if [ -n "$to_skip" ]; then
+        if [[ $to_skip == *"$c"* ]]; then
+            echo "Skipping $c"
+            continue
+        fi
+    fi
+
     git format-patch -k $c~1..$c --stdout -- \
         include/ggml/ggml*.h \
         src/ggml*.h \
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 753d227a7..be9e408fb 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-1890780da4ea10db88736fcde85f285abf6c64b0
+b306d6e996ec0ace77118fa5098822cdc7f9c88f

From bb0c1392479398f9aba86d9ec98db0b95ede6e6d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 14 Jan 2024 13:26:53 +0200
Subject: [PATCH 028/138] llama : check LLAMA_TRACE env for extra logging
 (#4929)

* llama : minor fix indent

* llama : check LLAMA_TRACE env for extra logging

ggml-ci
---
 llama.cpp | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 51821965e..63f37ecdb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2190,6 +2190,11 @@ struct llama_model_loader {
     LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
 
     llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
+        int trace = 0;
+        if (getenv("LLAMA_TRACE")) {
+            trace = atoi(getenv("LLAMA_TRACE"));
+        }
+
         struct gguf_init_params params = {
             /*.no_alloc = */ true,
             /*.ctx      = */ &ctx_meta,
@@ -2242,11 +2247,10 @@ struct llama_model_loader {
                     type_max   = type;
                 }
 
-                // TODO: make runtime configurable
-#if 0
-                struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-                LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
-#endif
+                if (trace > 0) {
+                    struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+                    LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+                }
             }
 
             switch (type_max) {
@@ -6451,15 +6455,15 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
     static const char * hex = "0123456789ABCDEF";
     switch (llama_vocab_get_type(vocab)) {
-    case LLAMA_VOCAB_TYPE_SPM: {
-        const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-        return vocab.token_to_id.at(buf);
-    }
-    case LLAMA_VOCAB_TYPE_BPE: {
-        return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
-    }
-    default:
-        GGML_ASSERT(false);
+        case LLAMA_VOCAB_TYPE_SPM: {
+            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
+            return vocab.token_to_id.at(buf);
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
+        }
+        default:
+            GGML_ASSERT(false);
     }
 }
 

From 467a882fd2e5b6172897b49aa45aa29bd3f27685 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Sun, 14 Jan 2024 16:21:12 +0200
Subject: [PATCH 029/138] Add ability to use importance matrix for all k-quants
 (#4930)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 examples/quantize/quantize.cpp |   2 +-
 ggml-quants.c                  | 443 ++++++++++++++++++++++++++++++++-
 ggml-quants.h                  |   5 +-
 ggml.c                         |  28 ++-
 4 files changed, 462 insertions(+), 16 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index f4e2175f1..2ae046933 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -82,7 +82,7 @@ static void usage(const char * executable) {
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
-    printf("  --imatrixfile_name: use data in file_name as importance matrix for quant optimizations\n");
+    printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
     printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
diff --git a/ggml-quants.c b/ggml-quants.c
index 9290d54cf..0750fe1bb 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1244,7 +1244,8 @@ static inline int nearest_int(float fval) {
     return (i & 0x007fffff) - 0x00400000;
 }
 
-static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) {
+static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
+        const float * restrict qw) {
     float max = 0;
     float amax = 0;
     for (int i = 0; i < n; ++i) {
@@ -1270,14 +1271,13 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         rmse_type = -rmse_type;
         return_early = true;
     }
-    int weight_type = rmse_type%2;
     float sumlx = 0;
     float suml2 = 0;
     for (int i = 0; i < n; ++i) {
         int l = nearest_int(iscale * x[i]);
         l = MAX(-nmax, MIN(nmax-1, l));
         L[i] = l + nmax;
-        float w = weight_type == 1 ? x[i] * x[i] : 1;
+        float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
         sumlx += w*x[i]*l;
         suml2 += w*l*l;
     }
@@ -1293,7 +1293,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         for (int i = 0; i < n; ++i) {
             int l = nearest_int(iscale * x[i]);
             l = MAX(-nmax, MIN(nmax-1, l));
-            float w = weight_type == 1 ? x[i] * x[i] : 1;
+            float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
             sumlx += w*x[i]*l;
             suml2 += w*l*l;
         }
@@ -2089,6 +2089,112 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n
     return (n/QK_K*sizeof(block_q3_K));
 }
 
+static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
+#if QK_K != 256
+    (void)quant_weights;
+    quantize_row_q3_K_reference(x, y, n_per_row);
+#else
+    assert(n_per_row % QK_K == 0);
+    const int nb = n_per_row / QK_K;
+
+    int8_t L[QK_K];
+    float scales[QK_K / 16];
+    float weight[16];
+    float sw[QK_K / 16];
+    int8_t Ls[QK_K / 16];
+
+    for (int i = 0; i < nb; i++) {
+
+        float sumx2 = 0;
+        for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
+        float sigma2 = 2*sumx2/QK_K;
+
+        for (int j = 0; j < QK_K/16; ++j) {
+            if (quant_weights) {
+                const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL;
+                for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
+            } else {
+                for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
+            }
+            float sumw = 0;
+            for (int l = 0; l < 16; ++l) sumw += weight[l];
+            sw[j] = sumw;
+
+            scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight);
+
+        }
+
+        memset(y[i].scales, 0, 12);
+
+        float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
+        for (int j = 0; j < QK_K/16; ++j) {
+            int l = Ls[j];
+            if (j < 8) {
+                y[i].scales[j] = l & 0xF;
+            } else {
+                y[i].scales[j-8] |= ((l & 0xF) << 4);
+            }
+            l >>= 4;
+            y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
+        }
+        y[i].d = GGML_FP32_TO_FP16(d_block);
+
+        int8_t sc;
+        for (int j = 0; j < QK_K/16; ++j) {
+            sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
+            sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
+            float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+            if (!d) {
+                continue;
+            }
+            for (int ii = 0; ii < 16; ++ii) {
+                int l = nearest_int(x[16*j + ii]/d);
+                l = MAX(-4, MIN(3, l));
+                L[16*j + ii] = l + 4;
+            }
+        }
+
+        memset(y[i].hmask, 0, QK_K/8);
+        // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
+        int m = 0;
+        uint8_t hm = 1;
+        for (int j = 0; j < QK_K; ++j) {
+            if (L[j] > 3) {
+                y[i].hmask[m] |= hm;
+                L[j] -= 4;
+            }
+            if (++m == QK_K/8) {
+                m = 0; hm <<= 1;
+            }
+        }
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
+            }
+        }
+
+        x += QK_K;
+    }
+#endif
+}
+
+size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    (void)hist;
+    int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
+    if (!quant_weights) {
+        quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
+    }
+    else {
+        char * qrow = (char *)dst;
+        for (int row = 0; row < nrow; ++row) {
+            quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
+            src += n_per_row;
+            qrow += row_size;
+        }
+    }
+    return nrow * row_size;
+}
+
 // ====================== 4-bit (de)-quantization
 
 void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
@@ -2254,6 +2360,108 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n
     return (n/QK_K*sizeof(block_q4_K));
 }
 
+static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
+#if QK_K != 256
+    (void)quant_weights;
+    quantize_row_q4_K_reference(x, y, n_per_row);
+#else
+    assert(n_per_row % QK_K == 0);
+    const int nb = n_per_row / QK_K;
+
+    uint8_t L[QK_K];
+    uint8_t Laux[32];
+    float   weights[32];
+    float mins[QK_K/32];
+    float scales[QK_K/32];
+
+    for (int i = 0; i < nb; i++) {
+
+        float sum_x2 = 0;
+        for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
+        float sigma2 = sum_x2/QK_K;
+        float av_x = sqrtf(sigma2);
+
+        float max_scale = 0; // as we are deducting the min, scales are always positive
+        float max_min = 0;
+        for (int j = 0; j < QK_K/32; ++j) {
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*i + 32*j;
+                for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
+            } else {
+                for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            }
+            scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
+          //scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
+            float scale = scales[j];
+            if (scale > max_scale) {
+                max_scale = scale;
+            }
+            float min = mins[j];
+            if (min > max_min) {
+                max_min = min;
+            }
+        }
+
+        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
+        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
+        for (int j = 0; j < QK_K/32; ++j) {
+            uint8_t ls = nearest_int(inv_scale*scales[j]);
+            uint8_t lm = nearest_int(inv_min*mins[j]);
+            ls = MIN(63, ls);
+            lm = MIN(63, lm);
+            if (j < 4) {
+                y[i].scales[j] = ls;
+                y[i].scales[j+4] = lm;
+            } else {
+                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
+                y[i].scales[j-4] |= ((ls >> 4) << 6);
+                y[i].scales[j-0] |= ((lm >> 4) << 6);
+            }
+        }
+        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
+        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
+
+        uint8_t sc, m;
+        for (int j = 0; j < QK_K/32; ++j) {
+            get_scale_min_k4(j, y[i].scales, &sc, &m);
+            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+            if (!d) continue;
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
+            for (int ii = 0; ii < 32; ++ii) {
+                int l = nearest_int((x[32*j + ii] + dm)/d);
+                l = MAX(0, MIN(15, l));
+                L[32*j + ii] = l;
+            }
+        }
+        uint8_t * q = y[i].qs;
+        for (int j = 0; j < QK_K; j += 64) {
+            for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4);
+            q += 32;
+        }
+
+        x += QK_K;
+
+    }
+#endif
+}
+
+size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    (void)hist;
+    int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
+    if (!quant_weights) {
+        quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
+    }
+    else {
+        char * qrow = (char *)dst;
+        for (int row = 0; row < nrow; ++row) {
+            quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
+            src += n_per_row;
+            qrow += row_size;
+        }
+    }
+    return nrow * row_size;
+}
+
 // ====================== 5-bit (de)-quantization
 
 void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
@@ -2349,7 +2557,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
 #else
         float max_scale = 0, amax = 0;
         for (int j = 0; j < QK_K/16; ++j) {
-            scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1);
+            scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1, NULL);
             float abs_scale = fabsf(scales[j]);
             if (abs_scale > amax) {
                 amax = abs_scale;
@@ -2460,6 +2668,123 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n
     return (n/QK_K*sizeof(block_q5_K));
 }
 
+static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
+#if QK_K != 256
+    (void)quant_weights;
+    quantize_row_q5_K_reference(x, y, n_per_row);
+#else
+    assert(n_per_row % QK_K == 0);
+    const int nb = n_per_row / QK_K;
+
+    uint8_t L[QK_K];
+    float mins[QK_K/32];
+    float scales[QK_K/32];
+    float weights[32];
+    uint8_t Laux[32];
+
+    for (int i = 0; i < nb; i++) {
+
+        float sum_x2 = 0;
+        for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
+        float sigma2 = sum_x2/QK_K;
+        float av_x = sqrtf(sigma2);
+
+        float max_scale = 0; // as we are deducting the min, scales are always positive
+        float max_min = 0;
+        for (int j = 0; j < QK_K/32; ++j) {
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*i + 32*j;
+                for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
+            } else {
+                for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            }
+            scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
+            float scale = scales[j];
+            if (scale > max_scale) {
+                max_scale = scale;
+            }
+            float min = mins[j];
+            if (min > max_min) {
+                max_min = min;
+            }
+        }
+
+        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
+        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
+        for (int j = 0; j < QK_K/32; ++j) {
+            uint8_t ls = nearest_int(inv_scale*scales[j]);
+            uint8_t lm = nearest_int(inv_min*mins[j]);
+            ls = MIN(63, ls);
+            lm = MIN(63, lm);
+            if (j < 4) {
+                y[i].scales[j] = ls;
+                y[i].scales[j+4] = lm;
+            } else {
+                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
+                y[i].scales[j-4] |= ((ls >> 4) << 6);
+                y[i].scales[j-0] |= ((lm >> 4) << 6);
+            }
+        }
+        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
+        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
+
+        uint8_t sc, m;
+        for (int j = 0; j < QK_K/32; ++j) {
+            get_scale_min_k4(j, y[i].scales, &sc, &m);
+            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+            if (!d) continue;
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
+            for (int ii = 0; ii < 32; ++ii) {
+                int l = nearest_int((x[32*j + ii] + dm)/d);
+                l = MAX(0, MIN(31, l));
+                L[32*j + ii] = l;
+            }
+        }
+
+        uint8_t * restrict qh = y[i].qh;
+        uint8_t * restrict ql = y[i].qs;
+        memset(qh, 0, QK_K/8);
+
+        uint8_t m1 = 1, m2 = 2;
+        for (int n = 0; n < QK_K; n += 64) {
+            for (int j = 0; j < 32; ++j) {
+                int l1 = L[n + j];
+                if (l1 > 15) {
+                    l1 -= 16; qh[j] |= m1;
+                }
+                int l2 = L[n + j + 32];
+                if (l2 > 15) {
+                    l2 -= 16; qh[j] |= m2;
+                }
+                ql[j] = l1 | (l2 << 4);
+            }
+            m1 <<= 2; m2 <<= 2;
+            ql += 32;
+        }
+
+        x += QK_K;
+
+    }
+#endif
+}
+
+size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    (void)hist;
+    int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
+    if (!quant_weights) {
+        quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
+    }
+    else {
+        char * qrow = (char *)dst;
+        for (int row = 0; row < nrow; ++row) {
+            quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
+            src += n_per_row;
+            qrow += row_size;
+        }
+    }
+    return nrow * row_size;
+}
+
 // ====================== 6-bit (de)-quantization
 
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
@@ -2476,7 +2801,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
 
         for (int ib = 0; ib < QK_K/16; ++ib) {
 
-            const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1);
+            const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
             scales[ib] = scale;
 
             const float abs_scale = fabsf(scale);
@@ -2608,6 +2933,112 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK_K*sizeof(block_q6_K));
 }
 
+static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
+#if QK_K != 256
+    (void)quant_weights;
+    quantize_row_q6_K_reference(x, y, n_per_row);
+#else
+    assert(n_per_row % QK_K == 0);
+    const int nb = n_per_row / QK_K;
+
+    int8_t L[QK_K];
+    float   scales[QK_K/16];
+    //float   weights[16];
+
+    for (int i = 0; i < nb; i++) {
+
+        //float sum_x2 = 0;
+        //for (int j = 0; j < QK_K; ++j) sum_x2 += x[j]*x[j];
+        //float sigma2 = sum_x2/QK_K;
+
+        float max_scale = 0;
+        float max_abs_scale = 0;
+
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+
+            float scale;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*i + 16*ib;
+                //for (int j = 0; j < 16; ++j) weights[j] = qw[j] * sqrtf(sigma2 + x[16*ib + j]*x[16*ib + j]);
+                //scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, weights);
+                scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, qw);
+            } else {
+                scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
+            }
+            scales[ib] = scale;
+
+            const float abs_scale = fabsf(scale);
+            if (abs_scale > max_abs_scale) {
+                max_abs_scale = abs_scale;
+                max_scale = scale;
+            }
+
+        }
+
+        if (!max_abs_scale) {
+            memset(&y[i], 0, sizeof(block_q6_K));
+            y[i].d = GGML_FP32_TO_FP16(0.f);
+            x += QK_K;
+            continue;
+        }
+
+        float iscale = -128.f/max_scale;
+        y[i].d = GGML_FP32_TO_FP16(1/iscale);
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+            y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
+        }
+
+        for (int j = 0; j < QK_K/16; ++j) {
+            float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
+            if (!d) {
+                continue;
+            }
+            for (int ii = 0; ii < 16; ++ii) {
+                int l = nearest_int(x[16*j + ii]/d);
+                l = MAX(-32, MIN(31, l));
+                L[16*j + ii] = l + 32;
+            }
+        }
+
+        uint8_t * restrict ql = y[i].ql;
+        uint8_t * restrict qh = y[i].qh;
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                const uint8_t q1 = L[j + l +  0] & 0xF;
+                const uint8_t q2 = L[j + l + 32] & 0xF;
+                const uint8_t q3 = L[j + l + 64] & 0xF;
+                const uint8_t q4 = L[j + l + 96] & 0xF;
+                ql[l+ 0] = q1 | (q3 << 4);
+                ql[l+32] = q2 | (q4 << 4);
+                qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6);
+            }
+            ql += 64;
+            qh += 32;
+        }
+
+        x += QK_K;
+
+    }
+#endif
+}
+
+size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    (void)hist;
+    int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
+    if (!quant_weights) {
+        quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
+    }
+    else {
+        char * qrow = (char *)dst;
+        for (int row = 0; row < nrow; ++row) {
+            quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
+            src += n_per_row;
+            qrow += row_size;
+        }
+    }
+    return nrow * row_size;
+}
+
 // ====================== "True" 2-bit (de)-quantization
 
 static const  uint64_t iq2xxs_grid[256] = {
diff --git a/ggml-quants.h b/ggml-quants.h
index e5d110230..99467936a 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -249,4 +249,7 @@ void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict
 size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q2_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-
+size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q6_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
diff --git a/ggml.c b/ggml.c
index 52467475a..ef5888ab2 100644
--- a/ggml.c
+++ b/ggml.c
@@ -18713,26 +18713,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
         case GGML_TYPE_Q3_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q3_K * block = (block_q3_K*)dst + start / QK_K;
-                result = ggml_quantize_q3_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q4_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q4_K * block = (block_q4_K*)dst + start / QK_K;
-                result = ggml_quantize_q4_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q5_K * block = (block_q5_K*)dst + start / QK_K;
-                result = ggml_quantize_q5_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q6_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q6_K * block = (block_q6_K*)dst + start / QK_K;
-                result = ggml_quantize_q6_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_IQ2_XXS:
             {

From a836c8f534ab789b02da149fbdaf7735500bff74 Mon Sep 17 00:00:00 2001
From: David Pflug <david@pflug.email>
Date: Sun, 14 Jan 2024 10:46:00 -0500
Subject: [PATCH 030/138] llama : fix missing quotes (#4937)

---
 llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 63f37ecdb..7af38718c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7099,7 +7099,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         }
 
 #ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN(TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                         llm_tokenizer_spm tokenizer(vocab);
                         llama_escape_whitespace(raw_text);
@@ -7120,7 +7120,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN(TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                         llm_tokenizer_bpe tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);

From 4a3156de2fac9a8ee4279de7804d4e352dcfe121 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Mon, 15 Jan 2024 07:48:06 +0200
Subject: [PATCH 031/138] CUDA: faster dequantize kernels for Q4_0 and Q4_1
 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 ggml-cuda.cu | 77 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 73 insertions(+), 4 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index bd3814c72..a870718a7 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1105,6 +1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
 #endif // GGML_CUDA_F16
 }
 
+template<typename dst_t>
+static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+    const int i = blockIdx.x;
+
+    // assume 32 threads
+    const int tid = threadIdx.x;
+    const int il  = tid/8;
+    const int ir  = tid%8;
+    const int ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_0 * x = (const block_q4_0 *)vx + ib;
+    const float d = __half2float(x->d);
+    const float dm = -8*d;
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l+ 0] = d * (q[l] & 0xF) + dm;
+        y[l+16] = d * (q[l] >>  4) + dm;
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+    const int i = blockIdx.x;
+
+    // assume 32 threads
+    const int tid = threadIdx.x;
+    const int il  = tid/8;
+    const int ir  = tid%8;
+    const int ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_1 * x = (const block_q4_1 *)vx + ib;
+    const float2 d = __half22float2(x->dm);
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
+        y[l+16] = d.x * (q[l] >>  4) + d.y;
+    }
+}
+
 //================================== k-quants
 
 template<typename dst_t>
@@ -6253,6 +6308,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
 #endif
 }
 
+template<typename dst_t>
+static void dequantize_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
+template<typename dst_t>
+static void dequantize_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
 template<typename dst_t>
 static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
@@ -6301,9 +6370,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
     int id;
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+            return dequantize_q4_0_cuda;
         case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+            return dequantize_q4_1_cuda;
         case GGML_TYPE_Q5_0:
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
@@ -6338,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+            return dequantize_q4_0_cuda;
         case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+            return dequantize_q4_1_cuda;
         case GGML_TYPE_Q5_0:
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:

From 2faaef39799c97a53bec3898141478700da25757 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Mon, 15 Jan 2024 10:09:38 +0200
Subject: [PATCH 032/138] llama : check for 256 divisibility for IQ2_XS,
 IQ2_XXS (#4950)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 llama.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 7af38718c..f9718060d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8559,7 +8559,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     //}
     bool convert_incompatible_tensor = false;
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
+        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -8571,6 +8572,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     }
     if (convert_incompatible_tensor) {
         switch (new_type) {
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
             case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
             case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;

From ddb008d845cd50bb090bf051f570130524042936 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 15 Jan 2024 13:27:00 +0200
Subject: [PATCH 033/138] cuda : fix dequantize kernel names (#4938)

---
 ggml-cuda.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index a870718a7..c3e14bc96 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6309,14 +6309,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
 }
 
 template<typename dst_t>
-static void dequantize_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb32 = k / 32;
     const int nb = (k + 255) / 256;
     dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
 }
 
 template<typename dst_t>
-static void dequantize_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb32 = k / 32;
     const int nb = (k + 255) / 256;
     dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
@@ -6370,9 +6370,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
     int id;
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_q4_0_cuda;
+            return dequantize_row_q4_0_cuda;
         case GGML_TYPE_Q4_1:
-            return dequantize_q4_1_cuda;
+            return dequantize_row_q4_1_cuda;
         case GGML_TYPE_Q5_0:
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
@@ -6407,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_q4_0_cuda;
+            return dequantize_row_q4_0_cuda;
         case GGML_TYPE_Q4_1:
-            return dequantize_q4_1_cuda;
+            return dequantize_row_q4_1_cuda;
         case GGML_TYPE_Q5_0:
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:

From d9aa4ffa6e0296d42f1f676dd85de97c8491eb73 Mon Sep 17 00:00:00 2001
From: "Victor Z. Peng" <ziliangdotme@gmail.com>
Date: Mon, 15 Jan 2024 04:41:46 -0800
Subject: [PATCH 034/138] awq-py : fix typo in awq-py/README.md (#4947)

---
 awq-py/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/awq-py/README.md b/awq-py/README.md
index 59354f4e3..16e68d027 100644
--- a/awq-py/README.md
+++ b/awq-py/README.md
@@ -43,7 +43,7 @@ Example for llama model
 # For llama7b and llama2 models
 python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
 # For mistral and mpt models
-python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
+python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
 ```
 
 ## Quantize

From 4483396751c79dea540808b9cb9238245d06da2b Mon Sep 17 00:00:00 2001
From: David Friehs <david@friehs.info>
Date: Mon, 15 Jan 2024 14:06:52 +0100
Subject: [PATCH 035/138] llama : apply classifier-free guidance to logits
 directly (#4951)

---
 common/sampling.cpp |  9 ++++---
 llama.cpp           | 60 ++++++++++++++++++++++++++++++---------------
 llama.h             | 17 +++++++++----
 3 files changed, 57 insertions(+), 29 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index 8e45909f1..dd1ffeb1b 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -190,6 +190,11 @@ static llama_token llama_sampling_sample_impl(
         logits[it->first] += it->second;
     }
 
+    if (ctx_cfg) {
+        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
+        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
+    }
+
     cur.clear();
 
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -198,10 +203,6 @@ static llama_token llama_sampling_sample_impl(
 
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };
 
-    if (ctx_cfg) {
-        llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
-    }
-
     // apply penalties
     const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
     const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
diff --git a/llama.cpp b/llama.cpp
index f9718060d..46c4d11c8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7898,39 +7898,59 @@ static void llama_log_softmax(float * array, size_t size) {
     }
 }
 
+void llama_sample_apply_guidance(
+          struct llama_context * ctx,
+                         float * logits,
+                         float * logits_guidance,
+                         float   scale) {
+    GGML_ASSERT(ctx);
+
+    const auto t_start_sample_us = ggml_time_us();
+    const auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+    llama_log_softmax(logits, n_vocab);
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+              auto & l = logits[i];
+        const auto & g = logits_guidance[i];
+
+        l = scale * (l - g) + g;
+    }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 void llama_sample_classifier_free_guidance(
           struct llama_context * ctx,
         llama_token_data_array * candidates,
           struct llama_context * guidance_ctx,
                          float   scale) {
-    int64_t t_start_sample_us = ggml_time_us();
-
     GGML_ASSERT(ctx);
+    int64_t t_start_sample_us;
 
-    auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+    t_start_sample_us = ggml_time_us();
+    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
 
-    GGML_ASSERT(n_vocab == (int)candidates->size);
+    GGML_ASSERT(n_vocab == candidates->size);
     GGML_ASSERT(!candidates->sorted);
 
-    std::vector<float> logits_base;
-    logits_base.reserve(candidates->size);
-    for (size_t i = 0; i < candidates->size; ++i) {
-        logits_base.push_back(candidates->data[i].logit);
-    }
-    llama_log_softmax(logits_base.data(), candidates->size);
-
-    float* logits_guidance = llama_get_logits(guidance_ctx);
-    llama_log_softmax(logits_guidance, n_vocab);
-
-    for (int i = 0; i < n_vocab; ++i) {
-        float logit_guidance = logits_guidance[i];
-        float logit_base = logits_base[i];
-        candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
+    std::vector<float> logits_base(n_vocab);
+    for (size_t i = 0; i < n_vocab; ++i) {
+        logits_base[i] = candidates->data[i].logit;
     }
 
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    float * logits_guidance = llama_get_logits(guidance_ctx);
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
+    t_start_sample_us = ggml_time_us();
+
+    for (size_t i = 0; i < n_vocab; ++i) {
+        candidates->data[i].logit = logits_base[i];
     }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
diff --git a/llama.h b/llama.h
index 79c8335b6..a570b0d69 100644
--- a/llama.h
+++ b/llama.h
@@ -714,14 +714,21 @@ extern "C" {
                            float   penalty_present);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_classifier_free_guidance(
+    /// @param logits Logits extracted from the original generation context.
+    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_apply_guidance(
+              struct llama_context * ctx,
+                             float * logits,
+                             float * logits_guidance,
+                             float   scale);
+
+    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
               struct llama_context * ctx,
             llama_token_data_array * candidates,
               struct llama_context * guidance_ctx,
-                             float   scale);
+                             float   scale),
+              "use llama_sample_apply_guidance() instead");
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(

From 3e5ca7931c68152e4ec18d126e9c832dd84914c8 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Mon, 15 Jan 2024 20:40:48 +0200
Subject: [PATCH 036/138] pass cpu-architecture arguments only to host code
 (C;C++) (#4943)

---
 CMakeLists.txt | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2741568ed..7bd640966 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -594,6 +594,13 @@ if (NOT MSVC)
     endif()
 endif()
 
+function(add_compile_option_cpp ARG)
+    # Adds a compile option to C/C++ only, but not for Cuda.
+    # Use, e.g., for CPU-architecture flags.
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARG}>)
+    add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARG}>)
+endfunction()
+
 if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
     message(STATUS "ARM detected")
     if (MSVC)
@@ -628,8 +635,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
             include(cmake/FindSIMD.cmake)
         endif ()
         if (LLAMA_AVX512)
-            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
-            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
+            add_compile_option_cpp(/arch:AVX512)
             # MSVC has no compile-time flags enabling specific
             # AVX512 extensions, neither it defines the
             # macros corresponding to the extensions.
@@ -643,37 +649,35 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
             endif()
         elseif (LLAMA_AVX2)
-            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
-            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
+            add_compile_option_cpp(/arch:AVX2)
         elseif (LLAMA_AVX)
-            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
-            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
+            add_compile_option_cpp(/arch:AVX)
         endif()
     else()
         if (LLAMA_NATIVE)
-            add_compile_options(-march=native)
+            add_compile_option_cpp(-march=native)
         endif()
         if (LLAMA_F16C)
-            add_compile_options(-mf16c)
+            add_compile_option_cpp(-mf16c)
         endif()
         if (LLAMA_FMA)
-            add_compile_options(-mfma)
+            add_compile_option_cpp(-mfma)
         endif()
         if (LLAMA_AVX)
-            add_compile_options(-mavx)
+            add_compile_option_cpp(-mavx)
         endif()
         if (LLAMA_AVX2)
-            add_compile_options(-mavx2)
+            add_compile_option_cpp(-mavx2)
         endif()
         if (LLAMA_AVX512)
-            add_compile_options(-mavx512f)
-            add_compile_options(-mavx512bw)
+            add_compile_option_cpp(-mavx512f)
+            add_compile_option_cpp(-mavx512bw)
         endif()
         if (LLAMA_AVX512_VBMI)
-            add_compile_options(-mavx512vbmi)
+            add_compile_option_cpp(-mavx512vbmi)
         endif()
         if (LLAMA_AVX512_VNNI)
-            add_compile_options(-mavx512vnni)
+            add_compile_option_cpp(-mavx512vnni)
         endif()
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")

From e0324285a569d0583cf2f4a07a2402221ee25f58 Mon Sep 17 00:00:00 2001
From: stduhpf <stephduh@live.fr>
Date: Tue, 16 Jan 2024 12:04:32 +0100
Subject: [PATCH 037/138] speculative : threading options (#4959)

* speculative: expose draft threading

* fix usage format

* accept -td and -tbd args

* speculative: revert default behavior when -td is unspecified

* fix trailing whitespace
---
 common/common.cpp                    | 22 ++++++++++++++++++++++
 common/common.h                      |  2 ++
 examples/speculative/speculative.cpp |  4 ++++
 3 files changed, 28 insertions(+)

diff --git a/common/common.cpp b/common/common.cpp
index c11006bcb..2b0865fff 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -167,6 +167,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             if (params.n_threads_batch <= 0) {
                 params.n_threads_batch = std::thread::hardware_concurrency();
             }
+        } else if (arg == "-td" || arg == "--threads-draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads_draft = std::stoi(argv[i]);
+            if (params.n_threads_draft <= 0) {
+                params.n_threads_draft = std::thread::hardware_concurrency();
+            }
+        } else if (arg == "-tbd" || arg == "--threads-batch-draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads_batch_draft = std::stoi(argv[i]);
+            if (params.n_threads_batch_draft <= 0) {
+                params.n_threads_batch_draft = std::thread::hardware_concurrency();
+            }
         } else if (arg == "-p" || arg == "--prompt") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -845,6 +863,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
     printf("  -tb N, --threads-batch N\n");
     printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
+    printf("  -td N, --threads-draft N");
+    printf("                        number of threads to use during generation (default: same as --threads)");
+    printf("  -tbd N, --threads-batch-draft N\n");
+    printf("                        number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
     printf("  -p PROMPT, --prompt PROMPT\n");
     printf("                        prompt to start generation with (default: empty)\n");
     printf("  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
diff --git a/common/common.h b/common/common.h
index 096468243..1f43e6282 100644
--- a/common/common.h
+++ b/common/common.h
@@ -46,7 +46,9 @@ struct gpt_params {
     uint32_t seed                           = -1;    // RNG seed
 
     int32_t n_threads                       = get_num_physical_cores();
+    int32_t n_threads_draft                 = -1;
     int32_t n_threads_batch                 = -1;    // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads_batch_draft           = -1;
     int32_t n_predict                       = -1;    // new tokens to predict
     int32_t n_ctx                           = 512;   // context size
     int32_t n_batch                         = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 20f1fb5bf..7b3af01f3 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -65,6 +65,10 @@ int main(int argc, char ** argv) {
     // load the draft model
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
+    if (params.n_threads_draft > 0) {
+        params.n_threads = params.n_threads_draft;
+    }
+    params.n_threads_batch = params.n_threads_batch_draft;
     std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
 
     {

From d75c232e1da56f19ac4d2530dadbe0ab3a11fde5 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 16 Jan 2024 12:14:19 +0100
Subject: [PATCH 038/138] finetune : use LLAMA_FILE_MAGIC_GGLA (#4961)

This commit replaces the magic number LLAMA_FILE_MAGIC_LORA used in
finetune.cpp with LLAMA_FILE_MAGIC_GGLA defined in llama.h.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 examples/finetune/finetune.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index eaca42fc1..a6620fd73 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1138,9 +1138,8 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
         return tn_buf.data();
     };
 
-    uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
     // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC_LORA);   // magic
+    file.write_u32(LLAMA_FILE_MAGIC_GGLA);   // magic
     file.write_u32(1); // version
     // write_hparams
     file.write_u32(lora->hparams.lora_r);

From a0b3ac8c48b66206b9c5921ce57bd5c0ea6557c3 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@gmail.com>
Date: Tue, 16 Jan 2024 03:16:33 -0800
Subject: [PATCH 039/138] ggml : introduce GGML_CALL function annotation
 (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
---
 ggml-backend-impl.h |  60 +++++++++++-----------
 ggml-backend.c      |  80 ++++++++++++++---------------
 ggml-backend.h      |  50 +++++++++---------
 ggml-cuda.cu        | 121 ++++++++++++++++++++++----------------------
 ggml-cuda.h         |  32 ++++++------
 ggml-metal.h        |   4 +-
 ggml-metal.m        |  42 +++++++--------
 ggml.c              |  32 ++++++------
 ggml.h              |  58 ++++++++++++---------
 9 files changed, 244 insertions(+), 235 deletions(-)

diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
index 1db32901f..1397828d9 100644
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -16,14 +16,14 @@ extern "C" {
     typedef void * ggml_backend_buffer_type_context_t;
 
     struct ggml_backend_buffer_type_i {
-        const char *          (*get_name)        (ggml_backend_buffer_type_t buft);
-        ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
+        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory
         // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool                  (*is_host)         (ggml_backend_buffer_type_t buft);
+        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
     };
 
     struct ggml_backend_buffer_type {
@@ -35,15 +35,15 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;
 
     struct ggml_backend_buffer_i {
-        const char * (*get_name)   (ggml_backend_buffer_t buffer);
-        void         (*free_buffer)(ggml_backend_buffer_t buffer);
-        void *       (*get_base)   (ggml_backend_buffer_t buffer);
-        void         (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool         (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*clear)      (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
+        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
+        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
     };
 
     struct ggml_backend_buffer {
@@ -54,7 +54,7 @@ extern "C" {
         enum ggml_backend_buffer_usage usage;
     };
 
-    ggml_backend_buffer_t ggml_backend_buffer_init(
+    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
                    ggml_backend_buffer_type_t      buft,
             struct ggml_backend_buffer_i           iface,
                    ggml_backend_buffer_context_t   context,
@@ -70,31 +70,31 @@ extern "C" {
     typedef void * ggml_backend_context_t;
 
     struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
+        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
 
-        void (*free)(ggml_backend_t backend);
+        void (*GGML_CALL free)(ggml_backend_t backend);
 
         // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
 
         // (optional) asynchronous tensor data access
-        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
         // (optional) complete all pending operations
-        void (*synchronize)(ggml_backend_t backend);
+        void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void                      (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
         // compute graph without a plan (async)
-        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
     };
 
     struct ggml_backend {
@@ -107,9 +107,9 @@ extern "C" {
     // Backend registry
     //
 
-    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
 
-    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml-backend.c b/ggml-backend.c
index 505dbba47..f5424fb90 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -19,7 +19,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name(buft);
 }
 
-ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -27,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_alignment(buft);
 }
 
-size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
         return buft->iface.get_alloc_size(buft, tensor);
@@ -48,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
 
 // backend buffer
 
-ggml_backend_buffer_t ggml_backend_buffer_init(
+GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
                ggml_backend_buffer_type_t      buft,
         struct ggml_backend_buffer_i           iface,
                ggml_backend_buffer_context_t   context,
@@ -95,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
     return base;
 }
 
-void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     // init_tensor is optional
     if (buffer->iface.init_tensor) {
         buffer->iface.init_tensor(buffer, tensor);
@@ -191,7 +191,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
     }
 }
 
-void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
@@ -201,7 +201,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
     tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
-void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
@@ -318,9 +318,9 @@ struct ggml_backend_reg {
 static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
 static size_t ggml_backend_registry_count = 0;
 
-static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
+GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
 
-static void ggml_backend_registry_init(void) {
+GGML_CALL static void ggml_backend_registry_init(void) {
     static bool initialized = false;
 
     if (initialized) {
@@ -333,18 +333,18 @@ static void ggml_backend_registry_init(void) {
 
     // add forward decls here to avoid including the backend headers
 #ifdef GGML_USE_CUBLAS
-    extern void ggml_backend_cuda_reg_devices(void);
+    extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
     ggml_backend_cuda_reg_devices();
 #endif
 
 #ifdef GGML_USE_METAL
-    extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
-    extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+    extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
+    extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
     ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
 #endif
 }
 
-void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
+GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
     GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
 
     size_t id = ggml_backend_registry_count;
@@ -439,33 +439,33 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
 
 // backend CPU
 
-static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
     GGML_UNUSED(buffer);
 }
 
-static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     return (void *)buffer->context;
 }
 
-static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
 }
 
-static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
     GGML_UNUSED(buffer);
 }
 
-static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
     if (ggml_backend_buffer_is_host(src->buffer)) {
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;
@@ -475,7 +475,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     memset(buffer->context, value, buffer->size);
 }
 
@@ -506,13 +506,13 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
 
 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
 
-static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU";
 
     GGML_UNUSED(buft);
 }
 
-static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
 
@@ -521,25 +521,25 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
     return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
 }
 
-static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return TENSOR_ALIGNMENT;
 
     GGML_UNUSED(buft);
 }
 
-static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_cpu(backend);
 
     GGML_UNUSED(buft);
 }
 
-static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
     GGML_UNUSED(buft);
 }
 
-ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
             /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
@@ -561,23 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
 
 #include <hbwmalloc.h>
 
-static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU_HBM";
 
     GGML_UNUSED(buft);
 }
 
-static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
+GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
     return "CPU_HBM";
 
     GGML_UNUSED(buf);
 }
 
-static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     hbw_free(buffer->context);
 }
 
-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     //void * ptr = hbw_malloc(size);
     void * ptr;
     int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
@@ -617,20 +617,20 @@ struct ggml_backend_cpu_context {
     size_t work_size;
 };
 
-static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
     return "CPU";
 
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_free(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
     free(cpu_ctx->work_data);
     free(cpu_ctx);
     free(backend);
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_cpu_buffer_type();
 
     GGML_UNUSED(backend);
@@ -641,7 +641,7 @@ struct ggml_backend_plan_cpu {
     struct ggml_cgraph cgraph;
 };
 
-static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
+GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
@@ -656,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     return cpu_plan;
 }
 
-static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
     free(cpu_plan->cplan.work_data);
@@ -665,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
     ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -673,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
     GGML_UNUSED(backend);
 }
 
-static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -690,7 +690,7 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
     return true;
 }
 
-static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
@@ -732,7 +732,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     return cpu_backend;
 }
 
-bool ggml_backend_is_cpu(ggml_backend_t backend) {
+GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
     return backend && backend->iface.get_name == ggml_backend_cpu_name;
 }
 
@@ -743,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }
 
-ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }
 
-static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
+GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
     return ggml_backend_cpu_init();
 
     GGML_UNUSED(params);
diff --git a/ggml-backend.h b/ggml-backend.h
index 4eb244af1..12b4b4ab7 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -17,12 +17,12 @@ extern "C" {
     //
 
     // buffer type
-    GGML_API const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
-    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
-    GGML_API size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
-    GGML_API bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
+    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
+    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
+    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
 
     // buffer
     enum ggml_backend_buffer_usage {
@@ -30,18 +30,18 @@ extern "C" {
         GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
     };
 
-    GGML_API const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+    GGML_API           const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API           void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API           ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
 
     //
     // Backend
@@ -58,8 +58,8 @@ extern "C" {
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
 
-    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
 
     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
 
@@ -80,13 +80,13 @@ extern "C" {
 
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);
 
-    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+    GGML_API GGML_CALL bool ggml_backend_is_cpu           (ggml_backend_t backend);
+    GGML_API           void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
 
     // Create a backend buffer from an existing pointer
-    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
 
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
 
 #ifdef GGML_USE_CPU_HBM
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
@@ -183,7 +183,7 @@ extern "C" {
     GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
     GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
 
-    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
 
     // Compare the output of two backends
     GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c3e14bc96..568c411af 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -7615,11 +7615,11 @@ struct cuda_pool_alloc {
 
 static bool g_cublas_loaded = false;
 
-bool ggml_cublas_loaded(void) {
+GGML_CALL bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }
 
-void ggml_init_cublas() {
+GGML_CALL void ggml_init_cublas() {
     static bool initialized = false;
 
     if (!initialized) {
@@ -7707,7 +7707,7 @@ void ggml_init_cublas() {
     }
 }
 
-void * ggml_cuda_host_malloc(size_t size) {
+GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
     if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
         return nullptr;
     }
@@ -7725,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) {
     return ptr;
 }
 
-void ggml_cuda_host_free(void * ptr) {
+GGML_CALL void ggml_cuda_host_free(void * ptr) {
     CUDA_CHECK(cudaFreeHost(ptr));
 }
 
@@ -9242,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
 
-bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
     if (!g_cublas_loaded) return false;
 
     const int64_t ne10 = src1->ne[0];
@@ -10013,7 +10013,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
     return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 }
 
-static void ggml_cuda_set_main_device(const int main_device) {
+GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
     if (main_device >= g_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                 main_device, g_device_count, g_main_device);
@@ -10028,7 +10028,7 @@ static void ggml_cuda_set_main_device(const int main_device) {
     }
 }
 
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     if (!g_cublas_loaded) return false;
 
     ggml_cuda_func_t func;
@@ -10186,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     return true;
 }
 
-int ggml_cuda_get_device_count() {
+GGML_CALL int ggml_cuda_get_device_count() {
     int device_count;
     if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
         return 0;
@@ -10194,7 +10194,7 @@ int ggml_cuda_get_device_count() {
     return device_count;
 }
 
-void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     snprintf(description, description_size, "%s", prop.name);
@@ -10244,27 +10244,27 @@ struct ggml_backend_cuda_buffer_context {
     }
 };
 
-static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     return ctx->name.c_str();
 }
 
-static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
+GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
     return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
 }
 
-static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     CUDA_CHECK(cudaFree(ctx->dev_ptr));
     delete ctx;
 }
 
-static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     return ctx->dev_ptr;
 }
 
-static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
@@ -10296,7 +10296,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
     }
 }
 
-static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
@@ -10307,7 +10307,7 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
     CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
@@ -10318,7 +10318,7 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co
     CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
+GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
     if (ggml_backend_buffer_is_cuda(src->buffer)) {
         ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
         ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
@@ -10335,7 +10335,7 @@ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, co
     return false;
 }
 
-static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
@@ -10357,19 +10357,18 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
 };
 
 // cuda buffer type
-
 struct ggml_backend_cuda_buffer_type_context {
     int device;
     std::string name;
 };
 
-static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
     ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
     return ctx->name.c_str();
 }
 
-static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
     ggml_cuda_set_device(buft_ctx->device);
@@ -10388,13 +10387,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
     return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
 }
 
-static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
 
     UNUSED(buft);
 }
 
-static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     int64_t row_low = 0;
     int64_t row_high = ggml_nrows(tensor);
     int64_t nrows_split = row_high - row_low;
@@ -10414,7 +10413,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
     UNUSED(buft);
 }
 
-static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     if (!ggml_backend_is_cuda(backend)) {
         return false;
     }
@@ -10434,7 +10433,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .is_host          = */ NULL,
 };
 
-ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
     // FIXME: this is not thread safe
     if (device >= ggml_backend_cuda_get_device_count()) {
         return nullptr;
@@ -10479,7 +10478,7 @@ struct ggml_backend_cuda_split_buffer_context {
     std::vector<ggml_tensor_extra_gpu *> tensor_extras;
 };
 
-static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
     return GGML_CUDA_NAME "_Split";
 
     UNUSED(buffer);
@@ -10490,19 +10489,19 @@ static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_
 //    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
 //}
 
-static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     delete ctx;
 }
 
-static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
     // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
     return (void *)0x1000;
 
     UNUSED(buffer);
 }
 
-static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
 
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@@ -10552,7 +10551,7 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
     tensor->extra = extra;
 }
 
-static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -10586,7 +10585,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
     }
 }
 
-static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -10620,7 +10619,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
     }
 }
 
-static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     UNUSED(buffer);
     UNUSED(value);
 }
@@ -10639,13 +10638,13 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
 
 // cuda split buffer type
 
-static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
     return GGML_CUDA_NAME "_Split";
 
     UNUSED(buft);
 }
 
-static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
     // instead, we allocate them for each tensor separately in init_tensor
     // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
@@ -10655,13 +10654,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
     return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
 }
 
-static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
 
     UNUSED(buft);
 }
 
-static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
 
     size_t total_size = 0;
@@ -10688,13 +10687,13 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_bu
     return total_size;
 }
 
-static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_cuda(backend);
 
     UNUSED(buft);
 }
 
-static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;
 
     UNUSED(buft);
@@ -10709,7 +10708,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
 };
 
-ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
     // FIXME: this is not thread safe
     static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
 
@@ -10745,23 +10744,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten
 
 // host buffer type
 
-static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
     return GGML_CUDA_NAME "_Host";
 
     UNUSED(buft);
 }
 
-static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
     return GGML_CUDA_NAME "_Host";
 
     UNUSED(buffer);
 }
 
-static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_cuda_host_free(buffer->context);
 }
 
-static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     void * ptr = ggml_cuda_host_malloc(size);
 
     if (ptr == nullptr) {
@@ -10777,7 +10776,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     return buffer;
 }
 
-ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
         /* .iface    = */ {
             /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
@@ -10795,26 +10794,26 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
 
 // backend
 
-static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     return cuda_ctx->name.c_str();
 }
 
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     delete cuda_ctx;
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     return ggml_backend_cuda_buffer_type(cuda_ctx->device);
 }
 
-static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
@@ -10823,7 +10822,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
@@ -10832,7 +10831,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
+GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
@@ -10843,7 +10842,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggm
     return false;
 }
 
-static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
@@ -10851,7 +10850,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     UNUSED(backend);
 }
 
-static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_main_device(cuda_ctx->device);
@@ -10890,7 +10889,7 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     return true;
 }
 
-static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -11016,7 +11015,7 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .supports_op             = */ ggml_backend_cuda_supports_op,
 };
 
-ggml_backend_t ggml_backend_cuda_init(int device) {
+GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
 
     if (device < 0 || device >= ggml_cuda_get_device_count()) {
@@ -11040,35 +11039,35 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
     return cuda_backend;
 }
 
-bool ggml_backend_is_cuda(ggml_backend_t backend) {
+GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
     return backend && backend->iface.get_name == ggml_backend_cuda_name;
 }
 
-int ggml_backend_cuda_get_device_count() {
+GGML_CALL int ggml_backend_cuda_get_device_count() {
     return ggml_cuda_get_device_count();
 }
 
-void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
     ggml_cuda_get_device_description(device, description, description_size);
 }
 
-void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
+GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
     ggml_cuda_set_device(device);
 
     CUDA_CHECK(cudaMemGetInfo(free, total));
 }
 
 // backend registry
-static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
+GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
     ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
     return cuda_backend;
 
     UNUSED(params);
 }
 
-extern "C" int ggml_backend_cuda_reg_devices();
+extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
 
-int ggml_backend_cuda_reg_devices() {
+GGML_CALL int ggml_backend_cuda_reg_devices() {
     int device_count = ggml_cuda_get_device_count();
     //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
     for (int i = 0; i < device_count; i++) {
diff --git a/ggml-cuda.h b/ggml-cuda.h
index d19cbf3fd..b1ebd61d7 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -18,34 +18,34 @@ extern "C" {
 #define GGML_CUDA_MAX_DEVICES       16
 
 // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
-GGML_API void   ggml_init_cublas(void);
+GGML_API GGML_CALL void   ggml_init_cublas(void);
 
 // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
-GGML_API bool   ggml_cublas_loaded(void);
+GGML_API GGML_CALL bool   ggml_cublas_loaded(void);
 
-GGML_API void * ggml_cuda_host_malloc(size_t size);
-GGML_API void   ggml_cuda_host_free(void * ptr);
+GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
+GGML_API GGML_CALL void   ggml_cuda_host_free(void * ptr);
 
-GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+GGML_API GGML_CALL bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API GGML_CALL bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
-GGML_API int    ggml_cuda_get_device_count(void);
-GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API GGML_CALL int    ggml_cuda_get_device_count(void);
+GGML_API GGML_CALL void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
 
 // backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
 
-GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
 
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
 
-GGML_API int  ggml_backend_cuda_get_device_count(void);
-GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml-metal.h b/ggml-metal.h
index cd5e2995f..8b0bfc5f1 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -47,11 +47,11 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
-GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
 
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
 
-GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
diff --git a/ggml-metal.m b/ggml-metal.m
index 2ca726055..867f2fd48 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -2294,13 +2294,13 @@ static void ggml_backend_metal_free_device(void) {
     }
 }
 
-static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "Metal";
 
     UNUSED(buffer);
 }
 
-static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     for (int i = 0; i < ctx->n_buffers; i++) {
@@ -2315,25 +2315,25 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     free(ctx);
 }
 
-static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     return ctx->all_data;
 }
 
-static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
     UNUSED(buffer);
 }
 
-static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
     UNUSED(buffer);
 }
 
-static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
     if (ggml_backend_buffer_is_host(src->buffer)) {
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;
@@ -2343,7 +2343,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
     UNUSED(buffer);
 }
 
-static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     memset(ctx->all_data, value, ctx->all_size);
@@ -2363,13 +2363,13 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
 
 // default buffer type
 
-static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal";
 
     UNUSED(buft);
 }
 
-static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
     const size_t size_page = sysconf(_SC_PAGESIZE);
@@ -2421,24 +2421,24 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }
 
-static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 32;
     UNUSED(buft);
 }
 
-static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
 
     UNUSED(buft);
 }
 
-static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
     UNUSED(buft);
 }
 
-ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
         /* .iface = */ {
             /* .get_name         = */ ggml_backend_metal_buffer_type_get_name,
@@ -2456,7 +2456,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
 
 // buffer from ptr
 
-ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
     ctx->all_data = data;
@@ -2543,31 +2543,31 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
 
 // backend
 
-static const char * ggml_backend_metal_name(ggml_backend_t backend) {
+GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
 
     UNUSED(backend);
 }
 
-static void ggml_backend_metal_free(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
     struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
     ggml_metal_free(ctx);
     free(backend);
 }
 
-static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_metal_buffer_type();
 
     UNUSED(backend);
 }
 
-static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
 
     return ggml_metal_graph_compute(metal_ctx, cgraph);
 }
 
-static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
 
     return ggml_metal_supports_op(metal_ctx, op);
@@ -2630,9 +2630,9 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
     return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }
 
-ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
+GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
 
-ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
+GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
     return ggml_backend_metal_init();
 
     GGML_UNUSED(params);
diff --git a/ggml.c b/ggml.c
index ef5888ab2..5779f32d2 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1990,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
     GGML_PRINT("%s: --- end ---\n", __func__);
 }
 
-int64_t ggml_nelements(const struct ggml_tensor * tensor) {
+GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-int64_t ggml_nrows(const struct ggml_tensor * tensor) {
+GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-size_t ggml_nbytes(const struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     size_t nbytes;
     size_t blck_size = ggml_blck_size(tensor->type);
     if (blck_size == 1) {
@@ -2025,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
     return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }
 
-int ggml_blck_size(enum ggml_type type) {
+GGML_CALL int ggml_blck_size(enum ggml_type type) {
     return type_traits[type].blck_size;
 }
 
-size_t ggml_type_size(enum ggml_type type) {
+GGML_CALL size_t ggml_type_size(enum ggml_type type) {
     return type_traits[type].type_size;
 }
 
-size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
     assert(ne % ggml_blck_size(type) == 0);
     return ggml_type_size(type)*ne/ggml_blck_size(type);
 }
@@ -2042,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) {
     return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
 
-const char * ggml_type_name(enum ggml_type type) {
+GGML_CALL const char * ggml_type_name(enum ggml_type type) {
     return type_traits[type].type_name;
 }
 
-bool ggml_is_quantized(enum ggml_type type) {
+GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
     return type_traits[type].is_quantized;
 }
 
-const char * ggml_op_name(enum ggml_op op) {
+GGML_CALL const char * ggml_op_name(enum ggml_op op) {
     return GGML_OP_NAME[op];
 }
 
@@ -2062,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
     return GGML_UNARY_OP_NAME[op];
 }
 
-const char * ggml_op_desc(const struct ggml_tensor * t) {
+GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
     if (t->op == GGML_OP_UNARY) {
         enum ggml_unary_op uop = ggml_get_unary_op(t);
         return ggml_unary_op_name(uop);
@@ -2072,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) {
     }
 }
 
-size_t ggml_element_size(const struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return ggml_type_size(tensor->type);
 }
 
@@ -2154,11 +2154,11 @@ size_t ggml_tensor_overhead(void) {
     return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
 }
 
-bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
 
-bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -2177,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-bool ggml_is_permuted(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
@@ -3079,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
     return (float *)(tensor->data);
 }
 
-enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
+GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->op == GGML_OP_UNARY);
     return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
 }
@@ -11653,7 +11653,7 @@ static void ggml_rope_cache_init(
     }
 }
 
-void ggml_rope_yarn_corr_dims(
+GGML_CALL void ggml_rope_yarn_corr_dims(
     int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
 ) {
     // start and end correction dims
diff --git a/ggml.h b/ggml.h
index 1187074f7..837c52e68 100644
--- a/ggml.h
+++ b/ggml.h
@@ -187,6 +187,16 @@
 #    define GGML_API
 #endif
 
+#ifdef GGML_MULTIPLATFORM
+#    if defined(_WIN32)
+#        define GGML_CALL
+#    else
+#        define GGML_CALL __attribute__((__ms_abi__))
+#    endif
+#else
+#    define GGML_CALL
+#endif
+
 // TODO: support for clang
 #ifdef __GNUC__
 #    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@@ -649,41 +659,41 @@ extern "C" {
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
 
-    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
-    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+    GGML_API GGML_CALL int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API           size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
 
-    GGML_API int    ggml_blck_size(enum ggml_type type);
-    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
-    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+    GGML_API GGML_CALL int    ggml_blck_size(enum ggml_type type);
+    GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
 
     GGML_DEPRECATED(
     GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
     "use ggml_row_size() instead");
 
-    GGML_API const char * ggml_type_name(enum ggml_type type);
-    GGML_API const char * ggml_op_name  (enum ggml_op   op);
-    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
+    GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
+    GGML_API GGML_CALL const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API           const char * ggml_op_symbol(enum ggml_op   op);
 
-    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
-    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+    GGML_API           const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
 
-    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL size_t  ggml_element_size(const struct ggml_tensor * tensor);
 
-    GGML_API bool    ggml_is_quantized(enum ggml_type type);
+    GGML_API GGML_CALL bool    ggml_is_quantized(enum ggml_type type);
 
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
-    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
-    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
+    GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API           int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
     GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -770,7 +780,7 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
 
     GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
@@ -1413,7 +1423,7 @@ extern "C" {
             float                 beta_slow);
 
     // compute correction dims for YaRN RoPE scaling
-    void ggml_rope_yarn_corr_dims(
+    GGML_CALL void ggml_rope_yarn_corr_dims(
         int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
 
     // xPos RoPE, in-place, returns view(a)

From 122ed4840cc6d209df6043e027f9f8a03aee01da Mon Sep 17 00:00:00 2001
From: Maximilian Winter <maximilian.winter.91@gmail.com>
Date: Tue, 16 Jan 2024 13:10:48 +0100
Subject: [PATCH 040/138] examples : fix and improv docs for the grammar
 generator (#4909)

* Create pydantic-models-to-grammar.py

* Added some comments for usage

* Refactored Grammar Generator

Added example and usage instruction.

* Update pydantic_models_to_grammar.py

* Update pydantic-models-to-grammar-examples.py

* Renamed module and imported it.

* Update pydantic-models-to-grammar.py

* Renamed file and fixed grammar generator issue.

* Fixed some issues and bugs of the grammar generator. Imporved Documentation

* Update pydantic_models_to_grammar.py
---
 examples/pydantic_models_to_grammar.py | 877 +++++++++++++++----------
 1 file changed, 519 insertions(+), 358 deletions(-)

diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py
index 41b98fdc1..848c1c367 100644
--- a/examples/pydantic_models_to_grammar.py
+++ b/examples/pydantic_models_to_grammar.py
@@ -4,6 +4,7 @@ from copy import copy
 from inspect import isclass, getdoc
 from types import NoneType
 
+from docstring_parser import parse
 from pydantic import BaseModel, create_model, Field
 from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias
 from enum import Enum
@@ -25,9 +26,10 @@ class PydanticDataType(Enum):
         ENUM (str): Represents an enum data type.
         CUSTOM_CLASS (str): Represents a custom class data type.
     """
+
     STRING = "string"
     TRIPLE_QUOTED_STRING = "triple_quoted_string"
-    MARKDOWN_STRING = "markdown_string"
+    MARKDOWN_CODE_BLOCK = "markdown_code_block"
     BOOLEAN = "boolean"
     INTEGER = "integer"
     FLOAT = "float"
@@ -78,10 +80,10 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
 
 
 def format_model_and_field_name(model_name: str) -> str:
-    parts = re.findall('[A-Z][^A-Z]*', model_name)
+    parts = re.findall("[A-Z][^A-Z]*", model_name)
     if not parts:  # Check if the list is empty
         return model_name.lower().replace("_", "-")
-    return '-'.join(part.lower().replace("_", "-") for part in parts)
+    return "-".join(part.lower().replace("_", "-") for part in parts)
 
 
 def generate_list_rule(element_type):
@@ -93,29 +95,31 @@ def generate_list_rule(element_type):
     """
     rule_name = f"{map_pydantic_type_to_gbnf(element_type)}-list"
     element_rule = map_pydantic_type_to_gbnf(element_type)
-    list_rule = fr'{rule_name} ::= "["  {element_rule} (","  {element_rule})* "]"'
+    list_rule = rf'{rule_name} ::= "["  {element_rule} (","  {element_rule})* "]"'
     return list_rule
 
 
 def get_members_structure(cls, rule_name):
     if issubclass(cls, Enum):
         # Handle Enum types
-        members = [f'\"\\\"{member.value}\\\"\"' for name, member in cls.__members__.items()]
+        members = [f'"\\"{member.value}\\""' for name, member in cls.__members__.items()]
         return f"{cls.__name__.lower()} ::= " + " | ".join(members)
     if cls.__annotations__ and cls.__annotations__ != {}:
         result = f'{rule_name} ::= "{{"'
         type_list_rules = []
         # Modify this comprehension
-        members = [f'  \"\\\"{name}\\\"\" ":"  {map_pydantic_type_to_gbnf(param_type)}'
-                   for name, param_type in cls.__annotations__.items()
-                   if name != 'self']
+        members = [
+            f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param_type)}'
+            for name, param_type in cls.__annotations__.items()
+            if name != "self"
+        ]
 
         result += '"," '.join(members)
         result += '  "}"'
         return result, type_list_rules
     elif rule_name == "custom-class-any":
-        result = f'{rule_name} ::= '
-        result += 'value'
+        result = f"{rule_name} ::= "
+        result += "value"
         type_list_rules = []
         return result, type_list_rules
     else:
@@ -124,9 +128,11 @@ def get_members_structure(cls, rule_name):
         result = f'{rule_name} ::=  "{{"'
         type_list_rules = []
         # Modify this comprehension too
-        members = [f'  \"\\\"{name}\\\"\" ":"  {map_pydantic_type_to_gbnf(param.annotation)}'
-                   for name, param in parameters.items()
-                   if name != 'self' and param.annotation != inspect.Parameter.empty]
+        members = [
+            f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param.annotation)}'
+            for name, param in parameters.items()
+            if name != "self" and param.annotation != inspect.Parameter.empty
+        ]
 
         result += '", "'.join(members)
         result += '  "}"'
@@ -141,8 +147,8 @@ def regex_to_gbnf(regex_pattern: str) -> str:
     gbnf_rule = regex_pattern
 
     # Translate common regex components to GBNF
-    gbnf_rule = gbnf_rule.replace('\\d', '[0-9]')
-    gbnf_rule = gbnf_rule.replace('\\s', '[ \t\n]')
+    gbnf_rule = gbnf_rule.replace("\\d", "[0-9]")
+    gbnf_rule = gbnf_rule.replace("\\s", "[ \t\n]")
 
     # Handle quantifiers and other regex syntax that is similar in GBNF
     # (e.g., '*', '+', '?', character classes)
@@ -158,12 +164,12 @@ def generate_gbnf_integer_rules(max_digit=None, min_digit=None):
     Generates GBNF (Generalized Backus-Naur Form) rules for integers based on the given maximum and minimum digits.
 
     Parameters:
-    max_digit (int): The maximum number of digits for the integer. Default is None.
-    min_digit (int): The minimum number of digits for the integer. Default is None.
+        max_digit (int): The maximum number of digits for the integer. Default is None.
+        min_digit (int): The minimum number of digits for the integer. Default is None.
 
     Returns:
-    integer_rule (str): The identifier for the integer rule generated.
-    additional_rules (list): A list of additional rules generated based on the given maximum and minimum digits.
+        integer_rule (str): The identifier for the integer rule generated.
+        additional_rules (list): A list of additional rules generated based on the given maximum and minimum digits.
 
     """
     additional_rules = []
@@ -178,21 +184,21 @@ def generate_gbnf_integer_rules(max_digit=None, min_digit=None):
     # Handling Integer Rules
     if max_digit is not None or min_digit is not None:
         # Start with an empty rule part
-        integer_rule_part = ''
+        integer_rule_part = ""
 
         # Add mandatory digits as per min_digit
         if min_digit is not None:
-            integer_rule_part += '[0-9] ' * min_digit
+            integer_rule_part += "[0-9] " * min_digit
 
         # Add optional digits up to max_digit
         if max_digit is not None:
             optional_digits = max_digit - (min_digit if min_digit is not None else 0)
-            integer_rule_part += ''.join(['[0-9]? ' for _ in range(optional_digits)])
+            integer_rule_part += "".join(["[0-9]? " for _ in range(optional_digits)])
 
         # Trim the rule part and append it to additional rules
         integer_rule_part = integer_rule_part.strip()
         if integer_rule_part:
-            additional_rules.append(f'{integer_rule} ::= {integer_rule_part}')
+            additional_rules.append(f"{integer_rule} ::= {integer_rule_part}")
 
     return integer_rule, additional_rules
 
@@ -224,21 +230,26 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
     additional_rules = []
 
     # Define the integer part rule
-    integer_part_rule = "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + (
+    integer_part_rule = (
+        "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + (
         f"-min{min_digit}" if min_digit is not None else "")
+    )
 
     # Define the fractional part rule based on precision constraints
     fractional_part_rule = "fractional-part"
-    fractional_rule_part = ''
+    fractional_rule_part = ""
     if max_precision is not None or min_precision is not None:
         fractional_part_rule += (f"-max{max_precision}" if max_precision is not None else "") + (
-            f"-min{min_precision}" if min_precision is not None else "")
+            f"-min{min_precision}" if min_precision is not None else ""
+        )
         # Minimum number of digits
-        fractional_rule_part = '[0-9]' * (min_precision if min_precision is not None else 1)
+        fractional_rule_part = "[0-9]" * (min_precision if min_precision is not None else 1)
         # Optional additional digits
-        fractional_rule_part += ''.join([' [0-9]?'] * (
-            (max_precision - (min_precision if min_precision is not None else 1)) if max_precision is not None else 0))
-        additional_rules.append(f'{fractional_part_rule} ::= {fractional_rule_part}')
+        fractional_rule_part += "".join(
+            [" [0-9]?"] * ((max_precision - (
+                min_precision if min_precision is not None else 1)) if max_precision is not None else 0)
+        )
+        additional_rules.append(f"{fractional_part_rule} ::= {fractional_rule_part}")
 
     # Define the float rule
     float_rule = f"float-{max_digit if max_digit is not None else 'X'}-{min_digit if min_digit is not None else 'X'}-{max_precision if max_precision is not None else 'X'}-{min_precision if min_precision is not None else 'X'}"
@@ -246,20 +257,19 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
 
     # Generating the integer part rule definition, if necessary
     if max_digit is not None or min_digit is not None:
-        integer_rule_part = '[0-9]'
+        integer_rule_part = "[0-9]"
         if min_digit is not None and min_digit > 1:
-            integer_rule_part += ' [0-9]' * (min_digit - 1)
+            integer_rule_part += " [0-9]" * (min_digit - 1)
         if max_digit is not None:
-            integer_rule_part += ''.join([' [0-9]?'] * (max_digit - (min_digit if min_digit is not None else 1)))
-        additional_rules.append(f'{integer_part_rule} ::= {integer_rule_part.strip()}')
+            integer_rule_part += "".join([" [0-9]?"] * (max_digit - (min_digit if min_digit is not None else 1)))
+        additional_rules.append(f"{integer_part_rule} ::= {integer_rule_part.strip()}")
 
     return float_rule, additional_rules
 
 
-def generate_gbnf_rule_for_type(model_name, field_name,
-                                field_type, is_optional, processed_models, created_rules,
-                                field_info=None) -> \
-    Tuple[str, list]:
+def generate_gbnf_rule_for_type(
+    model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None
+) -> Tuple[str, list]:
     """
     Generate GBNF rule for a given field type.
 
@@ -282,20 +292,19 @@ def generate_gbnf_rule_for_type(model_name, field_name,
 
     if isclass(field_type) and issubclass(field_type, BaseModel):
         nested_model_name = format_model_and_field_name(field_type.__name__)
-        nested_model_rules = generate_gbnf_grammar(field_type, processed_models, created_rules)
+        nested_model_rules, _ = generate_gbnf_grammar(field_type, processed_models, created_rules)
         rules.extend(nested_model_rules)
         gbnf_type, rules = nested_model_name, rules
     elif isclass(field_type) and issubclass(field_type, Enum):
-        enum_values = [f'\"\\\"{e.value}\\\"\"' for e in field_type]  # Adding escaped quotes
+        enum_values = [f'"\\"{e.value}\\""' for e in field_type]  # Adding escaped quotes
         enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}"
         rules.append(enum_rule)
         gbnf_type, rules = model_name + "-" + field_name, rules
-    elif get_origin(field_type) == list or field_type == list:  # Array
+    elif get_origin(field_type) == list:  # Array
         element_type = get_args(field_type)[0]
-        element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name,
-                                                                          f"{field_name}-element",
-                                                                          element_type, is_optional, processed_models,
-                                                                          created_rules)
+        element_rule_name, additional_rules = generate_gbnf_rule_for_type(
+            model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
+        )
         rules.extend(additional_rules)
         array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})*  "]" """
         rules.append(array_rule)
@@ -303,10 +312,9 @@ def generate_gbnf_rule_for_type(model_name, field_name,
 
     elif get_origin(field_type) == set or field_type == set:  # Array
         element_type = get_args(field_type)[0]
-        element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name,
-                                                                          f"{field_name}-element",
-                                                                          element_type, is_optional, processed_models,
-                                                                          created_rules)
+        element_rule_name, additional_rules = generate_gbnf_rule_for_type(
+            model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
+        )
         rules.extend(additional_rules)
         array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})*  "]" """
         rules.append(array_rule)
@@ -318,15 +326,13 @@ def generate_gbnf_rule_for_type(model_name, field_name,
     elif gbnf_type.startswith("custom-dict-"):
         key_type, value_type = get_args(field_type)
 
-        additional_key_type, additional_key_rules = generate_gbnf_rule_for_type(model_name,
-                                                                                f"{field_name}-key-type",
-                                                                                key_type, is_optional, processed_models,
-                                                                                created_rules)
-        additional_value_type, additional_value_rules = generate_gbnf_rule_for_type(model_name,
-                                                                                    f"{field_name}-value-type",
-                                                                                    value_type, is_optional,
-                                                                                    processed_models, created_rules)
-        gbnf_type = fr'{gbnf_type} ::= "{{"  ( {additional_key_type} ":"  {additional_value_type} (","  {additional_key_type} ":"  {additional_value_type})*  )? "}}" '
+        additional_key_type, additional_key_rules = generate_gbnf_rule_for_type(
+            model_name, f"{field_name}-key-type", key_type, is_optional, processed_models, created_rules
+        )
+        additional_value_type, additional_value_rules = generate_gbnf_rule_for_type(
+            model_name, f"{field_name}-value-type", value_type, is_optional, processed_models, created_rules
+        )
+        gbnf_type = rf'{gbnf_type} ::= "{{"  ( {additional_key_type} ": "  {additional_value_type} ("," "\n" ws {additional_key_type} ":"  {additional_value_type})*  )? "}}" '
 
         rules.extend(additional_key_rules)
         rules.extend(additional_value_rules)
@@ -336,19 +342,16 @@ def generate_gbnf_rule_for_type(model_name, field_name,
 
         for union_type in union_types:
             if isinstance(union_type, _GenericAlias):
-                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name,
-                                                                                field_name, union_type,
-                                                                                False,
-                                                                                processed_models, created_rules)
+                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
+                    model_name, field_name, union_type, False, processed_models, created_rules
+                )
                 union_rules.append(union_gbnf_type)
                 rules.extend(union_rules_list)
 
-
             elif not issubclass(union_type, NoneType):
-                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name,
-                                                                                field_name, union_type,
-                                                                                False,
-                                                                                processed_models, created_rules)
+                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
+                    model_name, field_name, union_type, False, processed_models, created_rules
+                )
                 union_rules.append(union_gbnf_type)
                 rules.extend(union_rules_list)
 
@@ -363,45 +366,58 @@ def generate_gbnf_rule_for_type(model_name, field_name,
         else:
             gbnf_type = f"{model_name}-{field_name}-union"
     elif isclass(field_type) and issubclass(field_type, str):
-        if field_info and hasattr(field_info, 'json_schema_extra') and field_info.json_schema_extra is not None:
-
-            triple_quoted_string = field_info.json_schema_extra.get('triple_quoted_string', False)
-            markdown_string = field_info.json_schema_extra.get('markdown_string', False)
+        if field_info and hasattr(field_info, "json_schema_extra") and field_info.json_schema_extra is not None:
+            triple_quoted_string = field_info.json_schema_extra.get("triple_quoted_string", False)
+            markdown_string = field_info.json_schema_extra.get("markdown_code_block", False)
 
             gbnf_type = PydanticDataType.TRIPLE_QUOTED_STRING.value if triple_quoted_string else PydanticDataType.STRING.value
-            gbnf_type = PydanticDataType.MARKDOWN_STRING.value if markdown_string else gbnf_type
+            gbnf_type = PydanticDataType.MARKDOWN_CODE_BLOCK.value if markdown_string else gbnf_type
 
-        elif field_info and hasattr(field_info, 'pattern'):
+        elif field_info and hasattr(field_info, "pattern"):
             # Convert regex pattern to grammar rule
             regex_pattern = field_info.regex.pattern
             gbnf_type = f"pattern-{field_name} ::= {regex_to_gbnf(regex_pattern)}"
         else:
             gbnf_type = PydanticDataType.STRING.value
 
-    elif isclass(field_type) and issubclass(field_type, float) and field_info and hasattr(field_info,
-                                                                                          'json_schema_extra') and field_info.json_schema_extra is not None:
+    elif (
+        isclass(field_type)
+        and issubclass(field_type, float)
+        and field_info
+        and hasattr(field_info, "json_schema_extra")
+        and field_info.json_schema_extra is not None
+    ):
         # Retrieve precision attributes for floats
-        max_precision = field_info.json_schema_extra.get('max_precision') if field_info and hasattr(field_info,
-                                                                                                    'json_schema_extra') else None
-        min_precision = field_info.json_schema_extra.get('min_precision') if field_info and hasattr(field_info,
-                                                                                                    'json_schema_extra') else None
-        max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info,
-                                                                                             'json_schema_extra') else None
-        min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info,
-                                                                                             'json_schema_extra') else None
+        max_precision = (
+            field_info.json_schema_extra.get("max_precision") if field_info and hasattr(field_info,
+                                                                                        "json_schema_extra") else None
+        )
+        min_precision = (
+            field_info.json_schema_extra.get("min_precision") if field_info and hasattr(field_info,
+                                                                                        "json_schema_extra") else None
+        )
+        max_digits = field_info.json_schema_extra.get("max_digit") if field_info and hasattr(field_info,
+                                                                                             "json_schema_extra") else None
+        min_digits = field_info.json_schema_extra.get("min_digit") if field_info and hasattr(field_info,
+                                                                                             "json_schema_extra") else None
 
         # Generate GBNF rule for float with given attributes
-        gbnf_type, rules = generate_gbnf_float_rules(max_digit=max_digits, min_digit=min_digits,
-                                                     max_precision=max_precision,
-                                                     min_precision=min_precision)
+        gbnf_type, rules = generate_gbnf_float_rules(
+            max_digit=max_digits, min_digit=min_digits, max_precision=max_precision, min_precision=min_precision
+        )
 
-    elif isclass(field_type) and issubclass(field_type, int) and field_info and hasattr(field_info,
-                                                                                        'json_schema_extra') and field_info.json_schema_extra is not None:
+    elif (
+        isclass(field_type)
+        and issubclass(field_type, int)
+        and field_info
+        and hasattr(field_info, "json_schema_extra")
+        and field_info.json_schema_extra is not None
+    ):
         # Retrieve digit attributes for integers
-        max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info,
-                                                                                             'json_schema_extra') else None
-        min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info,
-                                                                                             'json_schema_extra') else None
+        max_digits = field_info.json_schema_extra.get("max_digit") if field_info and hasattr(field_info,
+                                                                                             "json_schema_extra") else None
+        min_digits = field_info.json_schema_extra.get("min_digit") if field_info and hasattr(field_info,
+                                                                                             "json_schema_extra") else None
 
         # Generate GBNF rule for integer with given attributes
         gbnf_type, rules = generate_gbnf_integer_rules(max_digit=max_digits, min_digit=min_digits)
@@ -443,13 +459,13 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
 
     if not issubclass(model, BaseModel):
         # For non-Pydantic classes, generate model_fields from __annotations__ or __init__
-        if hasattr(model, '__annotations__') and model.__annotations__:
+        if hasattr(model, "__annotations__") and model.__annotations__:
             model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()}
         else:
             init_signature = inspect.signature(model.__init__)
             parameters = init_signature.parameters
-            model_fields = {name: (param.annotation, param.default) for name, param in parameters.items()
-                            if name != 'self'}
+            model_fields = {name: (param.annotation, param.default) for name, param in parameters.items() if
+                            name != "self"}
     else:
         # For Pydantic models, use model_fields and check for ellipsis (required fields)
         model_fields = model.__annotations__
@@ -469,51 +485,55 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
             field_type = field_info
             field_info = model.model_fields[field_name]
             is_optional = field_info.is_required is False and get_origin(field_type) is Optional
-        rule_name, additional_rules = generate_gbnf_rule_for_type(model_name,
-                                                                  format_model_and_field_name(field_name),
-                                                                  field_type, is_optional,
-                                                                  processed_models, created_rules, field_info)
-        look_for_markdown_code_block = True if rule_name == "markdown_string" else False
+        rule_name, additional_rules = generate_gbnf_rule_for_type(
+            model_name, format_model_and_field_name(field_name), field_type, is_optional, processed_models,
+            created_rules, field_info
+        )
+        look_for_markdown_code_block = True if rule_name == "markdown_code_block" else False
         look_for_triple_quoted_string = True if rule_name == "triple_quoted_string" else False
         if not look_for_markdown_code_block and not look_for_triple_quoted_string:
             if rule_name not in created_rules:
                 created_rules[rule_name] = additional_rules
-            model_rule_parts.append(f' ws \"\\\"{field_name}\\\"\" ": "  {rule_name}')  # Adding escaped quotes
+            model_rule_parts.append(f' ws "\\"{field_name}\\"" ":" ws {rule_name}')  # Adding escaped quotes
             nested_rules.extend(additional_rules)
         else:
-            has_triple_quoted_string = look_for_markdown_code_block
-            has_markdown_code_block = look_for_triple_quoted_string
+            has_triple_quoted_string = look_for_triple_quoted_string
+            has_markdown_code_block = look_for_markdown_code_block
 
     fields_joined = r' "," "\n" '.join(model_rule_parts)
-    model_rule = fr'{model_name} ::= "{{" "\n" {fields_joined} "\n" ws "}}"'
-
-    if look_for_markdown_code_block or look_for_triple_quoted_string:
-        model_rule += ' ws "}"'
+    model_rule = rf'{model_name} ::= "{{" "\n" {fields_joined} "\n" ws "}}"'
 
+    has_special_string = False
     if has_triple_quoted_string:
+        model_rule += '"\\n" ws "}"'
         model_rule += '"\\n" triple-quoted-string'
+        has_special_string = True
     if has_markdown_code_block:
+        model_rule += '"\\n" ws "}"'
         model_rule += '"\\n" markdown-code-block'
+        has_special_string = True
     all_rules = [model_rule] + nested_rules
 
-    return all_rules, has_markdown_code_block, has_triple_quoted_string
+    return all_rules, has_special_string
 
 
-def generate_gbnf_grammar_from_pydantic_models(models: List[Type[BaseModel]], outer_object_name: str = None,
-                                               outer_object_content: str = None, list_of_outputs: bool = False) -> str:
+def generate_gbnf_grammar_from_pydantic_models(
+    models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None,
+    list_of_outputs: bool = False
+) -> str:
     """
     Generate GBNF Grammar from Pydantic Models.
 
     This method takes a list of Pydantic models and uses them to generate a GBNF grammar string. The generated grammar string can be used for parsing and validating data using the generated
     * grammar.
 
-    Parameters:
-    models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from.
-    outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
-    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
-    list_of_outputs (str, optional): Allows a list of output objects
+    Args:
+        models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from.
+        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
+        outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
+        list_of_outputs (str, optional): Allows a list of output objects
     Returns:
-    str: The generated GBNF grammar string.
+        str: The generated GBNF grammar string.
 
     Examples:
         models = [UserModel, PostModel]
@@ -527,52 +547,53 @@ def generate_gbnf_grammar_from_pydantic_models(models: List[Type[BaseModel]], ou
     all_rules = []
     created_rules = {}
     if outer_object_name is None:
-
         for model in models:
-            model_rules, _, _ = generate_gbnf_grammar(model,
-                                                      processed_models, created_rules)
+            model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules)
             all_rules.extend(model_rules)
 
         if list_of_outputs:
-            root_rule = r'root ::= ws "["  grammar-models (","  grammar-models)*  "]"' + "\n"
+            root_rule = r'root ::= (" "| "\n") "[" ws grammar-models ("," ws grammar-models)* ws "]"' + "\n"
         else:
-            root_rule = r'root ::= ws grammar-models' + "\n"
+            root_rule = r'root ::= (" "| "\n") grammar-models' + "\n"
         root_rule += "grammar-models ::= " + " | ".join(
             [format_model_and_field_name(model.__name__) for model in models])
         all_rules.insert(0, root_rule)
         return "\n".join(all_rules)
     elif outer_object_name is not None:
         if list_of_outputs:
-            root_rule = fr'root ::= ws "["  {format_model_and_field_name(outer_object_name)} (","  {format_model_and_field_name(outer_object_name)})*  "]"' + "\n"
+            root_rule = (
+                rf'root ::= (" "| "\n") "[" ws {format_model_and_field_name(outer_object_name)} ("," ws {format_model_and_field_name(outer_object_name)})* ws "]"'
+                + "\n"
+            )
         else:
             root_rule = f"root ::= {format_model_and_field_name(outer_object_name)}\n"
 
-        model_rule = fr'{format_model_and_field_name(outer_object_name)} ::= ws "{{" ws "\"{outer_object_name}\""  ": "  grammar-models'
+        model_rule = (
+            rf'{format_model_and_field_name(outer_object_name)} ::= (" "| "\n") "{{" ws "\"{outer_object_name}\""  ":" ws grammar-models'
+        )
 
         fields_joined = " | ".join(
-            [fr'{format_model_and_field_name(model.__name__)}-grammar-model' for model in models])
+            [rf"{format_model_and_field_name(model.__name__)}-grammar-model" for model in models])
 
-        grammar_model_rules = f'\ngrammar-models ::= {fields_joined}'
+        grammar_model_rules = f"\ngrammar-models ::= {fields_joined}"
         mod_rules = []
         for model in models:
-            mod_rule = fr'{format_model_and_field_name(model.__name__)}-grammar-model ::= ws'
-            mod_rule += fr'"\"{format_model_and_field_name(model.__name__)}\"" "," ws "\"{outer_object_content}\"" ws ":" ws {format_model_and_field_name(model.__name__)}' + '\n'
+            mod_rule = rf"{format_model_and_field_name(model.__name__)}-grammar-model ::= "
+            mod_rule += (
+                rf'"\"{model.__name__}\"" "," ws "\"{outer_object_content}\"" ":" ws {format_model_and_field_name(model.__name__)}' + "\n"
+            )
             mod_rules.append(mod_rule)
         grammar_model_rules += "\n" + "\n".join(mod_rules)
-        look_for_markdown_code_block = False
-        look_for_triple_quoted_string = False
+
         for model in models:
-            model_rules, markdown_block, triple_quoted_string = generate_gbnf_grammar(model,
-                                                                                      processed_models, created_rules)
+            model_rules, has_special_string = generate_gbnf_grammar(model, processed_models,
+                                                                    created_rules)
+
+            if not has_special_string:
+                model_rules[0] += r'"\n" ws "}"'
+
             all_rules.extend(model_rules)
-            if markdown_block:
-                look_for_markdown_code_block = True
 
-            if triple_quoted_string:
-                look_for_triple_quoted_string = True
-
-        if not look_for_markdown_code_block and not look_for_triple_quoted_string:
-            model_rule += ' ws "}"'
         all_rules.insert(0, root_rule + model_rule + grammar_model_rules)
         return "\n".join(all_rules)
 
@@ -582,10 +603,10 @@ def get_primitive_grammar(grammar):
     Returns the needed GBNF primitive grammar for a given GBNF grammar string.
 
     Args:
-    grammar (str): The string containing the GBNF grammar.
+        grammar (str): The string containing the GBNF grammar.
 
     Returns:
-    str: GBNF primitive grammar string.
+        str: GBNF primitive grammar string.
     """
     type_list = []
     if "string-list" in grammar:
@@ -611,7 +632,7 @@ integer ::= [0-9]+"""
 
     any_block = ""
     if "custom-class-any" in grammar:
-        any_block = '''
+        any_block = """
 value ::= object | array | string | number | boolean | null
 
 object ::=
@@ -626,7 +647,7 @@ array  ::=
     ("," ws value)*
   )? "]" ws
 
-number ::= integer | float'''
+number ::= integer | float"""
 
     markdown_code_block_grammar = ""
     if "markdown-code-block" in grammar:
@@ -641,90 +662,32 @@ closing-triple-ticks ::= "```" "\n"'''
 triple-quoted-string ::= triple-quotes triple-quoted-string-content triple-quotes
 triple-quoted-string-content ::= ( [^'] | "'" [^'] |  "'"  "'" [^']  )*
 triple-quotes ::= "'''" """
-    return "\n" + '\n'.join(additional_grammar) + any_block + primitive_grammar + markdown_code_block_grammar
+    return "\n" + "\n".join(additional_grammar) + any_block + primitive_grammar + markdown_code_block_grammar
 
 
-def generate_field_markdown(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1) -> str:
-    indent = '  ' * depth
-    field_markdown = f"{indent}- **{field_name}** (`{field_type.__name__}`): "
-
-    # Extracting field description from Pydantic Field using __model_fields__
-    field_info = model.model_fields.get(field_name)
-    field_description = field_info.description if field_info and field_info.description else "No description available."
-
-    field_markdown += field_description + '\n'
-
-    # Handling nested BaseModel fields
-    if isclass(field_type) and issubclass(field_type, BaseModel):
-        field_markdown += f"{indent}  - Details:\n"
-        for name, type_ in field_type.__annotations__.items():
-            field_markdown += generate_field_markdown(name, type_, field_type, depth + 2)
-
-    return field_markdown
-
-
-def generate_markdown_report(pydantic_models: List[Type[BaseModel]]) -> str:
-    markdown = ""
-    for model in pydantic_models:
-        markdown += f"### {format_model_and_field_name(model.__name__)}\n"
-
-        # Check if the model's docstring is different from BaseModel's docstring
-        class_doc = getdoc(model)
-        base_class_doc = getdoc(BaseModel)
-        class_description = class_doc if class_doc and class_doc != base_class_doc else "No specific description available."
-
-        markdown += f"{class_description}\n\n"
-        markdown += "#### Fields\n"
-
-        if isclass(model) and issubclass(model, BaseModel):
-            for name, field_type in model.__annotations__.items():
-                markdown += generate_field_markdown(format_model_and_field_name(name), field_type, model)
-        markdown += "\n"
-
-    return markdown
-
-
-def format_json_example(example: dict, depth: int) -> str:
+def generate_markdown_documentation(
+    pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
+    documentation_with_field_description=True
+) -> str:
     """
-    Format a JSON example into a readable string with indentation.
+    Generate markdown documentation for a list of Pydantic models.
 
     Args:
-    example (dict): JSON example to be formatted.
-    depth (int): Indentation depth.
+        pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
+        model_prefix (str): Prefix for the model section.
+        fields_prefix (str): Prefix for the fields section.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
 
     Returns:
-    str: Formatted JSON example string.
-    """
-    indent = '    ' * depth
-    formatted_example = '{\n'
-    for key, value in example.items():
-        value_text = f"'{value}'" if isinstance(value, str) else value
-        formatted_example += f"{indent}{key}: {value_text},\n"
-    formatted_example = formatted_example.rstrip(',\n') + '\n' + indent + '}'
-    return formatted_example
-
-
-def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_prefix="Model",
-                                fields_prefix="Fields", documentation_with_field_description=True) -> str:
-    """
-    Generate text documentation for a list of Pydantic models.
-
-    Args:
-    pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
-    model_prefix (str): Prefix for the model section.
-    fields_prefix (str): Prefix for the fields section.
-    documentation_with_field_description (bool): Include field descriptions in the documentation.
-
-    Returns:
-    str: Generated text documentation.
+        str: Generated text documentation.
     """
     documentation = ""
     pyd_models = [(model, True) for model in pydantic_models]
     for model, add_prefix in pyd_models:
         if add_prefix:
-            documentation += f"{model_prefix}: {format_model_and_field_name(model.__name__)}\n"
+            documentation += f"{model_prefix}: {model.__name__}\n"
         else:
-            documentation += f"Model: {format_model_and_field_name(model.__name__)}\n"
+            documentation += f"Model: {model.__name__}\n"
 
         # Handling multi-line model description with proper indentation
 
@@ -733,7 +696,7 @@ def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_pr
         class_description = class_doc if class_doc and class_doc != base_class_doc else ""
         if class_description != "":
             documentation += "  Description: "
-            documentation += "\n" + format_multiline_description(class_description, 2) + "\n"
+            documentation += format_multiline_description(class_description, 0) + "\n"
 
         if add_prefix:
             # Indenting the fields section
@@ -753,35 +716,192 @@ def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_pr
                     for element_type in element_types:
                         if isclass(element_type) and issubclass(element_type, BaseModel):
                             pyd_models.append((element_type, False))
-                documentation += generate_field_text(name, field_type, model,
-                                                     documentation_with_field_description=documentation_with_field_description)
+                documentation += generate_field_markdown(
+                    name, field_type, model, documentation_with_field_description=documentation_with_field_description
+                )
             documentation += "\n"
 
-        if hasattr(model, 'Config') and hasattr(model.Config,
-                                                'json_schema_extra') and 'example' in model.Config.json_schema_extra:
+        if hasattr(model, "Config") and hasattr(model.Config,
+                                                "json_schema_extra") and "example" in model.Config.json_schema_extra:
             documentation += f"  Expected Example Output for {format_model_and_field_name(model.__name__)}:\n"
-            json_example = json.dumps(model.Config.json_schema_extra['example'])
+            json_example = json.dumps(model.Config.json_schema_extra["example"])
             documentation += format_multiline_description(json_example, 2) + "\n"
 
     return documentation
 
 
-def generate_field_text(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
-                        documentation_with_field_description=True) -> str:
+def generate_field_markdown(
+    field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
+    documentation_with_field_description=True
+) -> str:
+    """
+    Generate markdown documentation for a Pydantic model field.
+
+    Args:
+        field_name (str): Name of the field.
+        field_type (Type[Any]): Type of the field.
+        model (Type[BaseModel]): Pydantic model class.
+        depth (int): Indentation depth in the documentation.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
+
+    Returns:
+        str: Generated text documentation for the field.
+    """
+    indent = "    " * depth
+
+    field_info = model.model_fields.get(field_name)
+    field_description = field_info.description if field_info and field_info.description else ""
+
+    if get_origin(field_type) == list:
+        element_type = get_args(field_type)[0]
+        field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})"
+        if field_description != "":
+            field_text += ":\n"
+        else:
+            field_text += "\n"
+    elif get_origin(field_type) == Union:
+        element_types = get_args(field_type)
+        types = []
+        for element_type in element_types:
+            types.append(format_model_and_field_name(element_type.__name__))
+        field_text = f"{indent}{field_name} ({' or '.join(types)})"
+        if field_description != "":
+            field_text += ":\n"
+        else:
+            field_text += "\n"
+    else:
+        field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)})"
+        if field_description != "":
+            field_text += ":\n"
+        else:
+            field_text += "\n"
+
+    if not documentation_with_field_description:
+        return field_text
+
+    if field_description != "":
+        field_text += f"        Description: " + field_description + "\n"
+
+    # Check for and include field-specific examples if available
+    if hasattr(model, "Config") and hasattr(model.Config,
+                                            "json_schema_extra") and "example" in model.Config.json_schema_extra:
+        field_example = model.Config.json_schema_extra["example"].get(field_name)
+        if field_example is not None:
+            example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example
+            field_text += f"{indent}  Example: {example_text}\n"
+
+    if isclass(field_type) and issubclass(field_type, BaseModel):
+        field_text += f"{indent}  Details:\n"
+        for name, type_ in field_type.__annotations__.items():
+            field_text += generate_field_markdown(name, type_, field_type, depth + 2)
+
+    return field_text
+
+
+def format_json_example(example: dict, depth: int) -> str:
+    """
+    Format a JSON example into a readable string with indentation.
+
+    Args:
+        example (dict): JSON example to be formatted.
+        depth (int): Indentation depth.
+
+    Returns:
+        str: Formatted JSON example string.
+    """
+    indent = "    " * depth
+    formatted_example = "{\n"
+    for key, value in example.items():
+        value_text = f"'{value}'" if isinstance(value, str) else value
+        formatted_example += f"{indent}{key}: {value_text},\n"
+    formatted_example = formatted_example.rstrip(",\n") + "\n" + indent + "}"
+    return formatted_example
+
+
+def generate_text_documentation(
+    pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
+    documentation_with_field_description=True
+) -> str:
+    """
+    Generate text documentation for a list of Pydantic models.
+
+    Args:
+        pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
+        model_prefix (str): Prefix for the model section.
+        fields_prefix (str): Prefix for the fields section.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
+
+    Returns:
+        str: Generated text documentation.
+    """
+    documentation = ""
+    pyd_models = [(model, True) for model in pydantic_models]
+    for model, add_prefix in pyd_models:
+        if add_prefix:
+            documentation += f"{model_prefix}: {model.__name__}\n"
+        else:
+            documentation += f"Model: {model.__name__}\n"
+
+        # Handling multi-line model description with proper indentation
+
+        class_doc = getdoc(model)
+        base_class_doc = getdoc(BaseModel)
+        class_description = class_doc if class_doc and class_doc != base_class_doc else ""
+        if class_description != "":
+            documentation += "  Description: "
+            documentation += "\n" + format_multiline_description(class_description, 2) + "\n"
+
+        if isclass(model) and issubclass(model, BaseModel):
+            documentation_fields = ""
+            for name, field_type in model.__annotations__.items():
+                # if name == "markdown_code_block":
+                #    continue
+                if get_origin(field_type) == list:
+                    element_type = get_args(field_type)[0]
+                    if isclass(element_type) and issubclass(element_type, BaseModel):
+                        pyd_models.append((element_type, False))
+                if get_origin(field_type) == Union:
+                    element_types = get_args(field_type)
+                    for element_type in element_types:
+                        if isclass(element_type) and issubclass(element_type, BaseModel):
+                            pyd_models.append((element_type, False))
+                documentation_fields += generate_field_text(
+                    name, field_type, model, documentation_with_field_description=documentation_with_field_description
+                )
+            if documentation_fields != "":
+                if add_prefix:
+                    documentation += f"  {fields_prefix}:\n{documentation_fields}"
+                else:
+                    documentation += f"  Fields:\n{documentation_fields}"
+            documentation += "\n"
+
+        if hasattr(model, "Config") and hasattr(model.Config,
+                                                "json_schema_extra") and "example" in model.Config.json_schema_extra:
+            documentation += f"  Expected Example Output for {format_model_and_field_name(model.__name__)}:\n"
+            json_example = json.dumps(model.Config.json_schema_extra["example"])
+            documentation += format_multiline_description(json_example, 2) + "\n"
+
+    return documentation
+
+
+def generate_field_text(
+    field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
+    documentation_with_field_description=True
+) -> str:
     """
     Generate text documentation for a Pydantic model field.
 
     Args:
-    field_name (str): Name of the field.
-    field_type (Type[Any]): Type of the field.
-    model (Type[BaseModel]): Pydantic model class.
-    depth (int): Indentation depth in the documentation.
-    documentation_with_field_description (bool): Include field descriptions in the documentation.
+        field_name (str): Name of the field.
+        field_type (Type[Any]): Type of the field.
+        model (Type[BaseModel]): Pydantic model class.
+        depth (int): Indentation depth in the documentation.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
 
     Returns:
-    str: Generated text documentation for the field.
+        str: Generated text documentation for the field.
     """
-    indent = '    ' * depth
+    indent = "    " * depth
 
     field_info = model.model_fields.get(field_name)
     field_description = field_info.description if field_info and field_info.description else ""
@@ -817,9 +937,9 @@ def generate_field_text(field_name: str, field_type: Type[Any], model: Type[Base
         field_text += f"{indent}  Description: " + field_description + "\n"
 
     # Check for and include field-specific examples if available
-    if hasattr(model, 'Config') and hasattr(model.Config,
-                                            'json_schema_extra') and 'example' in model.Config.json_schema_extra:
-        field_example = model.Config.json_schema_extra['example'].get(field_name)
+    if hasattr(model, "Config") and hasattr(model.Config,
+                                            "json_schema_extra") and "example" in model.Config.json_schema_extra:
+        field_example = model.Config.json_schema_extra["example"].get(field_name)
         if field_example is not None:
             example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example
             field_text += f"{indent}  Example: {example_text}\n"
@@ -837,39 +957,40 @@ def format_multiline_description(description: str, indent_level: int) -> str:
     Format a multiline description with proper indentation.
 
     Args:
-    description (str): Multiline description.
-    indent_level (int): Indentation level.
+        description (str): Multiline description.
+        indent_level (int): Indentation level.
 
     Returns:
-    str: Formatted multiline description.
+        str: Formatted multiline description.
     """
-    indent = '    ' * indent_level
-    return indent + description.replace('\n', '\n' + indent)
+    indent = "    " * indent_level
+    return indent + description.replace("\n", "\n" + indent)
 
 
-def save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path="./grammar.gbnf",
-                                        documentation_file_path="./grammar_documentation.md"):
+def save_gbnf_grammar_and_documentation(
+    grammar, documentation, grammar_file_path="./grammar.gbnf", documentation_file_path="./grammar_documentation.md"
+):
     """
     Save GBNF grammar and documentation to specified files.
 
     Args:
-    grammar (str): GBNF grammar string.
-    documentation (str): Documentation string.
-    grammar_file_path (str): File path to save the GBNF grammar.
-    documentation_file_path (str): File path to save the documentation.
+        grammar (str): GBNF grammar string.
+        documentation (str): Documentation string.
+        grammar_file_path (str): File path to save the GBNF grammar.
+        documentation_file_path (str): File path to save the documentation.
 
     Returns:
-    None
+        None
     """
     try:
-        with open(grammar_file_path, 'w') as file:
+        with open(grammar_file_path, "w") as file:
             file.write(grammar + get_primitive_grammar(grammar))
         print(f"Grammar successfully saved to {grammar_file_path}")
     except IOError as e:
         print(f"An error occurred while saving the grammar file: {e}")
 
     try:
-        with open(documentation_file_path, 'w') as file:
+        with open(documentation_file_path, "w") as file:
             file.write(documentation)
         print(f"Documentation successfully saved to {documentation_file_path}")
     except IOError as e:
@@ -881,10 +1002,10 @@ def remove_empty_lines(string):
     Remove empty lines from a string.
 
     Args:
-    string (str): Input string.
+        string (str): Input string.
 
     Returns:
-    str: String with empty lines removed.
+        str: String with empty lines removed.
     """
     lines = string.splitlines()
     non_empty_lines = [line for line in lines if line.strip() != ""]
@@ -892,95 +1013,109 @@ def remove_empty_lines(string):
     return string_no_empty_lines
 
 
-def generate_and_save_gbnf_grammar_and_documentation(pydantic_model_list,
-                                                     grammar_file_path="./generated_grammar.gbnf",
-                                                     documentation_file_path="./generated_grammar_documentation.md",
-                                                     outer_object_name: str = None,
-                                                     outer_object_content: str = None,
-                                                     model_prefix: str = "Output Model",
-                                                     fields_prefix: str = "Output Fields",
-                                                     list_of_outputs: bool = False,
-                                                     documentation_with_field_description=True):
+def generate_and_save_gbnf_grammar_and_documentation(
+    pydantic_model_list,
+    grammar_file_path="./generated_grammar.gbnf",
+    documentation_file_path="./generated_grammar_documentation.md",
+    outer_object_name: str = None,
+    outer_object_content: str = None,
+    model_prefix: str = "Output Model",
+    fields_prefix: str = "Output Fields",
+    list_of_outputs: bool = False,
+    documentation_with_field_description=True,
+):
     """
     Generate GBNF grammar and documentation, and save them to specified files.
 
     Args:
-    pydantic_model_list: List of Pydantic model classes.
-    grammar_file_path (str): File path to save the generated GBNF grammar.
-    documentation_file_path (str): File path to save the generated documentation.
-    outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
-    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
-    model_prefix (str): Prefix for the model section in the documentation.
-    fields_prefix (str): Prefix for the fields section in the documentation.
-    list_of_outputs (bool): Whether the output is a list of items.
-    documentation_with_field_description (bool): Include field descriptions in the documentation.
+        pydantic_model_list: List of Pydantic model classes.
+        grammar_file_path (str): File path to save the generated GBNF grammar.
+        documentation_file_path (str): File path to save the generated documentation.
+        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
+        outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
+        model_prefix (str): Prefix for the model section in the documentation.
+        fields_prefix (str): Prefix for the fields section in the documentation.
+        list_of_outputs (bool): Whether the output is a list of items.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
 
     Returns:
-    None
+        None
     """
-    documentation = generate_text_documentation(pydantic_model_list, model_prefix, fields_prefix,
-                                                documentation_with_field_description=documentation_with_field_description)
-    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name,
-                                                         outer_object_content, list_of_outputs)
+    documentation = generate_markdown_documentation(
+        pydantic_model_list, model_prefix, fields_prefix,
+        documentation_with_field_description=documentation_with_field_description
+    )
+    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, outer_object_content,
+                                                         list_of_outputs)
     grammar = remove_empty_lines(grammar)
     save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path, documentation_file_path)
 
 
-def generate_gbnf_grammar_and_documentation(pydantic_model_list, outer_object_name: str = None,
-                                            outer_object_content: str = None,
-                                            model_prefix: str = "Output Model",
-                                            fields_prefix: str = "Output Fields", list_of_outputs: bool = False,
-                                            documentation_with_field_description=True):
+def generate_gbnf_grammar_and_documentation(
+    pydantic_model_list,
+    outer_object_name: str = None,
+    outer_object_content: str = None,
+    model_prefix: str = "Output Model",
+    fields_prefix: str = "Output Fields",
+    list_of_outputs: bool = False,
+    documentation_with_field_description=True,
+):
     """
     Generate GBNF grammar and documentation for a list of Pydantic models.
 
     Args:
-    pydantic_model_list: List of Pydantic model classes.
-    outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
-    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
-    model_prefix (str): Prefix for the model section in the documentation.
-    fields_prefix (str): Prefix for the fields section in the documentation.
-    list_of_outputs (bool): Whether the output is a list of items.
-    documentation_with_field_description (bool): Include field descriptions in the documentation.
+        pydantic_model_list: List of Pydantic model classes.
+        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
+        outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
+        model_prefix (str): Prefix for the model section in the documentation.
+        fields_prefix (str): Prefix for the fields section in the documentation.
+        list_of_outputs (bool): Whether the output is a list of items.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
 
     Returns:
-    tuple: GBNF grammar string, documentation string.
+        tuple: GBNF grammar string, documentation string.
     """
-    documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix,
-                                                documentation_with_field_description=documentation_with_field_description)
-    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name,
-                                                         outer_object_content, list_of_outputs)
+    documentation = generate_markdown_documentation(
+        copy(pydantic_model_list), model_prefix, fields_prefix,
+        documentation_with_field_description=documentation_with_field_description
+    )
+    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, outer_object_content,
+                                                         list_of_outputs)
     grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar))
     return grammar, documentation
 
 
-def generate_gbnf_grammar_and_documentation_from_dictionaries(dictionaries: List[dict],
-                                                              outer_object_name: str = None,
-                                                              outer_object_content: str = None,
-                                                              model_prefix: str = "Output Model",
-                                                              fields_prefix: str = "Output Fields",
-                                                              list_of_outputs: bool = False,
-                                                              documentation_with_field_description=True):
+def generate_gbnf_grammar_and_documentation_from_dictionaries(
+    dictionaries: List[dict],
+    outer_object_name: str = None,
+    outer_object_content: str = None,
+    model_prefix: str = "Output Model",
+    fields_prefix: str = "Output Fields",
+    list_of_outputs: bool = False,
+    documentation_with_field_description=True,
+):
     """
     Generate GBNF grammar and documentation from a list of dictionaries.
 
     Args:
-    dictionaries (List[dict]): List of dictionaries representing Pydantic models.
-    outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
-    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
-    model_prefix (str): Prefix for the model section in the documentation.
-    fields_prefix (str): Prefix for the fields section in the documentation.
-    list_of_outputs (bool): Whether the output is a list of items.
-    documentation_with_field_description (bool): Include field descriptions in the documentation.
+        dictionaries (List[dict]): List of dictionaries representing Pydantic models.
+        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
+        outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
+        model_prefix (str): Prefix for the model section in the documentation.
+        fields_prefix (str): Prefix for the fields section in the documentation.
+        list_of_outputs (bool): Whether the output is a list of items.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
 
     Returns:
-    tuple: GBNF grammar string, documentation string.
+        tuple: GBNF grammar string, documentation string.
     """
     pydantic_model_list = create_dynamic_models_from_dictionaries(dictionaries)
-    documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix,
-                                                documentation_with_field_description=documentation_with_field_description)
-    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name,
-                                                         outer_object_content, list_of_outputs)
+    documentation = generate_markdown_documentation(
+        copy(pydantic_model_list), model_prefix, fields_prefix,
+        documentation_with_field_description=documentation_with_field_description
+    )
+    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, outer_object_content,
+                                                         list_of_outputs)
     grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar))
     return grammar, documentation
 
@@ -990,41 +1125,61 @@ def create_dynamic_model_from_function(func: Callable):
     Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method.
 
     Args:
-    func (Callable): A function with type hints from which to create the model.
+        func (Callable): A function with type hints from which to create the model.
 
     Returns:
-    A dynamic Pydantic model class with the provided function as a 'run' method.
+        A dynamic Pydantic model class with the provided function as a 'run' method.
     """
-    # Extracting type hints from the provided function
-    type_hints = get_type_hints(func)
-    type_hints.pop('return', None)
 
-    # Handling default values and annotations
+    # Get the signature of the function
+    sig = inspect.signature(func)
+
+    # Parse the docstring
+    docstring = parse(func.__doc__)
+
     dynamic_fields = {}
-    defaults = getattr(func, '__defaults__', ()) or ()
-    defaults_index = len(type_hints) - len(defaults)
+    param_docs = []
+    for param in sig.parameters.values():
+        # Exclude 'self' parameter
+        if param.name == "self":
+            continue
 
-    for index, (name, typ) in enumerate(type_hints.items()):
-        if index >= defaults_index:
-            default_value = defaults[index - defaults_index]
-            dynamic_fields[name] = (typ, default_value)
+        # Assert that the parameter has a type annotation
+        if param.annotation == inspect.Parameter.empty:
+            raise TypeError(f"Parameter '{param.name}' in function '{func.__name__}' lacks a type annotation")
+
+        # Find the parameter's description in the docstring
+        param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
+
+        # Assert that the parameter has a description
+        if not param_doc or not param_doc.description:
+            raise ValueError(
+                f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
+
+        # Add parameter details to the schema
+        param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
+        param_docs.append((param.name, param_doc))
+        if param.default == inspect.Parameter.empty:
+            default_value = ...
         else:
-            dynamic_fields[name] = (typ, ...)
-
+            default_value = param.default
+        dynamic_fields[param.name] = (
+            param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
     # Creating the dynamic model
-    dynamicModel = create_model(f'{func.__name__}', **dynamic_fields)
+    dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
 
-    dynamicModel.__doc__ = getdoc(func)
+    for param_doc in param_docs:
+        dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description
+
+    dynamic_model.__doc__ = docstring.short_description
 
-    # Wrapping the original function to handle instance 'self'
     def run_method_wrapper(self):
-        func_args = {name: getattr(self, name) for name in type_hints}
+        func_args = {name: getattr(self, name) for name, _ in dynamic_fields.items()}
         return func(**func_args)
 
     # Adding the wrapped function as a 'run' method
-    setattr(dynamicModel, 'run', run_method_wrapper)
-
-    return dynamicModel
+    setattr(dynamic_model, "run", run_method_wrapper)
+    return dynamic_model
 
 
 def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
@@ -1032,11 +1187,11 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
     Add a 'run' method to a dynamic Pydantic model, using the provided function.
 
     Args:
-    - model (Type[BaseModel]): Dynamic Pydantic model class.
-    - func (Callable): Function to be added as a 'run' method to the model.
+        model (Type[BaseModel]): Dynamic Pydantic model class.
+        func (Callable): Function to be added as a 'run' method to the model.
 
     Returns:
-    - Type[BaseModel]: Pydantic model class with the added 'run' method.
+        Type[BaseModel]: Pydantic model class with the added 'run' method.
     """
 
     def run_method_wrapper(self):
@@ -1044,7 +1199,7 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
         return func(**func_args)
 
     # Adding the wrapped function as a 'run' method
-    setattr(model, 'run', run_method_wrapper)
+    setattr(model, "run", run_method_wrapper)
 
     return model
 
@@ -1054,15 +1209,15 @@ def create_dynamic_models_from_dictionaries(dictionaries: List[dict]):
     Create a list of dynamic Pydantic model classes from a list of dictionaries.
 
     Args:
-    - dictionaries (List[dict]): List of dictionaries representing model structures.
+        dictionaries (List[dict]): List of dictionaries representing model structures.
 
     Returns:
-    - List[Type[BaseModel]]: List of generated dynamic Pydantic model classes.
+        List[Type[BaseModel]]: List of generated dynamic Pydantic model classes.
     """
     dynamic_models = []
     for func in dictionaries:
         model_name = format_model_and_field_name(func.get("name", ""))
-        dyn_model = convert_dictionary_to_to_pydantic_model(func, model_name)
+        dyn_model = convert_dictionary_to_pydantic_model(func, model_name)
         dynamic_models.append(dyn_model)
     return dynamic_models
 
@@ -1080,12 +1235,12 @@ from enum import Enum
 
 def json_schema_to_python_types(schema):
     type_map = {
-        'any': Any,
-        'string': str,
-        'number': float,
-        'integer': int,
-        'boolean': bool,
-        'array': list,
+        "any": Any,
+        "string": str,
+        "number": float,
+        "integer": int,
+        "boolean": bool,
+        "array": list,
     }
     return type_map[schema]
 
@@ -1094,58 +1249,64 @@ def list_to_enum(enum_name, values):
     return Enum(enum_name, {value: value for value in values})
 
 
-def convert_dictionary_to_to_pydantic_model(dictionary: dict, model_name: str = 'CustomModel') -> Type[BaseModel]:
+def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]:
     """
     Convert a dictionary to a Pydantic model class.
 
     Args:
-    - dictionary (dict): Dictionary representing the model structure.
-    - model_name (str): Name of the generated Pydantic model.
+        dictionary (dict): Dictionary representing the model structure.
+        model_name (str): Name of the generated Pydantic model.
 
     Returns:
-    - Type[BaseModel]: Generated Pydantic model class.
+        Type[BaseModel]: Generated Pydantic model class.
     """
     fields = {}
 
     if "properties" in dictionary:
         for field_name, field_data in dictionary.get("properties", {}).items():
-            if field_data == 'object':
-                submodel = convert_dictionary_to_to_pydantic_model(dictionary, f'{model_name}_{field_name}')
+            if field_data == "object":
+                submodel = convert_dictionary_to_pydantic_model(dictionary, f"{model_name}_{field_name}")
                 fields[field_name] = (submodel, ...)
             else:
-                field_type = field_data.get('type', 'str')
+                field_type = field_data.get("type", "str")
 
                 if field_data.get("enum", []):
                     fields[field_name] = (list_to_enum(field_name, field_data.get("enum", [])), ...)
-                if field_type == "array":
+                elif field_type == "array":
                     items = field_data.get("items", {})
                     if items != {}:
                         array = {"properties": items}
-                        array_type = convert_dictionary_to_to_pydantic_model(array, f'{model_name}_{field_name}_items')
+                        array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
                         fields[field_name] = (List[array_type], ...)
                     else:
                         fields[field_name] = (list, ...)
-                elif field_type == 'object':
-                    submodel = convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}_{field_name}')
+                elif field_type == "object":
+                    submodel = convert_dictionary_to_pydantic_model(field_data, f"{model_name}_{field_name}")
                     fields[field_name] = (submodel, ...)
+                elif field_type == "required":
+                    required = field_data.get("enum", [])
+                    for key, field in fields.items():
+                        if key not in required:
+                            fields[key] = (Optional[fields[key][0]], ...)
                 else:
                     field_type = json_schema_to_python_types(field_type)
                     fields[field_name] = (field_type, ...)
     if "function" in dictionary:
-
         for field_name, field_data in dictionary.get("function", {}).items():
             if field_name == "name":
                 model_name = field_data
             elif field_name == "description":
                 fields["__doc__"] = field_data
             elif field_name == "parameters":
-                return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}')
+                return convert_dictionary_to_pydantic_model(field_data, f"{model_name}")
+
     if "parameters" in dictionary:
         field_data = {"function": dictionary}
-        return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}')
-
+        return convert_dictionary_to_pydantic_model(field_data, f"{model_name}")
+    if "required" in dictionary:
+        required = dictionary.get("required", [])
+        for key, field in fields.items():
+            if key not in required:
+                fields[key] = (Optional[fields[key][0]], ...)
     custom_model = create_model(model_name, **fields)
     return custom_model
-
-
-

From 7c8d3abd1a17c28fc56b1a4814bc4b29f91d7454 Mon Sep 17 00:00:00 2001
From: Alex Azarov <alex@azarov.by>
Date: Tue, 16 Jan 2024 14:33:02 +0100
Subject: [PATCH 041/138] metal : log `recommendedMaxWorkingSetSize` on iOS 16+
 (#4936)

* metal: Log `recommendedMaxWorkingSetSize` on iOS 16+

* Only log on iOS and macOS, ignoring tvOS and other platforms

* Check for Xcode version before using recommendedMaxWorkingSetSize

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ggml-metal.m | 58 ++++++++++++++++++++++++----------------------------
 1 file changed, 27 insertions(+), 31 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 867f2fd48..44134d1d9 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -369,8 +369,12 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     GGML_METAL_LOG_INFO("%s: simdgroup reduction support   = %s\n",       __func__, ctx->support_simdgroup_reduction ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n",       __func__, ctx->support_simdgroup_mm ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-#if TARGET_OS_OSX
-    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
+
+#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
+    if (@available(macOS 10.12, iOS 16.0, *)) {
+        GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
+    }
+#elif TARGET_OS_OSX
     if (ctx->device.maxTransferRate != 0) {
         GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
     } else {
@@ -2369,6 +2373,25 @@ GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backe
     UNUSED(buft);
 }
 
+static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device) {
+#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
+    if (@available(macOS 10.12, iOS 16.0, *)) {
+        GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+                device.currentAllocatedSize / 1024.0 / 1024.0,
+                device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+        if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+            GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+        } else {
+            GGML_METAL_LOG_INFO("\n");
+        }
+    } else {
+        GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+    }
+#endif
+    UNUSED(device);
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
@@ -2401,22 +2424,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
     }
 
     GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
-
-
-#if TARGET_OS_OSX
-    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-            device.currentAllocatedSize / 1024.0 / 1024.0,
-            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-
-    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
-        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
-    } else {
-        GGML_METAL_LOG_INFO("\n");
-    }
-#else
-    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
-#endif
-
+    ggml_backend_metal_log_allocated_size(device);
 
     return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }
@@ -2524,19 +2532,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
         }
     }
 
-#if TARGET_OS_OSX
-    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-            device.currentAllocatedSize / 1024.0 / 1024.0,
-            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-
-    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
-        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
-    } else {
-        GGML_METAL_LOG_INFO("\n");
-    }
-#else
-    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
-#endif
+    ggml_backend_metal_log_allocated_size(device);
 
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
 }

From 3a48d558a69c88ac17efcaa5900cd9eb19596ac4 Mon Sep 17 00:00:00 2001
From: Alex Azarov <alex@azarov.by>
Date: Tue, 16 Jan 2024 14:41:27 +0100
Subject: [PATCH 042/138] metal : replace loop of dispatch_async with
 dispatch_apply (#4934)

* Replace loop of dispatch_async with dispatch_apply

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ggml-metal.m | 2882 +++++++++++++++++++++++++-------------------------
 1 file changed, 1439 insertions(+), 1443 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 44134d1d9..c21dc465a 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -737,1475 +737,249 @@ static bool ggml_metal_graph_compute(
         ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
     }
 
-    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
+    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
+    dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
+        const int cb_idx = iter;
 
-        dispatch_async(ctx->d_queue, ^{
-            size_t offs_src0 = 0;
-            size_t offs_src1 = 0;
-            size_t offs_dst  = 0;
+        size_t offs_src0 = 0;
+        size_t offs_src1 = 0;
+        size_t offs_dst  = 0;
 
-            id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
-            id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
+        id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
+        id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
 
-            const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
-            const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
+        const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
+        const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
 
-            for (int ind = node_start; ind < node_end; ++ind) {
-                const int i = ind;
+        for (int ind = node_start; ind < node_end; ++ind) {
+            const int i = ind;
 
-                if (i == -1) {
-                    [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
-                    continue;
-                }
+            if (i == -1) {
+                [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
+                continue;
+            }
 
-                //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+            //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
 
-                struct ggml_tensor * src0 = gf->nodes[i]->src[0];
-                struct ggml_tensor * src1 = gf->nodes[i]->src[1];
-                struct ggml_tensor * dst  = gf->nodes[i];
+            struct ggml_tensor * src0 = gf->nodes[i]->src[0];
+            struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+            struct ggml_tensor * dst  = gf->nodes[i];
 
-                switch (dst->op) {
-                    case GGML_OP_NONE:
-                    case GGML_OP_RESHAPE:
-                    case GGML_OP_VIEW:
-                    case GGML_OP_TRANSPOSE:
-                    case GGML_OP_PERMUTE:
-                        {
-                            // noop -> next node
-                        } continue;
-                    default:
-                        {
-                        } break;
-                }
+            switch (dst->op) {
+                case GGML_OP_NONE:
+                case GGML_OP_RESHAPE:
+                case GGML_OP_VIEW:
+                case GGML_OP_TRANSPOSE:
+                case GGML_OP_PERMUTE:
+                    {
+                        // noop -> next node
+                    } continue;
+                default:
+                    {
+                    } break;
+            }
 
-                if (!ggml_metal_supports_op(ctx, dst)) {
-                    GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
-                    GGML_ASSERT(!"unsupported op");
-                }
+            if (!ggml_metal_supports_op(ctx, dst)) {
+                GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
+                GGML_ASSERT(!"unsupported op");
+            }
 
 #ifndef GGML_METAL_NDEBUG
-                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
+            [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
 #endif
 
-                const int64_t  ne00 = src0 ? src0->ne[0] : 0;
-                const int64_t  ne01 = src0 ? src0->ne[1] : 0;
-                const int64_t  ne02 = src0 ? src0->ne[2] : 0;
-                const int64_t  ne03 = src0 ? src0->ne[3] : 0;
+            const int64_t  ne00 = src0 ? src0->ne[0] : 0;
+            const int64_t  ne01 = src0 ? src0->ne[1] : 0;
+            const int64_t  ne02 = src0 ? src0->ne[2] : 0;
+            const int64_t  ne03 = src0 ? src0->ne[3] : 0;
 
-                const uint64_t nb00 = src0 ? src0->nb[0] : 0;
-                const uint64_t nb01 = src0 ? src0->nb[1] : 0;
-                const uint64_t nb02 = src0 ? src0->nb[2] : 0;
-                const uint64_t nb03 = src0 ? src0->nb[3] : 0;
+            const uint64_t nb00 = src0 ? src0->nb[0] : 0;
+            const uint64_t nb01 = src0 ? src0->nb[1] : 0;
+            const uint64_t nb02 = src0 ? src0->nb[2] : 0;
+            const uint64_t nb03 = src0 ? src0->nb[3] : 0;
 
-                const int64_t  ne10 = src1 ? src1->ne[0] : 0;
-                const int64_t  ne11 = src1 ? src1->ne[1] : 0;
-                const int64_t  ne12 = src1 ? src1->ne[2] : 0;
-                const int64_t  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+            const int64_t  ne10 = src1 ? src1->ne[0] : 0;
+            const int64_t  ne11 = src1 ? src1->ne[1] : 0;
+            const int64_t  ne12 = src1 ? src1->ne[2] : 0;
+            const int64_t  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
 
-                const uint64_t nb10 = src1 ? src1->nb[0] : 0;
-                const uint64_t nb11 = src1 ? src1->nb[1] : 0;
-                const uint64_t nb12 = src1 ? src1->nb[2] : 0;
-                const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+            const uint64_t nb10 = src1 ? src1->nb[0] : 0;
+            const uint64_t nb11 = src1 ? src1->nb[1] : 0;
+            const uint64_t nb12 = src1 ? src1->nb[2] : 0;
+            const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
 
-                const int64_t  ne0  = dst ? dst->ne[0] : 0;
-                const int64_t  ne1  = dst ? dst->ne[1] : 0;
-                const int64_t  ne2  = dst ? dst->ne[2] : 0;
-                const int64_t  ne3  = dst ? dst->ne[3] : 0;
+            const int64_t  ne0  = dst ? dst->ne[0] : 0;
+            const int64_t  ne1  = dst ? dst->ne[1] : 0;
+            const int64_t  ne2  = dst ? dst->ne[2] : 0;
+            const int64_t  ne3  = dst ? dst->ne[3] : 0;
 
-                const uint64_t nb0  = dst ? dst->nb[0] : 0;
-                const uint64_t nb1  = dst ? dst->nb[1] : 0;
-                const uint64_t nb2  = dst ? dst->nb[2] : 0;
-                const uint64_t nb3  = dst ? dst->nb[3] : 0;
+            const uint64_t nb0  = dst ? dst->nb[0] : 0;
+            const uint64_t nb1  = dst ? dst->nb[1] : 0;
+            const uint64_t nb2  = dst ? dst->nb[2] : 0;
+            const uint64_t nb3  = dst ? dst->nb[3] : 0;
 
-                const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
-                const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
-                const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
+            const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+            const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+            const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
 
-                id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
-                id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
-                id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;
+            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
+            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
+            id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;
 
-                //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
-                //if (src0) {
-                //    GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
-                //            ggml_is_contiguous(src0), src0->name);
-                //}
-                //if (src1) {
-                //    GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
-                //            ggml_is_contiguous(src1), src1->name);
-                //}
-                //if (dst) {
-                //    GGML_METAL_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
-                //            dst->name);
-                //}
+            //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
+            //if (src0) {
+            //    GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
+            //            ggml_is_contiguous(src0), src0->name);
+            //}
+            //if (src1) {
+            //    GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
+            //            ggml_is_contiguous(src1), src1->name);
+            //}
+            //if (dst) {
+            //    GGML_METAL_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
+            //            dst->name);
+            //}
 
-                switch (dst->op) {
-                    case GGML_OP_CONCAT:
-                        {
-                            const int64_t nb = ne00;
+            switch (dst->op) {
+                case GGML_OP_CONCAT:
+                    {
+                        const int64_t nb = ne00;
 
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:27];
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
+                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
+                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
+                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
+                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
+                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
+                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
+                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
+                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
+                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
+                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
+                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
+                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
+                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
+                        [encoder setBytes:&nb   length:sizeof(nb)   atIndex:27];
 
-                            const int nth = MIN(1024, ne0);
+                        const int nth = MIN(1024, ne0);
 
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_ADD:
-                    case GGML_OP_MUL:
-                    case GGML_OP_DIV:
-                        {
-                            const size_t offs = 0;
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_ADD:
+                case GGML_OP_MUL:
+                case GGML_OP_DIV:
+                    {
+                        const size_t offs = 0;
 
-                            bool bcast_row = false;
+                        bool bcast_row = false;
 
-                            int64_t nb = ne00;
+                        int64_t nb = ne00;
 
-                            id<MTLComputePipelineState> pipeline = nil;
+                        id<MTLComputePipelineState> pipeline = nil;
 
-                            if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
-                                GGML_ASSERT(ggml_is_contiguous(src0));
-
-                                // src1 is a row
-                                GGML_ASSERT(ne11 == 1);
-
-                                nb = ne00 / 4;
-                                switch (dst->op) {
-                                    case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
-                                    case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
-                                    case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
-                                    default: GGML_ASSERT(false);
-                                }
-
-                                bcast_row = true;
-                            } else {
-                                switch (dst->op) {
-                                    case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
-                                    case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
-                                    case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
-                                    default: GGML_ASSERT(false);
-                                }
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                            [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
-                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:28];
-
-                            if (bcast_row) {
-                                const int64_t n = ggml_nelements(dst)/4;
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } else {
-                                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                            }
-                        } break;
-                    case GGML_OP_ACC:
-                        {
-                            GGML_ASSERT(src0t == GGML_TYPE_F32);
-                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-                            GGML_ASSERT(dstt  == GGML_TYPE_F32);
-
-                            GGML_ASSERT(ggml_is_contiguous(src0));
-                            GGML_ASSERT(ggml_is_contiguous(src1));
-
-                            const size_t pnb1 = ((int32_t *) dst->op_params)[0];
-                            const size_t pnb2 = ((int32_t *) dst->op_params)[1];
-                            const size_t pnb3 = ((int32_t *) dst->op_params)[2];
-                            const size_t offs = ((int32_t *) dst->op_params)[3];
-
-                            const bool inplace = (bool) ((int32_t *) dst->op_params)[4];
-
-                            if (!inplace) {
-                                // run a separete kernel to cpy src->dst
-                                // not sure how to avoid this
-                                // TODO: make a simpler cpy_bytes kernel
-
-                                const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                                [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                                [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                                [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                                [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                                [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                                [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                                [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                                [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                                [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                                [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                                [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                                [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                                [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                                [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
-
-                                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                            }
-
-                            const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                            [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
-                            [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
-                            [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                            [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
-                            [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
-                            [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
-                            [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
-
-                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_SCALE:
-                        {
+                        if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
                             GGML_ASSERT(ggml_is_contiguous(src0));
 
-                            const float scale = *(const float *) dst->op_params;
+                            // src1 is a row
+                            GGML_ASSERT(ne11 == 1);
 
-                            int64_t n = ggml_nelements(dst);
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            if (n % 4 == 0) {
-                                n /= 4;
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline;
-                            } else {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline;
+                            nb = ne00 / 4;
+                            switch (dst->op) {
+                                case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
+                                case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
+                                case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
+                                default: GGML_ASSERT(false);
                             }
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
+                            bcast_row = true;
+                        } else {
+                            switch (dst->op) {
+                                case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
+                                case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
+                                case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
+                                default: GGML_ASSERT(false);
+                            }
+                        }
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
+                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
+                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
+                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
+                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
+                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
+                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
+                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
+                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
+                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
+                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
+                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
+                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
+                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
+                        [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
+                        [encoder setBytes:&nb   length:sizeof(nb)   atIndex:28];
+
+                        if (bcast_row) {
+                            const int64_t n = ggml_nelements(dst)/4;
 
                             [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_UNARY:
-                        switch (ggml_get_unary_op(gf->nodes[i])) {
-                            case GGML_UNARY_OP_TANH:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            case GGML_UNARY_OP_RELU:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            case GGML_UNARY_OP_GELU:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-                                    GGML_ASSERT(n % 4 == 0);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            case GGML_UNARY_OP_GELU_QUICK:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-                                    GGML_ASSERT(n % 4 == 0);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            case GGML_UNARY_OP_SILU:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-                                    GGML_ASSERT(n % 4 == 0);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            default:
-                                {
-                                    GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                                    GGML_ASSERT(false);
-                                }
-                        } break;
-                    case GGML_OP_SQR:
-                        {
-                            GGML_ASSERT(ggml_is_contiguous(src0));
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
-
-                            const int64_t n = ggml_nelements(dst);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_SUM_ROWS:
-                        {
-                            GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:18];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:19];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:20];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:21];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:22];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:23];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:24];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:25];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_SOFT_MAX:
-                        {
-                            int nth = 32; // SIMD width
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            if (ne00%4 == 0) {
-                                while (nth < ne00/4 && nth < 256) {
-                                    nth *= 2;
-                                }
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline;
-                            } else {
-                                while (nth < ne00 && nth < 1024) {
-                                    nth *= 2;
-                                }
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
-                            }
-
-                            const float scale = ((float *) dst->op_params)[0];
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
-                            if (id_src1) {
-                                [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
-                            } else {
-                                [encoder setBuffer:id_src0 offset:offs_src0   atIndex:1];
-                            }
-                            [encoder setBuffer:id_dst  offset:offs_dst    atIndex:2];
-                            [encoder setBytes:&ne00  length:sizeof(ne00)  atIndex:3];
-                            [encoder setBytes:&ne01  length:sizeof(ne01)  atIndex:4];
-                            [encoder setBytes:&ne02  length:sizeof(ne02)  atIndex:5];
-                            [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
-                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_DIAG_MASK_INF:
-                        {
-                            const int n_past = ((int32_t *)(dst->op_params))[0];
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            if (ne00%8 == 0) {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline;
-                            } else {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];
-
-                            if (ne00%8 == 0) {
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            }
-                            else {
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            }
-                        } break;
-                    case GGML_OP_MUL_MAT:
-                        {
-                            GGML_ASSERT(ne00 == ne10);
-
-                            // TODO: assert that dim2 and dim3 are contiguous
-                            GGML_ASSERT(ne12 % ne02 == 0);
-                            GGML_ASSERT(ne13 % ne03 == 0);
-
-                            const uint r2 = ne12/ne02;
-                            const uint r3 = ne13/ne03;
-
-                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
-                            // to the matrix-vector kernel
-                            int ne11_mm_min = 1;
-
-#if 0
-                            // the numbers below are measured on M2 Ultra for 7B and 13B models
-                            // these numbers do not translate to other devices or model sizes
-                            // TODO: need to find a better approach
-                            if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
-                                switch (src0t) {
-                                    case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-                                    case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-                                    case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-                                    case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-                                    case GGML_TYPE_Q4_0:
-                                    case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-                                    case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-                                    case GGML_TYPE_Q5_0:                          // not tested yet
-                                    case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-                                    case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-                                    case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-                                    default:             ne11_mm_min = 1;  break;
-                                }
-                            }
-#endif
-
-                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                                !ggml_is_transposed(src0) &&
-                                !ggml_is_transposed(src1) &&
-                                src1t == GGML_TYPE_F32 &&
-                                ne00 % 32 == 0 && ne00 >= 64 &&
-                                (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
-                                //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                switch (src0->type) {
-                                    case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
-                                    case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
-                                    case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
-                                    case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
-                                    default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
-                                }
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
-                                [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
-                                [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                                [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
-                                [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:4];
-                                [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:5];
-                                [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:6];
-                                [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:7];
-                                [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:8];
-                                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:9];
-                                [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:10];
-                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:11];
-                                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:12];
-                                [encoder setBytes:&r2      length:sizeof(r2)   atIndex:13];
-                                [encoder setBytes:&r3      length:sizeof(r3)   atIndex:14];
-                                [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-                            } else {
-                                int nth0 = 32;
-                                int nth1 = 1;
-                                int nrows = 1;
-                                //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                // use custom matrix x vector kernel
-                                switch (src0t) {
-                                    case GGML_TYPE_F32:
-                                        {
-                                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
-                                            nrows = 4;
-                                        } break;
-                                    case GGML_TYPE_F16:
-                                        {
-                                            nth0 = 32;
-                                            nth1 = 1;
-                                            if (src1t == GGML_TYPE_F32) {
-                                                if (ne11 * ne12 < 4) {
-                                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
-                                                } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
-                                                    nrows = ne11;
-                                                } else {
-                                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
-                                                    nrows = 4;
-                                                }
-                                            } else {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
-                                                nrows = 4;
-                                            }
-                                        } break;
-                                    case GGML_TYPE_Q4_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_1:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_1:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q8_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q2_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q3_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_K:
-                                        {
-                                            nth0 = 4; //1;
-                                            nth1 = 8; //32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q6_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_IQ2_XXS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_IQ2_XS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
-                                        } break;
-                                    default:
-                                        {
-                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
-                                            GGML_ASSERT(false && "not implemented");
-                                        }
-                                };
-
-                                if (ggml_is_quantized(src0t)) {
-                                    GGML_ASSERT(ne00 >= nth0*nth1);
-                                }
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
-                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
-                                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
-                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
-                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
-                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
-                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
-                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
-                                [encoder setBytes:&r2   length:sizeof(r2)   atIndex:17];
-                                [encoder setBytes:&r3   length:sizeof(r3)   atIndex:18];
-
-                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
-                                    src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
-                                    src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
-                                    const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                    [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
-                                }
-                                else if (src0t == GGML_TYPE_Q5_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_Q6_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                } else {
-                                    const int64_t ny = (ne11 + nrows - 1)/nrows;
-                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                            }
-                        } break;
-                    case GGML_OP_MUL_MAT_ID:
-                        {
-                            //GGML_ASSERT(ne00 == ne10);
-                            //GGML_ASSERT(ne03 == ne13);
-
-                            GGML_ASSERT(src0t == GGML_TYPE_I32);
-
-                            const int n_as = ((int32_t *) dst->op_params)[1];
-
-                            // TODO: make this more general
-                            GGML_ASSERT(n_as <= 8);
-
-                            // max size of the src1ids array in the kernel stack
-                            GGML_ASSERT(ne11 <= 512);
-
-                            struct ggml_tensor * src2 = gf->nodes[i]->src[2];
-
-                            const int64_t  ne20 = src2 ? src2->ne[0] : 0;
-                            const int64_t  ne21 = src2 ? src2->ne[1] : 0;
-                            const int64_t  ne22 = src2 ? src2->ne[2] : 0;
-                            const int64_t  ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
-
-                            const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
-                            const uint64_t nb21 = src2 ? src2->nb[1] : 0;
-                            const uint64_t nb22 = src2 ? src2->nb[2] : 0;
-                            const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23);
-
-                            const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
-
-                            GGML_ASSERT(!ggml_is_transposed(src2));
-                            GGML_ASSERT(!ggml_is_transposed(src1));
-
-                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-
-                            const uint r2 = ne12/ne22;
-                            const uint r3 = ne13/ne23;
-
-                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
-                            // to the matrix-vector kernel
-                            int ne11_mm_min = n_as;
-
-                            const int idx = ((int32_t *) dst->op_params)[0];
-
-                            // batch size
-                            GGML_ASSERT(ne01 == ne11);
-
-                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                            // !!!
-                            // TODO: for now, always use mat-vec kernels until we figure out how to improve the
-                            //       indirect matrix multiplication
-                            // !!!
-                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                                ne20 % 32 == 0 && ne20 >= 64 &&
-                                ne11 > ne11_mm_min) {
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                switch (src2->type) {
-                                    case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
-                                    case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
-                                    case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
-                                    case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
-                                    default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
-                                }
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
-                                [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
-                                [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                                [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:3];
-                                [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
-                                [encoder setBytes:&ne22    length:sizeof(ne22) atIndex:5];
-                                [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
-                                [encoder setBytes:&nb22    length:sizeof(nb22) atIndex:7];
-                                [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:8];
-                                [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:9];
-                                [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:10];
-                                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:11];
-                                [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:12];
-                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
-                                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
-                                [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:15];
-                                [encoder setBytes:&r2      length:sizeof(r2)   atIndex:16];
-                                [encoder setBytes:&r3      length:sizeof(r3)   atIndex:17];
-                                [encoder setBytes:&idx     length:sizeof(idx)  atIndex:18];
-                                // TODO: how to make this an array? read Metal docs
-                                for (int j = 0; j < 8; ++j) {
-                                    // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
-                                    struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
-
-                                    size_t offs_src_cur = 0;
-                                    id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
-
-                                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
-                                }
-
-                                [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-                            } else {
-                                int nth0 = 32;
-                                int nth1 = 1;
-                                int nrows = 1;
-                                //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                // use custom matrix x vector kernel
-                                switch (src2t) {
-                                    case GGML_TYPE_F32:
-                                        {
-                                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_F16:
-                                        {
-                                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                            nth0 = 32;
-                                            nth1 = 1;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_1:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_1:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q8_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q2_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q3_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_K:
-                                        {
-                                            nth0 = 4; //1;
-                                            nth1 = 8; //32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q6_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_IQ2_XXS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_IQ2_XS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
-                                        } break;
-                                    default:
-                                        {
-                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
-                                            GGML_ASSERT(false && "not implemented");
-                                        }
-                                };
-
-                                if (ggml_is_quantized(src2t)) {
-                                    GGML_ASSERT(ne20 >= nth0*nth1);
-                                }
-
-                                const int64_t _ne1 = 1; // kernels needs a reference in constant memory
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
-                                [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
-                                [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
-                                [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6];
-                                [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7];
-                                [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8];
-                                [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9];
-                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
-                                [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11];
-                                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
-                                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
-                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
-                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
-                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
-                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:17];
-                                [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18];
-                                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:19];
-                                [encoder setBytes:&r2   length:sizeof(r2)   atIndex:20];
-                                [encoder setBytes:&r3   length:sizeof(r3)   atIndex:21];
-                                [encoder setBytes:&idx  length:sizeof(idx)  atIndex:22];
-                                // TODO: how to make this an array? read Metal docs
-                                for (int j = 0; j < 8; ++j) {
-                                    // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
-                                    struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
-
-                                    size_t offs_src_cur = 0;
-                                    id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
-
-                                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
-                                }
-
-                                if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
-                                    src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
-                                    src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
-                                    const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                    [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src2t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src2t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
-                                }
-                                else if (src2t == GGML_TYPE_Q5_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src2t == GGML_TYPE_Q6_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                } else {
-                                    const int64_t ny = (_ne1 + nrows - 1)/nrows;
-                                    [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                            }
-                        } break;
-                    case GGML_OP_GET_ROWS:
-                        {
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (src0->type) {
-                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32    ].pipeline; break;
-                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16    ].pipeline; break;
-                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0   ].pipeline; break;
-                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1   ].pipeline; break;
-                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0   ].pipeline; break;
-                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1   ].pipeline; break;
-                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0   ].pipeline; break;
-                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K   ].pipeline; break;
-                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K   ].pipeline; break;
-                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K   ].pipeline; break;
-                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K   ].pipeline; break;
-                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K   ].pipeline; break;
-                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
-                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
-                                case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
-                                default: GGML_ASSERT(false && "not implemented");
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst      offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
-                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
-                            [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
-                            [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
-                            [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
-                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:10];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
-                        } break;
-                    case GGML_OP_RMS_NORM:
-                        {
-                            GGML_ASSERT(ne00 % 4 == 0);
-
-                            float eps;
-                            memcpy(&eps, dst->op_params, sizeof(float));
-
-                            int nth = 32; // SIMD width
-
-                            while (nth < ne00/4 && nth < 1024) {
-                                nth *= 2;
-                            }
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
-                            [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                            const int64_t nrows = ggml_nrows(src0);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_GROUP_NORM:
-                        {
-                            GGML_ASSERT(ne00 % 4 == 0);
-
-                            //float eps;
-                            //memcpy(&eps, dst->op_params, sizeof(float));
-
-                            const float eps = 1e-6f; // TODO: temporarily hardcoded
-
-                            const int32_t n_groups = ((int32_t *) dst->op_params)[0];
-
-                            int nth = 32; // SIMD width
-
-                            //while (nth < ne00/4 && nth < 1024) {
-                            //    nth *= 2;
-                            //}
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00     length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01     length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02     length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&nb00     length:sizeof(uint64_t) atIndex:5];
-                            [encoder setBytes:&nb01     length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb02     length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
-                            [encoder setBytes:&eps      length:sizeof(   float) atIndex:9];
-                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_NORM:
-                        {
-                            float eps;
-                            memcpy(&eps, dst->op_params, sizeof(float));
-
-                            const int nth = MIN(256, ne00);
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
-                            [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0];
-
-                            const int64_t nrows = ggml_nrows(src0);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_ALIBI:
-                        {
-                            GGML_ASSERT((src0t == GGML_TYPE_F32));
-
-                            const int nth = MIN(1024, ne00);
-
-                            //const int n_past = ((int32_t *) dst->op_params)[0];
-                            const int n_head = ((int32_t *) dst->op_params)[1];
-                            float max_bias;
-                            memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-                            const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-                            const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-                            const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&m0   length:sizeof(   float) atIndex:18];
-                            [encoder setBytes:&m1   length:sizeof(   float) atIndex:19];
-                            [encoder setBytes:&n_heads_log2_floor   length:sizeof(int) atIndex:20];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_ROPE:
-                        {
-                            GGML_ASSERT(ne10 == ne02);
-
-                            const int nth = MIN(1024, ne00);
-
-                            const int n_past     = ((int32_t *) dst->op_params)[0];
-                            const int n_dims     = ((int32_t *) dst->op_params)[1];
-                            const int mode       = ((int32_t *) dst->op_params)[2];
-                            // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
-                            const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
-
-                            float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-                            memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-                            memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-                            memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-                            memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-                            memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-                            memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (src0->type) {
-                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F32].pipeline; break;
-                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F16].pipeline; break;
-                                default: GGML_ASSERT(false);
-                            };
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
-                            [encoder setBuffer:id_dst      offset:offs_dst         atIndex:2];
-                            [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne01        length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne02        length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&ne03        length:sizeof( int64_t) atIndex:6];
-                            [encoder setBytes:&nb00        length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb01        length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb02        length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&nb03        length:sizeof(uint64_t) atIndex:10];
-                            [encoder setBytes:&ne0         length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne1         length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne2         length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&ne3         length:sizeof( int64_t) atIndex:14];
-                            [encoder setBytes:&nb0         length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb1         length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb2         length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&nb3         length:sizeof(uint64_t) atIndex:18];
-                            [encoder setBytes:&n_past      length:sizeof(     int) atIndex:19];
-                            [encoder setBytes:&n_dims      length:sizeof(     int) atIndex:20];
-                            [encoder setBytes:&mode        length:sizeof(     int) atIndex:21];
-                            [encoder setBytes:&n_orig_ctx  length:sizeof(     int) atIndex:22];
-                            [encoder setBytes:&freq_base   length:sizeof(   float) atIndex:23];
-                            [encoder setBytes:&freq_scale  length:sizeof(   float) atIndex:24];
-                            [encoder setBytes:&ext_factor  length:sizeof(   float) atIndex:25];
-                            [encoder setBytes:&attn_factor length:sizeof(   float) atIndex:26];
-                            [encoder setBytes:&beta_fast   length:sizeof(   float) atIndex:27];
-                            [encoder setBytes:&beta_slow   length:sizeof(   float) atIndex:28];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_IM2COL:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F16);
-                            GGML_ASSERT(src1->type == GGML_TYPE_F32);
-                            GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
-                            const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-                            const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
-                            const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-                            const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
-                            const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
-                            const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
-                            const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
-
-                            const int32_t N  = src1->ne[is_2D ? 3 : 2];
-                            const int32_t IC = src1->ne[is_2D ? 2 : 1];
-                            const int32_t IH = is_2D ? src1->ne[1] : 1;
-                            const int32_t IW =         src1->ne[0];
-
-                            const int32_t KH = is_2D ? src0->ne[1] : 1;
-                            const int32_t KW =         src0->ne[0];
-
-                            const int32_t OH = is_2D ? dst->ne[2] : 1;
-                            const int32_t OW =         dst->ne[1];
-
-                            const int32_t CHW = IC * KH * KW;
-
-                            const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
-                            const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (src0->type) {
-                                case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break;
-                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
-                                default: GGML_ASSERT(false);
-                            };
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src1 offset:offs_src1        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ofs0    length:sizeof( int32_t) atIndex:2];
-                            [encoder setBytes:&ofs1    length:sizeof( int32_t) atIndex:3];
-                            [encoder setBytes:&IW      length:sizeof( int32_t) atIndex:4];
-                            [encoder setBytes:&IH      length:sizeof( int32_t) atIndex:5];
-                            [encoder setBytes:&CHW     length:sizeof( int32_t) atIndex:6];
-                            [encoder setBytes:&s0      length:sizeof( int32_t) atIndex:7];
-                            [encoder setBytes:&s1      length:sizeof( int32_t) atIndex:8];
-                            [encoder setBytes:&p0      length:sizeof( int32_t) atIndex:9];
-                            [encoder setBytes:&p1      length:sizeof( int32_t) atIndex:10];
-                            [encoder setBytes:&d0      length:sizeof( int32_t) atIndex:11];
-                            [encoder setBytes:&d1      length:sizeof( int32_t) atIndex:12];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
-                        } break;
-                    case GGML_OP_UPSCALE:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                            const int sf = dst->op_params[0];
-
-                            const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
-                            [encoder setBytes:&sf   length:sizeof(sf)   atIndex:18];
-
+                        } else {
                             const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
 
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_PAD:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        }
+                    } break;
+                case GGML_OP_ACC:
+                    {
+                        GGML_ASSERT(src0t == GGML_TYPE_F32);
+                        GGML_ASSERT(src1t == GGML_TYPE_F32);
+                        GGML_ASSERT(dstt  == GGML_TYPE_F32);
 
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
+                        GGML_ASSERT(ggml_is_contiguous(src0));
+                        GGML_ASSERT(ggml_is_contiguous(src1));
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+                        const size_t pnb1 = ((int32_t *) dst->op_params)[0];
+                        const size_t pnb2 = ((int32_t *) dst->op_params)[1];
+                        const size_t pnb3 = ((int32_t *) dst->op_params)[2];
+                        const size_t offs = ((int32_t *) dst->op_params)[3];
 
-                            const int nth = MIN(1024, ne0);
+                        const bool inplace = (bool) ((int32_t *) dst->op_params)[4];
 
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_ARGSORT:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
-                            GGML_ASSERT( dst->type == GGML_TYPE_I32);
+                        if (!inplace) {
+                            // run a separete kernel to cpy src->dst
+                            // not sure how to avoid this
+                            // TODO: make a simpler cpy_bytes kernel
 
-                            const int nrows = ggml_nrows(src0);
-
-                            enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (order) {
-                                case GGML_SORT_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
-                                case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
-                                default: GGML_ASSERT(false);
-                            };
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)];
-                        } break;
-                    case GGML_OP_LEAKY_RELU:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                            float slope;
-                            memcpy(&slope, dst->op_params, sizeof(float));
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
-                            [encoder setBytes:&slope length:sizeof(slope) atIndex:2];
-
-                            const int64_t n = ggml_nelements(dst);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_DUP:
-                    case GGML_OP_CPY:
-                    case GGML_OP_CONT:
-                        {
-                            GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
-
-                            int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (src0t) {
-                                case GGML_TYPE_F32:
-                                    {
-                                        GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0);
-
-                                        switch (dstt) {
-                                            case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline;  break;
-                                            case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;  break;
-                                            case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break;
-                                            case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break;
-                                            case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break;
-                                          //case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
-                                          //case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
-                                            default: GGML_ASSERT(false && "not implemented");
-                                        };
-                                    } break;
-                                case GGML_TYPE_F16:
-                                    {
-                                        switch (dstt) {
-                                            case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
-                                            case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
-                                            default: GGML_ASSERT(false && "not implemented");
-                                        };
-                                    } break;
-                                default: GGML_ASSERT(false && "not implemented");
-                            }
+                            const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
 
                             [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
@@ -2227,31 +1001,1253 @@ static bool ggml_metal_graph_compute(
                             [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
                             [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
 
+                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
+
                             [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    default:
-                        {
-                            GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                            GGML_ASSERT(false);
                         }
-                }
+
+                        const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
+                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                        [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
+                        [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
+                        [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
+                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
+                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
+                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
+                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
+                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
+                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
+                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
+                        [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
+                        [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
+                        [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
+                        [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
+
+                        const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_SCALE:
+                    {
+                        GGML_ASSERT(ggml_is_contiguous(src0));
+
+                        const float scale = *(const float *) dst->op_params;
+
+                        int64_t n = ggml_nelements(dst);
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        if (n % 4 == 0) {
+                            n /= 4;
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline;
+                        } else {
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline;
+                        }
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                case GGML_OP_UNARY:
+                    switch (ggml_get_unary_op(gf->nodes[i])) {
+                        case GGML_UNARY_OP_TANH:
+                            {
+                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
+
+                                [encoder setComputePipelineState:pipeline];
+                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                                const int64_t n = ggml_nelements(dst);
+
+                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            } break;
+                        case GGML_UNARY_OP_RELU:
+                            {
+                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline;
+
+                                [encoder setComputePipelineState:pipeline];
+                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                                const int64_t n = ggml_nelements(dst);
+
+                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            } break;
+                        case GGML_UNARY_OP_GELU:
+                            {
+                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
+
+                                [encoder setComputePipelineState:pipeline];
+                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                                const int64_t n = ggml_nelements(dst);
+                                GGML_ASSERT(n % 4 == 0);
+
+                                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            } break;
+                        case GGML_UNARY_OP_GELU_QUICK:
+                            {
+                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
+
+                                [encoder setComputePipelineState:pipeline];
+                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                                const int64_t n = ggml_nelements(dst);
+                                GGML_ASSERT(n % 4 == 0);
+
+                                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            } break;
+                        case GGML_UNARY_OP_SILU:
+                            {
+                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
+
+                                [encoder setComputePipelineState:pipeline];
+                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                                const int64_t n = ggml_nelements(dst);
+                                GGML_ASSERT(n % 4 == 0);
+
+                                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            } break;
+                        default:
+                            {
+                                GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                                GGML_ASSERT(false);
+                            }
+                    } break;
+                case GGML_OP_SQR:
+                    {
+                        GGML_ASSERT(ggml_is_contiguous(src0));
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
+
+                        const int64_t n = ggml_nelements(dst);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                case GGML_OP_SUM_ROWS:
+                    {
+                        GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
+                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
+                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
+                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
+                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
+                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
+                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
+                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
+                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:18];
+                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:19];
+                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:20];
+                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:21];
+                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:22];
+                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:23];
+                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:24];
+                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:25];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                case GGML_OP_SOFT_MAX:
+                    {
+                        int nth = 32; // SIMD width
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        if (ne00%4 == 0) {
+                            while (nth < ne00/4 && nth < 256) {
+                                nth *= 2;
+                            }
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline;
+                        } else {
+                            while (nth < ne00 && nth < 1024) {
+                                nth *= 2;
+                            }
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
+                        }
+
+                        const float scale = ((float *) dst->op_params)[0];
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
+                        if (id_src1) {
+                            [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
+                        } else {
+                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:1];
+                        }
+                        [encoder setBuffer:id_dst  offset:offs_dst    atIndex:2];
+                        [encoder setBytes:&ne00  length:sizeof(ne00)  atIndex:3];
+                        [encoder setBytes:&ne01  length:sizeof(ne01)  atIndex:4];
+                        [encoder setBytes:&ne02  length:sizeof(ne02)  atIndex:5];
+                        [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_DIAG_MASK_INF:
+                    {
+                        const int n_past = ((int32_t *)(dst->op_params))[0];
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        if (ne00%8 == 0) {
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline;
+                        } else {
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
+                        }
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
+                        [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
+                        [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];
+
+                        if (ne00%8 == 0) {
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        }
+                        else {
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        }
+                    } break;
+                case GGML_OP_MUL_MAT:
+                    {
+                        GGML_ASSERT(ne00 == ne10);
+
+                        // TODO: assert that dim2 and dim3 are contiguous
+                        GGML_ASSERT(ne12 % ne02 == 0);
+                        GGML_ASSERT(ne13 % ne03 == 0);
+
+                        const uint r2 = ne12/ne02;
+                        const uint r3 = ne13/ne03;
+
+                        // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+                        // to the matrix-vector kernel
+                        int ne11_mm_min = 1;
+
+#if 0
+                        // the numbers below are measured on M2 Ultra for 7B and 13B models
+                        // these numbers do not translate to other devices or model sizes
+                        // TODO: need to find a better approach
+                        if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
+                            switch (src0t) {
+                                case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
+                                case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
+                                case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+                                case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
+                                case GGML_TYPE_Q4_0:
+                                case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+                                case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+                                case GGML_TYPE_Q5_0:                          // not tested yet
+                                case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+                                case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
+                                case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
+                                default:             ne11_mm_min = 1;  break;
+                            }
+                        }
+#endif
+
+                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
+                        if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                            !ggml_is_transposed(src0) &&
+                            !ggml_is_transposed(src1) &&
+                            src1t == GGML_TYPE_F32 &&
+                            ne00 % 32 == 0 && ne00 >= 64 &&
+                            (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
+                            //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+
+                            id<MTLComputePipelineState> pipeline = nil;
+
+                            switch (src0->type) {
+                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
+                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
+                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
+                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
+                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
+                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
+                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
+                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
+                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
+                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
+                                default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
+                            }
+
+                            [encoder setComputePipelineState:pipeline];
+                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
+                            [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
+                            [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:4];
+                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:5];
+                            [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:6];
+                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:7];
+                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:8];
+                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:9];
+                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:10];
+                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:11];
+                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:12];
+                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:13];
+                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:14];
+                            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+                            [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                        } else {
+                            int nth0 = 32;
+                            int nth1 = 1;
+                            int nrows = 1;
+                            //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+
+                            id<MTLComputePipelineState> pipeline = nil;
+
+                            // use custom matrix x vector kernel
+                            switch (src0t) {
+                                case GGML_TYPE_F32:
+                                    {
+                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
+                                        nrows = 4;
+                                    } break;
+                                case GGML_TYPE_F16:
+                                    {
+                                        nth0 = 32;
+                                        nth1 = 1;
+                                        if (src1t == GGML_TYPE_F32) {
+                                            if (ne11 * ne12 < 4) {
+                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
+                                            } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
+                                                nrows = ne11;
+                                            } else {
+                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
+                                                nrows = 4;
+                                            }
+                                        } else {
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
+                                            nrows = 4;
+                                        }
+                                    } break;
+                                case GGML_TYPE_Q4_0:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q4_1:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q5_0:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q5_1:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q8_0:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q2_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q3_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q4_K:
+                                    {
+                                        nth0 = 4; //1;
+                                        nth1 = 8; //32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q5_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q6_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_IQ2_XXS:
+                                    {
+                                        nth0 = 4;
+                                        nth1 = 16;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_IQ2_XS:
+                                    {
+                                        nth0 = 4;
+                                        nth1 = 16;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
+                                    } break;
+                                default:
+                                    {
+                                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
+                                        GGML_ASSERT(false && "not implemented");
+                                    }
+                            };
+
+                            if (ggml_is_quantized(src0t)) {
+                                GGML_ASSERT(ne00 >= nth0*nth1);
+                            }
+
+                            [encoder setComputePipelineState:pipeline];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
+                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
+                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:17];
+                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:18];
+
+                            if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                                src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
+                                src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
+                                const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
+                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src0t == GGML_TYPE_Q4_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src0t == GGML_TYPE_Q3_K) {
+#ifdef GGML_QKK_64
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#else
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#endif
+                            }
+                            else if (src0t == GGML_TYPE_Q5_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src0t == GGML_TYPE_Q6_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            } else {
+                                const int64_t ny = (ne11 + nrows - 1)/nrows;
+                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                        }
+                    } break;
+                case GGML_OP_MUL_MAT_ID:
+                    {
+                        //GGML_ASSERT(ne00 == ne10);
+                        //GGML_ASSERT(ne03 == ne13);
+
+                        GGML_ASSERT(src0t == GGML_TYPE_I32);
+
+                        const int n_as = ((int32_t *) dst->op_params)[1];
+
+                        // TODO: make this more general
+                        GGML_ASSERT(n_as <= 8);
+
+                        // max size of the src1ids array in the kernel stack
+                        GGML_ASSERT(ne11 <= 512);
+
+                        struct ggml_tensor * src2 = gf->nodes[i]->src[2];
+
+                        const int64_t  ne20 = src2 ? src2->ne[0] : 0;
+                        const int64_t  ne21 = src2 ? src2->ne[1] : 0;
+                        const int64_t  ne22 = src2 ? src2->ne[2] : 0;
+                        const int64_t  ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
+
+                        const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
+                        const uint64_t nb21 = src2 ? src2->nb[1] : 0;
+                        const uint64_t nb22 = src2 ? src2->nb[2] : 0;
+                        const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23);
+
+                        const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
+
+                        GGML_ASSERT(!ggml_is_transposed(src2));
+                        GGML_ASSERT(!ggml_is_transposed(src1));
+
+                        GGML_ASSERT(src1t == GGML_TYPE_F32);
+
+                        const uint r2 = ne12/ne22;
+                        const uint r3 = ne13/ne23;
+
+                        // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+                        // to the matrix-vector kernel
+                        int ne11_mm_min = n_as;
+
+                        const int idx = ((int32_t *) dst->op_params)[0];
+
+                        // batch size
+                        GGML_ASSERT(ne01 == ne11);
+
+                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
+                        // !!!
+                        // TODO: for now, always use mat-vec kernels until we figure out how to improve the
+                        //       indirect matrix multiplication
+                        // !!!
+                        if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                            ne20 % 32 == 0 && ne20 >= 64 &&
+                            ne11 > ne11_mm_min) {
+
+                            id<MTLComputePipelineState> pipeline = nil;
+
+                            switch (src2->type) {
+                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
+                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
+                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
+                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32   ].pipeline; break;
+                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32   ].pipeline; break;
+                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32   ].pipeline; break;
+                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32   ].pipeline; break;
+                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32   ].pipeline; break;
+                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
+                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
+                                default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
+                            }
+
+                            [encoder setComputePipelineState:pipeline];
+                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
+                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:3];
+                            [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
+                            [encoder setBytes:&ne22    length:sizeof(ne22) atIndex:5];
+                            [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
+                            [encoder setBytes:&nb22    length:sizeof(nb22) atIndex:7];
+                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:8];
+                            [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:9];
+                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:10];
+                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:11];
+                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:12];
+                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
+                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
+                            [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:15];
+                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:16];
+                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:17];
+                            [encoder setBytes:&idx     length:sizeof(idx)  atIndex:18];
+                            // TODO: how to make this an array? read Metal docs
+                            for (int j = 0; j < 8; ++j) {
+                                // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+                                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
+
+                                size_t offs_src_cur = 0;
+                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
+
+                                [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
+                            }
+
+                            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+
+                            [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                        } else {
+                            int nth0 = 32;
+                            int nth1 = 1;
+                            int nrows = 1;
+                            //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+
+                            id<MTLComputePipelineState> pipeline = nil;
+
+                            // use custom matrix x vector kernel
+                            switch (src2t) {
+                                case GGML_TYPE_F32:
+                                    {
+                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_F16:
+                                    {
+                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                        nth0 = 32;
+                                        nth1 = 1;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q4_0:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q4_1:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q5_0:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q5_1:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q8_0:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q2_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q3_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q4_K:
+                                    {
+                                        nth0 = 4; //1;
+                                        nth1 = 8; //32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q5_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q6_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_IQ2_XXS:
+                                    {
+                                        nth0 = 4;
+                                        nth1 = 16;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_IQ2_XS:
+                                    {
+                                        nth0 = 4;
+                                        nth1 = 16;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
+                                    } break;
+                                default:
+                                    {
+                                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
+                                        GGML_ASSERT(false && "not implemented");
+                                    }
+                            };
+
+                            if (ggml_is_quantized(src2t)) {
+                                GGML_ASSERT(ne20 >= nth0*nth1);
+                            }
+
+                            const int64_t _ne1 = 1; // kernels needs a reference in constant memory
+
+                            [encoder setComputePipelineState:pipeline];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
+                            [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
+                            [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
+                            [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6];
+                            [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7];
+                            [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8];
+                            [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9];
+                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
+                            [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11];
+                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
+                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
+                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
+                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
+                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
+                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:17];
+                            [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18];
+                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:19];
+                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:20];
+                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:21];
+                            [encoder setBytes:&idx  length:sizeof(idx)  atIndex:22];
+                            // TODO: how to make this an array? read Metal docs
+                            for (int j = 0; j < 8; ++j) {
+                                // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+                                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
+
+                                size_t offs_src_cur = 0;
+                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
+
+                                [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
+                            }
+
+                            if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
+                                src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
+                                src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
+                                const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
+                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src2t == GGML_TYPE_Q4_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src2t == GGML_TYPE_Q3_K) {
+#ifdef GGML_QKK_64
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#else
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#endif
+                            }
+                            else if (src2t == GGML_TYPE_Q5_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src2t == GGML_TYPE_Q6_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            } else {
+                                const int64_t ny = (_ne1 + nrows - 1)/nrows;
+                                [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                        }
+                    } break;
+                case GGML_OP_GET_ROWS:
+                    {
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        switch (src0->type) {
+                            case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32    ].pipeline; break;
+                            case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16    ].pipeline; break;
+                            case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0   ].pipeline; break;
+                            case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1   ].pipeline; break;
+                            case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0   ].pipeline; break;
+                            case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1   ].pipeline; break;
+                            case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0   ].pipeline; break;
+                            case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K   ].pipeline; break;
+                            case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K   ].pipeline; break;
+                            case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K   ].pipeline; break;
+                            case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K   ].pipeline; break;
+                            case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K   ].pipeline; break;
+                            case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
+                            case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
+                            case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
+                            default: GGML_ASSERT(false && "not implemented");
+                        }
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
+                        [encoder setBuffer:id_dst      offset:offs_dst  atIndex:2];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
+                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
+                        [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
+                        [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
+                        [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
+                        [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:10];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
+                    } break;
+                case GGML_OP_RMS_NORM:
+                    {
+                        GGML_ASSERT(ne00 % 4 == 0);
+
+                        float eps;
+                        memcpy(&eps, dst->op_params, sizeof(float));
+
+                        int nth = 32; // SIMD width
+
+                        while (nth < ne00/4 && nth < 1024) {
+                            nth *= 2;
+                        }
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
+                        [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
+                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+                        const int64_t nrows = ggml_nrows(src0);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_GROUP_NORM:
+                    {
+                        GGML_ASSERT(ne00 % 4 == 0);
+
+                        //float eps;
+                        //memcpy(&eps, dst->op_params, sizeof(float));
+
+                        const float eps = 1e-6f; // TODO: temporarily hardcoded
+
+                        const int32_t n_groups = ((int32_t *) dst->op_params)[0];
+
+                        int nth = 32; // SIMD width
+
+                        //while (nth < ne00/4 && nth < 1024) {
+                        //    nth *= 2;
+                        //}
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ne00     length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&ne01     length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&ne02     length:sizeof( int64_t) atIndex:4];
+                        [encoder setBytes:&nb00     length:sizeof(uint64_t) atIndex:5];
+                        [encoder setBytes:&nb01     length:sizeof(uint64_t) atIndex:6];
+                        [encoder setBytes:&nb02     length:sizeof(uint64_t) atIndex:7];
+                        [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
+                        [encoder setBytes:&eps      length:sizeof(   float) atIndex:9];
+                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_NORM:
+                    {
+                        float eps;
+                        memcpy(&eps, dst->op_params, sizeof(float));
+
+                        const int nth = MIN(256, ne00);
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
+                        [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
+                        [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0];
+
+                        const int64_t nrows = ggml_nrows(src0);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_ALIBI:
+                    {
+                        GGML_ASSERT((src0t == GGML_TYPE_F32));
+
+                        const int nth = MIN(1024, ne00);
+
+                        //const int n_past = ((int32_t *) dst->op_params)[0];
+                        const int n_head = ((int32_t *) dst->op_params)[1];
+                        float max_bias;
+                        memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+                        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+                        const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+                        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
+                        [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
+                        [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
+                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
+                        [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
+                        [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
+                        [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
+                        [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
+                        [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
+                        [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
+                        [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
+                        [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
+                        [encoder setBytes:&m0   length:sizeof(   float) atIndex:18];
+                        [encoder setBytes:&m1   length:sizeof(   float) atIndex:19];
+                        [encoder setBytes:&n_heads_log2_floor   length:sizeof(int) atIndex:20];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_ROPE:
+                    {
+                        GGML_ASSERT(ne10 == ne02);
+
+                        const int nth = MIN(1024, ne00);
+
+                        const int n_past     = ((int32_t *) dst->op_params)[0];
+                        const int n_dims     = ((int32_t *) dst->op_params)[1];
+                        const int mode       = ((int32_t *) dst->op_params)[2];
+                        // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+                        const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+
+                        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+                        memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+                        memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+                        memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+                        memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+                        memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+                        memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        switch (src0->type) {
+                            case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F32].pipeline; break;
+                            case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F16].pipeline; break;
+                            default: GGML_ASSERT(false);
+                        };
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
+                        [encoder setBuffer:id_dst      offset:offs_dst         atIndex:2];
+                        [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&ne01        length:sizeof( int64_t) atIndex:4];
+                        [encoder setBytes:&ne02        length:sizeof( int64_t) atIndex:5];
+                        [encoder setBytes:&ne03        length:sizeof( int64_t) atIndex:6];
+                        [encoder setBytes:&nb00        length:sizeof(uint64_t) atIndex:7];
+                        [encoder setBytes:&nb01        length:sizeof(uint64_t) atIndex:8];
+                        [encoder setBytes:&nb02        length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&nb03        length:sizeof(uint64_t) atIndex:10];
+                        [encoder setBytes:&ne0         length:sizeof( int64_t) atIndex:11];
+                        [encoder setBytes:&ne1         length:sizeof( int64_t) atIndex:12];
+                        [encoder setBytes:&ne2         length:sizeof( int64_t) atIndex:13];
+                        [encoder setBytes:&ne3         length:sizeof( int64_t) atIndex:14];
+                        [encoder setBytes:&nb0         length:sizeof(uint64_t) atIndex:15];
+                        [encoder setBytes:&nb1         length:sizeof(uint64_t) atIndex:16];
+                        [encoder setBytes:&nb2         length:sizeof(uint64_t) atIndex:17];
+                        [encoder setBytes:&nb3         length:sizeof(uint64_t) atIndex:18];
+                        [encoder setBytes:&n_past      length:sizeof(     int) atIndex:19];
+                        [encoder setBytes:&n_dims      length:sizeof(     int) atIndex:20];
+                        [encoder setBytes:&mode        length:sizeof(     int) atIndex:21];
+                        [encoder setBytes:&n_orig_ctx  length:sizeof(     int) atIndex:22];
+                        [encoder setBytes:&freq_base   length:sizeof(   float) atIndex:23];
+                        [encoder setBytes:&freq_scale  length:sizeof(   float) atIndex:24];
+                        [encoder setBytes:&ext_factor  length:sizeof(   float) atIndex:25];
+                        [encoder setBytes:&attn_factor length:sizeof(   float) atIndex:26];
+                        [encoder setBytes:&beta_fast   length:sizeof(   float) atIndex:27];
+                        [encoder setBytes:&beta_slow   length:sizeof(   float) atIndex:28];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_IM2COL:
+                    {
+                        GGML_ASSERT(src0->type == GGML_TYPE_F16);
+                        GGML_ASSERT(src1->type == GGML_TYPE_F32);
+                        GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+                        const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+                        const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+                        const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+                        const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+                        const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+                        const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+                        const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+                        const int32_t N  = src1->ne[is_2D ? 3 : 2];
+                        const int32_t IC = src1->ne[is_2D ? 2 : 1];
+                        const int32_t IH = is_2D ? src1->ne[1] : 1;
+                        const int32_t IW =         src1->ne[0];
+
+                        const int32_t KH = is_2D ? src0->ne[1] : 1;
+                        const int32_t KW =         src0->ne[0];
+
+                        const int32_t OH = is_2D ? dst->ne[2] : 1;
+                        const int32_t OW =         dst->ne[1];
+
+                        const int32_t CHW = IC * KH * KW;
+
+                        const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
+                        const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        switch (src0->type) {
+                            case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break;
+                            case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
+                            default: GGML_ASSERT(false);
+                        };
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src1 offset:offs_src1        atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ofs0    length:sizeof( int32_t) atIndex:2];
+                        [encoder setBytes:&ofs1    length:sizeof( int32_t) atIndex:3];
+                        [encoder setBytes:&IW      length:sizeof( int32_t) atIndex:4];
+                        [encoder setBytes:&IH      length:sizeof( int32_t) atIndex:5];
+                        [encoder setBytes:&CHW     length:sizeof( int32_t) atIndex:6];
+                        [encoder setBytes:&s0      length:sizeof( int32_t) atIndex:7];
+                        [encoder setBytes:&s1      length:sizeof( int32_t) atIndex:8];
+                        [encoder setBytes:&p0      length:sizeof( int32_t) atIndex:9];
+                        [encoder setBytes:&p1      length:sizeof( int32_t) atIndex:10];
+                        [encoder setBytes:&d0      length:sizeof( int32_t) atIndex:11];
+                        [encoder setBytes:&d1      length:sizeof( int32_t) atIndex:12];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
+                    } break;
+                case GGML_OP_UPSCALE:
+                    {
+                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                        const int sf = dst->op_params[0];
+
+                        const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+                        [encoder setBytes:&sf   length:sizeof(sf)   atIndex:18];
+
+                        const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_PAD:
+                    {
+                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+
+                        const int nth = MIN(1024, ne0);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_ARGSORT:
+                    {
+                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
+                        GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+                        const int nrows = ggml_nrows(src0);
+
+                        enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        switch (order) {
+                            case GGML_SORT_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
+                            case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
+                            default: GGML_ASSERT(false);
+                        };
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)];
+                    } break;
+                case GGML_OP_LEAKY_RELU:
+                    {
+                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                        float slope;
+                        memcpy(&slope, dst->op_params, sizeof(float));
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
+                        [encoder setBytes:&slope length:sizeof(slope) atIndex:2];
+
+                        const int64_t n = ggml_nelements(dst);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                case GGML_OP_DUP:
+                case GGML_OP_CPY:
+                case GGML_OP_CONT:
+                    {
+                        GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+                        int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        switch (src0t) {
+                            case GGML_TYPE_F32:
+                                {
+                                    GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0);
+
+                                    switch (dstt) {
+                                        case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline;  break;
+                                        case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;  break;
+                                        case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break;
+                                        case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break;
+                                        case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break;
+                                      //case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
+                                      //case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
+                                        default: GGML_ASSERT(false && "not implemented");
+                                    };
+                                } break;
+                            case GGML_TYPE_F16:
+                                {
+                                    switch (dstt) {
+                                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
+                                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
+                                        default: GGML_ASSERT(false && "not implemented");
+                                    };
+                                } break;
+                            default: GGML_ASSERT(false && "not implemented");
+                        }
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
+                        [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
+                        [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
+                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
+                        [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
+                        [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
+                        [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
+                        [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
+                        [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
+                        [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
+                        [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
+                        [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
+                        [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                default:
+                    {
+                        GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                        GGML_ASSERT(false);
+                    }
+            }
 
 #ifndef GGML_METAL_NDEBUG
-                [encoder popDebugGroup];
+            [encoder popDebugGroup];
 #endif
-            }
+        }
 
-            if (encoder != nil) {
-                [encoder endEncoding];
-                encoder = nil;
-            }
+        if (encoder != nil) {
+            [encoder endEncoding];
+            encoder = nil;
+        }
 
-            [command_buffer commit];
-        });
-    }
-
-    // wait for all threads to finish
-    dispatch_barrier_sync(ctx->d_queue, ^{});
+        [command_buffer commit];
+    });
 
     // check status of command buffers
     // needed to detect if the device ran out-of-memory for example (#1881)

From 862f5e41ab1fdf12d6f59455aad3f5dd8258f805 Mon Sep 17 00:00:00 2001
From: Neuman Vong <neuman.vong@gmail.com>
Date: Wed, 17 Jan 2024 00:47:34 +1100
Subject: [PATCH 043/138] android : introduce starter project example (#4926)

* Introduce starter project for Android

Based on examples/llama.swiftui.

* Add github workflow

* Set NDK version

* Only build arm64-v8a in CI

* Sync bench code

* Rename CI prop to skip-armeabi-v7a

* Remove unused tests
---
 .github/workflows/build.yml                   |  25 ++
 examples/llama.android/.gitignore             |  33 ++
 examples/llama.android/README.md              |   0
 examples/llama.android/app/.gitignore         |   1 +
 examples/llama.android/app/build.gradle.kts   |  91 ++++
 examples/llama.android/app/proguard-rules.pro |  21 +
 .../app/src/main/AndroidManifest.xml          |  30 ++
 .../app/src/main/cpp/CMakeLists.txt           |  50 +++
 .../app/src/main/cpp/llama-android.cpp        | 394 ++++++++++++++++++
 .../java/com/example/llama/Downloadable.kt    | 119 ++++++
 .../src/main/java/com/example/llama/Llm.kt    | 172 ++++++++
 .../java/com/example/llama/MainActivity.kt    | 154 +++++++
 .../java/com/example/llama/MainViewModel.kt   | 104 +++++
 .../java/com/example/llama/ui/theme/Color.kt  |  11 +
 .../java/com/example/llama/ui/theme/Theme.kt  |  70 ++++
 .../java/com/example/llama/ui/theme/Type.kt   |  34 ++
 .../res/drawable/ic_launcher_background.xml   | 170 ++++++++
 .../res/drawable/ic_launcher_foreground.xml   |  30 ++
 .../main/res/mipmap-anydpi/ic_launcher.xml    |   6 +
 .../res/mipmap-anydpi/ic_launcher_round.xml   |   6 +
 .../src/main/res/mipmap-hdpi/ic_launcher.webp | Bin 0 -> 1404 bytes
 .../res/mipmap-hdpi/ic_launcher_round.webp    | Bin 0 -> 2898 bytes
 .../src/main/res/mipmap-mdpi/ic_launcher.webp | Bin 0 -> 982 bytes
 .../res/mipmap-mdpi/ic_launcher_round.webp    | Bin 0 -> 1772 bytes
 .../main/res/mipmap-xhdpi/ic_launcher.webp    | Bin 0 -> 1900 bytes
 .../res/mipmap-xhdpi/ic_launcher_round.webp   | Bin 0 -> 3918 bytes
 .../main/res/mipmap-xxhdpi/ic_launcher.webp   | Bin 0 -> 2884 bytes
 .../res/mipmap-xxhdpi/ic_launcher_round.webp  | Bin 0 -> 5914 bytes
 .../main/res/mipmap-xxxhdpi/ic_launcher.webp  | Bin 0 -> 3844 bytes
 .../res/mipmap-xxxhdpi/ic_launcher_round.webp | Bin 0 -> 7778 bytes
 .../app/src/main/res/values/colors.xml        |  10 +
 .../app/src/main/res/values/strings.xml       |   3 +
 .../app/src/main/res/values/themes.xml        |   5 +
 .../app/src/main/res/xml/backup_rules.xml     |  13 +
 .../main/res/xml/data_extraction_rules.xml    |  19 +
 examples/llama.android/build.gradle.kts       |   5 +
 examples/llama.android/gradle.properties      |  23 +
 .../gradle/wrapper/gradle-wrapper.jar         | Bin 0 -> 59203 bytes
 .../gradle/wrapper/gradle-wrapper.properties  |   6 +
 examples/llama.android/gradlew                | 185 ++++++++
 examples/llama.android/settings.gradle.kts    |  17 +
 41 files changed, 1807 insertions(+)
 create mode 100644 examples/llama.android/.gitignore
 create mode 100644 examples/llama.android/README.md
 create mode 100644 examples/llama.android/app/.gitignore
 create mode 100644 examples/llama.android/app/build.gradle.kts
 create mode 100644 examples/llama.android/app/proguard-rules.pro
 create mode 100644 examples/llama.android/app/src/main/AndroidManifest.xml
 create mode 100644 examples/llama.android/app/src/main/cpp/CMakeLists.txt
 create mode 100644 examples/llama.android/app/src/main/cpp/llama-android.cpp
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt
 create mode 100644 examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml
 create mode 100644 examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp
 create mode 100644 examples/llama.android/app/src/main/res/values/colors.xml
 create mode 100644 examples/llama.android/app/src/main/res/values/strings.xml
 create mode 100644 examples/llama.android/app/src/main/res/values/themes.xml
 create mode 100644 examples/llama.android/app/src/main/res/xml/backup_rules.xml
 create mode 100644 examples/llama.android/app/src/main/res/xml/data_extraction_rules.xml
 create mode 100644 examples/llama.android/build.gradle.kts
 create mode 100644 examples/llama.android/gradle.properties
 create mode 100644 examples/llama.android/gradle/wrapper/gradle-wrapper.jar
 create mode 100644 examples/llama.android/gradle/wrapper/gradle-wrapper.properties
 create mode 100755 examples/llama.android/gradlew
 create mode 100644 examples/llama.android/settings.gradle.kts

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0a28a1111..367df07a7 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -515,6 +515,31 @@ jobs:
       - name: Build Xcode project
         run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
 
+  android-build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v3
+
+      - name: Set up JDK
+        uses: actions/setup-java@v3
+        with:
+          java-version: 17
+          distribution: zulu
+
+      - name: Setup Android SDK
+        uses: android-actions/setup-android@v3
+        with:
+          log-accepted-android-sdk-licenses: false
+
+      - name: Build
+        run: |
+          cd examples/llama.android
+
+          # Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
+          ./gradlew build --no-daemon -Pskip-armeabi-v7a
+
 #  freeBSD-latest:
 #    runs-on: macos-12
 #    steps:
diff --git a/examples/llama.android/.gitignore b/examples/llama.android/.gitignore
new file mode 100644
index 000000000..347e252ef
--- /dev/null
+++ b/examples/llama.android/.gitignore
@@ -0,0 +1,33 @@
+# Gradle files
+.gradle/
+build/
+
+# Local configuration file (sdk path, etc)
+local.properties
+
+# Log/OS Files
+*.log
+
+# Android Studio generated files and folders
+captures/
+.externalNativeBuild/
+.cxx/
+*.apk
+output.json
+
+# IntelliJ
+*.iml
+.idea/
+misc.xml
+deploymentTargetDropDown.xml
+render.experimental.xml
+
+# Keystore files
+*.jks
+*.keystore
+
+# Google Services (e.g. APIs or Firebase)
+google-services.json
+
+# Android Profiling
+*.hprof
diff --git a/examples/llama.android/README.md b/examples/llama.android/README.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/llama.android/app/.gitignore b/examples/llama.android/app/.gitignore
new file mode 100644
index 000000000..796b96d1c
--- /dev/null
+++ b/examples/llama.android/app/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts
new file mode 100644
index 000000000..7815a8025
--- /dev/null
+++ b/examples/llama.android/app/build.gradle.kts
@@ -0,0 +1,91 @@
+plugins {
+    id("com.android.application")
+    id("org.jetbrains.kotlin.android")
+}
+
+android {
+    namespace = "com.example.llama"
+    compileSdk = 34
+
+    ndkVersion = "26.1.10909125"
+
+    defaultConfig {
+        applicationId = "com.example.llama"
+        minSdk = 33
+        targetSdk = 34
+        versionCode = 1
+        versionName = "1.0"
+
+        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
+        vectorDrawables {
+            useSupportLibrary = true
+        }
+        ndk {
+            // Workaround for https://github.com/llvm/llvm-project/issues/65820
+            // affecting armeabi-v7a. Skip armeabi-v7a when invoked with
+            // -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
+            if (project.hasProperty("skip-armeabi-v7a")) {
+                abiFilters += listOf("arm64-v8a", "x86_64", "x86")
+            }
+        }
+        externalNativeBuild {
+            cmake {
+                cppFlags += listOf()
+                arguments += listOf()
+            }
+        }
+    }
+
+    buildTypes {
+        release {
+            isMinifyEnabled = false
+            proguardFiles(
+                getDefaultProguardFile("proguard-android-optimize.txt"),
+                "proguard-rules.pro"
+            )
+        }
+    }
+    compileOptions {
+        sourceCompatibility = JavaVersion.VERSION_1_8
+        targetCompatibility = JavaVersion.VERSION_1_8
+    }
+    kotlinOptions {
+        jvmTarget = "1.8"
+    }
+    buildFeatures {
+        compose = true
+    }
+    composeOptions {
+        kotlinCompilerExtensionVersion = "1.5.1"
+    }
+    packaging {
+        resources {
+            excludes += "/META-INF/{AL2.0,LGPL2.1}"
+        }
+    }
+    externalNativeBuild {
+        cmake {
+            path = file("src/main/cpp/CMakeLists.txt")
+            version = "3.22.1"
+        }
+    }
+}
+
+dependencies {
+
+    implementation("androidx.core:core-ktx:1.12.0")
+    implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
+    implementation("androidx.activity:activity-compose:1.8.2")
+    implementation(platform("androidx.compose:compose-bom:2023.08.00"))
+    implementation("androidx.compose.ui:ui")
+    implementation("androidx.compose.ui:ui-graphics")
+    implementation("androidx.compose.ui:ui-tooling-preview")
+    implementation("androidx.compose.material3:material3")
+    testImplementation("junit:junit:4.13.2")
+    androidTestImplementation("androidx.test.ext:junit:1.1.5")
+    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
+    androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
+    androidTestImplementation("androidx.compose.ui:ui-test-junit4")
+    debugImplementation("androidx.compose.ui:ui-tooling")
+    debugImplementation("androidx.compose.ui:ui-test-manifest")
+}
diff --git a/examples/llama.android/app/proguard-rules.pro b/examples/llama.android/app/proguard-rules.pro
new file mode 100644
index 000000000..f1b424510
--- /dev/null
+++ b/examples/llama.android/app/proguard-rules.pro
@@ -0,0 +1,21 @@
+# Add project specific ProGuard rules here.
+# You can control the set of applied configuration files using the
+# proguardFiles setting in build.gradle.
+#
+# For more details, see
+#   http://developer.android.com/guide/developing/tools/proguard.html
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+#   public *;
+#}
+
+# Uncomment this to preserve the line number information for
+# debugging stack traces.
+#-keepattributes SourceFile,LineNumberTable
+
+# If you keep the line number information, uncomment this to
+# hide the original source file name.
+#-renamesourcefileattribute SourceFile
diff --git a/examples/llama.android/app/src/main/AndroidManifest.xml b/examples/llama.android/app/src/main/AndroidManifest.xml
new file mode 100644
index 000000000..41a358a29
--- /dev/null
+++ b/examples/llama.android/app/src/main/AndroidManifest.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools">
+
+    <uses-permission android:name="android.permission.INTERNET" />
+
+    <application
+        android:allowBackup="true"
+        android:dataExtractionRules="@xml/data_extraction_rules"
+        android:fullBackupContent="@xml/backup_rules"
+        android:icon="@mipmap/ic_launcher"
+        android:label="@string/app_name"
+        android:roundIcon="@mipmap/ic_launcher_round"
+        android:supportsRtl="true"
+        android:theme="@style/Theme.LlamaAndroid"
+        >
+
+        <activity
+            android:name=".MainActivity"
+            android:exported="true"
+            android:theme="@style/Theme.LlamaAndroid">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+
+</manifest>
diff --git a/examples/llama.android/app/src/main/cpp/CMakeLists.txt b/examples/llama.android/app/src/main/cpp/CMakeLists.txt
new file mode 100644
index 000000000..85139329a
--- /dev/null
+++ b/examples/llama.android/app/src/main/cpp/CMakeLists.txt
@@ -0,0 +1,50 @@
+
+# For more information about using CMake with Android Studio, read the
+# documentation: https://d.android.com/studio/projects/add-native-code.html.
+# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
+
+# Sets the minimum CMake version required for this project.
+cmake_minimum_required(VERSION 3.22.1)
+
+# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
+# Since this is the top level CMakeLists.txt, the project name is also accessible
+# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
+# build script scope).
+project("llama-android")
+
+include(FetchContent)
+FetchContent_Declare(
+        llama
+        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+        GIT_TAG        master
+)
+
+# Also provides "common"
+FetchContent_MakeAvailable(llama)
+
+# Creates and names a library, sets it as either STATIC
+# or SHARED, and provides the relative paths to its source code.
+# You can define multiple libraries, and CMake builds them for you.
+# Gradle automatically packages shared libraries with your APK.
+#
+# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
+# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
+# is preferred for the same purpose.
+#
+# In order to load a library into your app from Java/Kotlin, you must call
+# System.loadLibrary() and pass the name of the library defined here;
+# for GameActivity/NativeActivity derived applications, the same library name must be
+# used in the AndroidManifest.xml file.
+add_library(${CMAKE_PROJECT_NAME} SHARED
+    # List C/C++ source files with relative paths to this CMakeLists.txt.
+    llama-android.cpp)
+
+# Specifies libraries CMake should link to your target library. You
+# can link libraries from various origins, such as libraries defined in this
+# build script, prebuilt third-party libraries, or Android system libraries.
+target_link_libraries(${CMAKE_PROJECT_NAME}
+    # List libraries link to the target library
+    llama
+    common
+    android
+    log)
diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp
new file mode 100644
index 000000000..d5e705dce
--- /dev/null
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -0,0 +1,394 @@
+#include <android/log.h>
+#include <jni.h>
+#include <iomanip>
+#include <math.h>
+#include <string>
+#include <unistd.h>
+#include "llama.h"
+#include "common/common.h"
+
+// Write C++ code here.
+//
+// Do not forget to dynamically load the C++ library into your application.
+//
+// For instance,
+//
+// In MainActivity.java:
+//    static {
+//       System.loadLibrary("llama-android");
+//    }
+//
+// Or, in MainActivity.kt:
+//    companion object {
+//      init {
+//         System.loadLibrary("llama-android")
+//      }
+//    }
+
+#define TAG "llama-android.cpp"
+#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
+#define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
+
+jclass la_int_var;
+jmethodID la_int_var_value;
+jmethodID la_int_var_inc;
+
+static void log_callback(ggml_log_level level, const char * fmt, void * data) {
+    if (level == GGML_LOG_LEVEL_ERROR)     __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
+    else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
+    else if (level == GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data);
+    else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data);
+}
+
+extern "C"
+JNIEXPORT jlong JNICALL
+Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
+    llama_model_params model_params = llama_model_default_params();
+
+    auto path_to_model = env->GetStringUTFChars(filename, 0);
+    LOGi("Loading model from %s", path_to_model);
+
+    auto model = llama_load_model_from_file(path_to_model, model_params);
+    env->ReleaseStringUTFChars(filename, path_to_model);
+
+    if (!model) {
+        LOGe("load_model() failed");
+        env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed");
+        return 0;
+    }
+
+    return reinterpret_cast<jlong>(model);
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
+    llama_free_model(reinterpret_cast<llama_model *>(model));
+}
+
+extern "C"
+JNIEXPORT jlong JNICALL
+Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
+    auto model = reinterpret_cast<llama_model *>(jmodel);
+
+    if (!model) {
+        LOGe("new_context(): model cannot be null");
+        env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null");
+        return 0;
+    }
+
+    int n_threads = std::max(1, std::min(8, (int) sysconf(_SC_NPROCESSORS_ONLN) - 2));
+    LOGi("Using %d threads", n_threads);
+
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.seed  = 1234;
+    ctx_params.n_ctx = 2048;
+    ctx_params.n_threads       = n_threads;
+    ctx_params.n_threads_batch = n_threads;
+
+    llama_context * context = llama_new_context_with_model(model, ctx_params);
+
+    if (!context) {
+        LOGe("llama_new_context_with_model() returned null)");
+        env->ThrowNew(env->FindClass("java/lang/IllegalStateException"),
+                      "llama_new_context_with_model() returned null)");
+        return 0;
+    }
+
+    return reinterpret_cast<jlong>(context);
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
+    llama_free(reinterpret_cast<llama_context *>(context));
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
+    llama_backend_free();
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
+    llama_log_set(log_callback, NULL);
+}
+
+extern "C"
+JNIEXPORT jstring JNICALL
+Java_com_example_llama_Llm_bench_1model(
+        JNIEnv *env,
+        jobject,
+        jlong context_pointer,
+        jlong model_pointer,
+        jlong batch_pointer,
+        jint pp,
+        jint tg,
+        jint pl,
+        jint nr
+        ) {
+    auto pp_avg = 0.0;
+    auto tg_avg = 0.0;
+    auto pp_std = 0.0;
+    auto tg_std = 0.0;
+
+    const auto context = reinterpret_cast<llama_context *>(context_pointer);
+    const auto model = reinterpret_cast<llama_model *>(model_pointer);
+    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+
+    const int n_ctx = llama_n_ctx(context);
+
+    LOGi("n_ctx = %d", n_ctx);
+
+    int i, j;
+    int nri;
+    for (nri = 0; nri < nr; nri++) {
+        LOGi("Benchmark prompt processing (pp)");
+
+        llama_batch_clear(*batch);
+
+        const int n_tokens = pp;
+        for (i = 0; i < n_tokens; i++) {
+            llama_batch_add(*batch, 0, i, { 0 }, false);
+        }
+
+        batch->logits[batch->n_tokens - 1] = true;
+        llama_kv_cache_clear(context);
+
+        const auto t_pp_start = ggml_time_us();
+        if (llama_decode(context, *batch) != 0) {
+            LOGi("llama_decode() failed during prompt processing");
+        }
+        const auto t_pp_end = ggml_time_us();
+
+        // bench text generation
+
+        LOGi("Benchmark text generation (tg)");
+
+        llama_kv_cache_clear(context);
+        const auto t_tg_start = ggml_time_us();
+        for (i = 0; i < tg; i++) {
+
+            llama_batch_clear(*batch);
+            for (j = 0; j < pl; j++) {
+                llama_batch_add(*batch, 0, i, { j }, true);
+            }
+
+            LOGi("llama_decode() text generation: %d", i);
+            if (llama_decode(context, *batch) != 0) {
+                LOGi("llama_decode() failed during text generation");
+            }
+        }
+
+        const auto t_tg_end = ggml_time_us();
+
+        llama_kv_cache_clear(context);
+
+        const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
+        const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
+
+        const auto speed_pp = double(pp) / t_pp;
+        const auto speed_tg = double(pl * tg) / t_tg;
+
+        pp_avg += speed_pp;
+        tg_avg += speed_tg;
+
+        pp_std += speed_pp * speed_pp;
+        tg_std += speed_tg * speed_tg;
+
+        LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg);
+    }
+
+    pp_avg /= double(nr);
+    tg_avg /= double(nr);
+
+    if (nr > 1) {
+        pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1));
+        tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1));
+    } else {
+        pp_std = 0;
+        tg_std = 0;
+    }
+
+    char model_desc[128];
+    llama_model_desc(model, model_desc, sizeof(model_desc));
+
+    const auto model_size     = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0;
+    const auto model_n_params = double(llama_model_n_params(model)) / 1e9;
+
+    const auto backend    = "(Android)"; // TODO: What should this be?
+
+    std::stringstream result;
+    result << std::setprecision(2);
+    result << "| model | size | params | backend | test | t/s |\n";
+    result << "| --- | --- | --- | --- | --- | --- |\n";
+    result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n";
+    result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n";
+
+    return env->NewStringUTF(result.str().c_str());
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
+}
+
+extern "C"
+JNIEXPORT jlong JNICALL
+Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
+
+    // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
+
+    llama_batch *batch = new llama_batch {
+        0,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        0,
+        0,
+        0,
+    };
+
+    if (embd) {
+        batch->embd = (float *) malloc(sizeof(float) * n_tokens * embd);
+    } else {
+        batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
+    }
+
+    batch->pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens);
+    batch->n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens);
+    batch->seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
+    for (int i = 0; i < n_tokens; ++i) {
+        batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
+    }
+    batch->logits   = (int8_t *)        malloc(sizeof(int8_t)         * n_tokens);
+
+    return reinterpret_cast<jlong>(batch);
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) {
+    llama_backend_init(numa);
+}
+
+extern "C"
+JNIEXPORT jstring JNICALL
+Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
+    return env->NewStringUTF(llama_print_system_info());
+}
+
+extern "C"
+JNIEXPORT jint JNICALL
+Java_com_example_llama_Llm_completion_1init(
+        JNIEnv *env,
+        jobject,
+        jlong context_pointer,
+        jlong batch_pointer,
+        jstring jtext,
+        jint n_len
+    ) {
+
+    const auto text = env->GetStringUTFChars(jtext, 0);
+    const auto context = reinterpret_cast<llama_context *>(context_pointer);
+    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+
+    const auto tokens_list = llama_tokenize(context, text, 1);
+
+    auto n_ctx = llama_n_ctx(context);
+    auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
+
+    LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
+
+    if (n_kv_req > n_ctx) {
+        LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough");
+    }
+
+    for (auto id : tokens_list) {
+        LOGi("%s", llama_token_to_piece(context, id).c_str());
+    }
+
+    llama_batch_clear(*batch);
+
+    // evaluate the initial prompt
+    for (auto i = 0; i < tokens_list.size(); i++) {
+        llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
+    }
+
+    // llama_decode will output logits only for the last token of the prompt
+    batch->logits[batch->n_tokens - 1] = true;
+
+    if (llama_decode(context, *batch) != 0) {
+        LOGe("llama_decode() failed");
+    }
+
+    env->ReleaseStringUTFChars(jtext, text);
+
+    return batch->n_tokens;
+}
+
+extern "C"
+JNIEXPORT jstring JNICALL
+Java_com_example_llama_Llm_completion_1loop(
+        JNIEnv * env,
+        jobject,
+        jlong context_pointer,
+        jlong batch_pointer,
+        jint n_len,
+        jobject intvar_ncur
+) {
+    const auto context = reinterpret_cast<llama_context *>(context_pointer);
+    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+    const auto model = llama_get_model(context);
+
+    if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
+    if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
+    if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
+
+    auto n_vocab = llama_n_vocab(model);
+    auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
+
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+    }
+
+    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+    // sample the most likely token
+    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
+
+    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
+    if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+        return env->NewStringUTF("");
+    }
+
+    auto new_token_chars = llama_token_to_piece(context, new_token_id);
+    LOGi("new_token_chars: `%s`", new_token_chars.c_str());
+    auto new_token = env->NewStringUTF(new_token_chars.c_str());
+
+    llama_batch_clear(*batch);
+    llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
+
+    env->CallVoidMethod(intvar_ncur, la_int_var_inc);
+
+    if (llama_decode(context, *batch) != 0) {
+        LOGe("llama_decode() returned null");
+    }
+
+    return new_token;
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
+    llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
+}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt
new file mode 100644
index 000000000..78c231ae5
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt
@@ -0,0 +1,119 @@
+package com.example.llama
+
+import android.app.DownloadManager
+import android.net.Uri
+import android.util.Log
+import androidx.compose.material3.Button
+import androidx.compose.material3.Text
+import androidx.compose.runtime.Composable
+import androidx.compose.runtime.getValue
+import androidx.compose.runtime.mutableDoubleStateOf
+import androidx.compose.runtime.mutableStateOf
+import androidx.compose.runtime.remember
+import androidx.compose.runtime.rememberCoroutineScope
+import androidx.compose.runtime.setValue
+import androidx.core.database.getLongOrNull
+import androidx.core.net.toUri
+import kotlinx.coroutines.delay
+import kotlinx.coroutines.launch
+import java.io.File
+
+data class Downloadable(val name: String, val source: Uri, val destination: File) {
+    companion object {
+        @JvmStatic
+        private val tag: String? = this::class.qualifiedName
+
+        sealed interface State
+        data object Ready: State
+        data class Downloading(val id: Long): State
+        data class Downloaded(val downloadable: Downloadable): State
+        data class Error(val message: String): State
+
+        @JvmStatic
+        @Composable
+        fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) {
+            var status: State by remember {
+                mutableStateOf(
+                    if (item.destination.exists()) Downloaded(item)
+                    else Ready
+                )
+            }
+            var progress by remember { mutableDoubleStateOf(0.0) }
+
+            val coroutineScope = rememberCoroutineScope()
+
+            suspend fun waitForDownload(result: Downloading, item: Downloadable): State {
+                while (true) {
+                    val cursor = dm.query(DownloadManager.Query().setFilterById(result.id))
+
+                    if (cursor == null) {
+                        Log.e(tag, "dm.query() returned null")
+                        return Error("dm.query() returned null")
+                    }
+
+                    if (!cursor.moveToFirst() || cursor.count < 1) {
+                        cursor.close()
+                        Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?")
+                        return Ready
+                    }
+
+                    val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR)
+                    val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES)
+                    val sofar = cursor.getLongOrNull(pix) ?: 0
+                    val total = cursor.getLongOrNull(tix) ?: 1
+                    cursor.close()
+
+                    if (sofar == total) {
+                        return Downloaded(item)
+                    }
+
+                    progress = (sofar * 1.0) / total
+
+                    delay(1000L)
+                }
+            }
+
+            fun onClick() {
+                when (val s = status) {
+                    is Downloaded -> {
+                        viewModel.load(item.destination.path)
+                    }
+
+                    is Downloading -> {
+                        coroutineScope.launch {
+                            status = waitForDownload(s, item)
+                        }
+                    }
+
+                    else -> {
+                        item.destination.delete()
+
+                        val request = DownloadManager.Request(item.source).apply {
+                            setTitle("Downloading model")
+                            setDescription("Downloading model: ${item.name}")
+                            setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI)
+                            setDestinationUri(item.destination.toUri())
+                        }
+
+                        viewModel.log("Saving ${item.name} to ${item.destination.path}")
+                        Log.i(tag, "Saving ${item.name} to ${item.destination.path}")
+
+                        val id = dm.enqueue(request)
+                        status = Downloading(id)
+                        onClick()
+                    }
+                }
+            }
+
+            Button(onClick = { onClick() }, enabled = status !is Downloading) {
+                when (status) {
+                    is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%")
+                    is Downloaded -> Text("Load ${item.name}")
+                    is Ready -> Text("Download ${item.name}")
+                    is Error -> Text("Download ${item.name}")
+                }
+            }
+        }
+
+    }
+}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
new file mode 100644
index 000000000..5f3270372
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
@@ -0,0 +1,172 @@
+package com.example.llama
+
+import android.util.Log
+import kotlinx.coroutines.CoroutineDispatcher
+import kotlinx.coroutines.asCoroutineDispatcher
+import kotlinx.coroutines.flow.Flow
+import kotlinx.coroutines.flow.flow
+import kotlinx.coroutines.flow.flowOn
+import kotlinx.coroutines.withContext
+import java.util.concurrent.Executors
+import kotlin.concurrent.thread
+
+class Llm {
+    private val tag: String? = this::class.simpleName
+
+    private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
+
+    private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor {
+        thread(start = false, name = "Llm-RunLoop") {
+            Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}")
+
+            // No-op if called more than once.
+            System.loadLibrary("llama-android")
+
+            // Set llama log handler to Android
+            log_to_android()
+            backend_init(false)
+
+            Log.d(tag, system_info())
+
+            it.run()
+        }.apply {
+            uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable ->
+                Log.e(tag, "Unhandled exception", exception)
+            }
+        }
+    }.asCoroutineDispatcher()
+
+    private val nlen: Int = 64
+
+    private external fun log_to_android()
+    private external fun load_model(filename: String): Long
+    private external fun free_model(model: Long)
+    private external fun new_context(model: Long): Long
+    private external fun free_context(context: Long)
+    private external fun backend_init(numa: Boolean)
+    private external fun backend_free()
+    private external fun free_batch(batch: Long)
+    private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
+    private external fun bench_model(
+        context: Long,
+        model: Long,
+        batch: Long,
+        pp: Int,
+        tg: Int,
+        pl: Int,
+        nr: Int
+    ): String
+
+    private external fun system_info(): String
+
+    private external fun completion_init(
+        context: Long,
+        batch: Long,
+        text: String,
+        nLen: Int
+    ): Int
+
+    private external fun completion_loop(
+        context: Long,
+        batch: Long,
+        nLen: Int,
+        ncur: IntVar
+    ): String
+
+    private external fun kv_cache_clear(context: Long)
+
+    suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String {
+        return withContext(runLoop) {
+            when (val state = threadLocalState.get()) {
+                is State.Loaded -> {
+                    Log.d(tag, "bench(): $state")
+                    bench_model(state.context, state.model, state.batch, pp, tg, pl, nr)
+                }
+
+                else -> throw IllegalStateException("No model loaded")
+            }
+        }
+    }
+
+    suspend fun load(pathToModel: String) {
+        withContext(runLoop) {
+            when (threadLocalState.get()) {
+                is State.Idle -> {
+                    val model = load_model(pathToModel)
+                    if (model == 0L)  throw IllegalStateException("load_model() failed")
+
+                    val context = new_context(model)
+                    if (context == 0L) throw IllegalStateException("new_context() failed")
+
+                    val batch = new_batch(512, 0, 1)
+                    if (batch == 0L) throw IllegalStateException("new_batch() failed")
+
+                    Log.i(tag, "Loaded model $pathToModel")
+                    threadLocalState.set(State.Loaded(model, context, batch))
+                }
+                else -> throw IllegalStateException("Model already loaded")
+            }
+        }
+    }
+
+    fun send(message: String): Flow<String> = flow {
+        when (val state = threadLocalState.get()) {
+            is State.Loaded -> {
+                val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
+                while (ncur.value <= nlen) {
+                    val str = completion_loop(state.context, state.batch, nlen, ncur)
+                    if (str.isEmpty()) {
+                        break
+                    }
+                    emit(str)
+                }
+                kv_cache_clear(state.context)
+            }
+            else -> {}
+        }
+    }.flowOn(runLoop)
+
+    /**
+     * Unloads the model and frees resources.
+     *
+     * This is a no-op if there's no model loaded.
+     */
+    suspend fun unload() {
+        withContext(runLoop) {
+            when (val state = threadLocalState.get()) {
+                is State.Loaded -> {
+                    free_context(state.context)
+                    free_model(state.model)
+                    free_batch(state.batch)
+
+                    threadLocalState.set(State.Idle)
+                }
+                else -> {}
+            }
+        }
+    }
+
+    companion object {
+        private class IntVar(value: Int) {
+            @Volatile
+            var value: Int = value
+                private set
+
+            fun inc() {
+                synchronized(this) {
+                    value += 1
+                }
+            }
+        }
+
+        private sealed interface State {
+            data object Idle: State
+            data class Loaded(val model: Long, val context: Long, val batch: Long): State
+        }
+
+        // Enforce only one instance of Llm.
+        private val _instance: Llm = Llm()
+
+        fun instance(): Llm = _instance
+    }
+}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
new file mode 100644
index 000000000..9da04f7d3
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
@@ -0,0 +1,154 @@
+package com.example.llama
+
+import android.app.ActivityManager
+import android.app.DownloadManager
+import android.content.ClipData
+import android.content.ClipboardManager
+import android.net.Uri
+import android.os.Bundle
+import android.os.StrictMode
+import android.os.StrictMode.VmPolicy
+import android.text.format.Formatter
+import androidx.activity.ComponentActivity
+import androidx.activity.compose.setContent
+import androidx.activity.viewModels
+import androidx.compose.foundation.layout.Box
+import androidx.compose.foundation.layout.Column
+import androidx.compose.foundation.layout.Row
+import androidx.compose.foundation.layout.fillMaxSize
+import androidx.compose.foundation.layout.padding
+import androidx.compose.foundation.lazy.LazyColumn
+import androidx.compose.foundation.lazy.items
+import androidx.compose.foundation.lazy.rememberLazyListState
+import androidx.compose.material3.Button
+import androidx.compose.material3.LocalContentColor
+import androidx.compose.material3.MaterialTheme
+import androidx.compose.material3.OutlinedTextField
+import androidx.compose.material3.Surface
+import androidx.compose.material3.Text
+import androidx.compose.runtime.Composable
+import androidx.compose.ui.Modifier
+import androidx.compose.ui.unit.dp
+import androidx.core.content.getSystemService
+import com.example.llama.ui.theme.LlamaAndroidTheme
+import java.io.File
+
+class MainActivity(
+    activityManager: ActivityManager? = null,
+    downloadManager: DownloadManager? = null,
+    clipboardManager: ClipboardManager? = null,
+): ComponentActivity() {
+    private val tag: String? = this::class.simpleName
+
+    private val activityManager by lazy { activityManager ?: getSystemService<ActivityManager>()!! }
+    private val downloadManager by lazy { downloadManager ?: getSystemService<DownloadManager>()!! }
+    private val clipboardManager by lazy { clipboardManager ?: getSystemService<ClipboardManager>()!! }
+
+    private val viewModel: MainViewModel by viewModels()
+
+    // Get a MemoryInfo object for the device's current memory status.
+    private fun availableMemory(): ActivityManager.MemoryInfo {
+        return ActivityManager.MemoryInfo().also { memoryInfo ->
+            activityManager.getMemoryInfo(memoryInfo)
+        }
+    }
+
+    override fun onCreate(savedInstanceState: Bundle?) {
+        super.onCreate(savedInstanceState)
+
+        StrictMode.setVmPolicy(
+            VmPolicy.Builder(StrictMode.getVmPolicy())
+                .detectLeakedClosableObjects()
+                .build()
+        )
+
+        val free = Formatter.formatFileSize(this, availableMemory().availMem)
+        val total = Formatter.formatFileSize(this, availableMemory().totalMem)
+
+        viewModel.log("Current memory: $free / $total")
+        viewModel.log("Downloads directory: ${getExternalFilesDir(null)}")
+
+        val extFilesDir = getExternalFilesDir(null)
+
+        val models = listOf(
+            Downloadable(
+                "Phi-2 7B (Q4_0, 1.6 GiB)",
+                Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
+                File(extFilesDir, "phi-2-q4_0.gguf"),
+            ),
+            Downloadable(
+                "TinyLlama 1.1B (f16, 2.2 GiB)",
+                Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
+                File(extFilesDir, "tinyllama-1.1-f16.gguf"),
+            ),
+            Downloadable(
+                "Phi 2 DPO (Q3_K_M, 1.48 GiB)",
+                Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
+                File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
+            ),
+        )
+
+        setContent {
+            LlamaAndroidTheme {
+                // A surface container using the 'background' color from the theme
+                Surface(
+                    modifier = Modifier.fillMaxSize(),
+                    color = MaterialTheme.colorScheme.background
+                ) {
+                    MainCompose(
+                        viewModel,
+                        clipboardManager,
+                        downloadManager,
+                        models,
+                    )
+                }
+
+            }
+        }
+    }
+}
+
+@Composable
+fun MainCompose(
+    viewModel: MainViewModel,
+    clipboard: ClipboardManager,
+    dm: DownloadManager,
+    models: List<Downloadable>
+) {
+    Column {
+        val scrollState = rememberLazyListState()
+
+        Box(modifier = Modifier.weight(1f)) {
+            LazyColumn(state = scrollState) {
+                items(viewModel.messages) {
+                    Text(
+                        it,
+                        style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current),
+                        modifier = Modifier.padding(16.dp)
+                    )
+                }
+            }
+        }
+        OutlinedTextField(
+            value = viewModel.message,
+            onValueChange = { viewModel.updateMessage(it) },
+            label = { Text("Message") },
+        )
+        Row {
+            Button({ viewModel.send() }) { Text("Send") }
+            Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") }
+            Button({ viewModel.clear() }) { Text("Clear") }
+            Button({
+                viewModel.messages.joinToString("\n").let {
+                    clipboard.setPrimaryClip(ClipData.newPlainText("", it))
+                }
+            }) { Text("Copy") }
+        }
+
+        Column {
+            for (model in models) {
+                Downloadable.Button(viewModel, dm, model)
+            }
+        }
+    }
+}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
new file mode 100644
index 000000000..be95e2221
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
@@ -0,0 +1,104 @@
+package com.example.llama
+
+import android.util.Log
+import androidx.compose.runtime.getValue
+import androidx.compose.runtime.mutableStateOf
+import androidx.compose.runtime.setValue
+import androidx.lifecycle.ViewModel
+import androidx.lifecycle.viewModelScope
+import kotlinx.coroutines.flow.catch
+import kotlinx.coroutines.launch
+
+class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
+    companion object {
+        @JvmStatic
+        private val NanosPerSecond = 1_000_000_000.0
+    }
+
+    private val tag: String? = this::class.simpleName
+
+    var messages by mutableStateOf(listOf("Initializing..."))
+        private set
+
+    var message by mutableStateOf("")
+        private set
+
+    override fun onCleared() {
+        super.onCleared()
+
+        viewModelScope.launch {
+            try {
+                llm.unload()
+            } catch (exc: IllegalStateException) {
+                messages += exc.message!!
+            }
+        }
+    }
+
+    fun send() {
+        val text = message
+        message = ""
+
+        // Add to messages console.
+        messages += text
+        messages += ""
+
+        viewModelScope.launch {
+            llm.send(text)
+                .catch {
+                    Log.e(tag, "send() failed", it)
+                    messages += it.message!!
+                }
+                .collect { messages = messages.dropLast(1) + (messages.last() + it) }
+        }
+    }
+
+    fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) {
+        viewModelScope.launch {
+            try {
+                val start = System.nanoTime()
+                val warmupResult = llm.bench(pp, tg, pl, nr)
+                val end = System.nanoTime()
+
+                messages += warmupResult
+
+                val warmup = (end - start).toDouble() / NanosPerSecond
+                messages += "Warm up time: $warmup seconds, please wait..."
+
+                if (warmup > 5.0) {
+                    messages += "Warm up took too long, aborting benchmark"
+                    return@launch
+                }
+
+                messages += llm.bench(512, 128, 1, 3)
+            } catch (exc: IllegalStateException) {
+                Log.e(tag, "bench() failed", exc)
+                messages += exc.message!!
+            }
+        }
+    }
+
+    fun load(pathToModel: String) {
+        viewModelScope.launch {
+            try {
+                llm.load(pathToModel)
+                messages += "Loaded $pathToModel"
+            } catch (exc: IllegalStateException) {
+                Log.e(tag, "load() failed", exc)
+                messages += exc.message!!
+            }
+        }
+    }
+
+    fun updateMessage(newMessage: String) {
+        message = newMessage
+    }
+
+    fun clear() {
+        messages = listOf()
+    }
+
+    fun log(message: String) {
+        messages += message
+    }
+}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt
new file mode 100644
index 000000000..40c30e8d9
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt
@@ -0,0 +1,11 @@
+package com.example.llama.ui.theme
+
+import androidx.compose.ui.graphics.Color
+
+val Purple80 = Color(0xFFD0BCFF)
+val PurpleGrey80 = Color(0xFFCCC2DC)
+val Pink80 = Color(0xFFEFB8C8)
+
+val Purple40 = Color(0xFF6650a4)
+val PurpleGrey40 = Color(0xFF625b71)
+val Pink40 = Color(0xFF7D5260)
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt
new file mode 100644
index 000000000..e742220a8
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt
@@ -0,0 +1,70 @@
+package com.example.llama.ui.theme
+
+import android.app.Activity
+import android.os.Build
+import androidx.compose.foundation.isSystemInDarkTheme
+import androidx.compose.material3.MaterialTheme
+import androidx.compose.material3.darkColorScheme
+import androidx.compose.material3.dynamicDarkColorScheme
+import androidx.compose.material3.dynamicLightColorScheme
+import androidx.compose.material3.lightColorScheme
+import androidx.compose.runtime.Composable
+import androidx.compose.runtime.SideEffect
+import androidx.compose.ui.graphics.toArgb
+import androidx.compose.ui.platform.LocalContext
+import androidx.compose.ui.platform.LocalView
+import androidx.core.view.WindowCompat
+
+private val DarkColorScheme = darkColorScheme(
+    primary = Purple80,
+    secondary = PurpleGrey80,
+    tertiary = Pink80
+)
+
+private val LightColorScheme = lightColorScheme(
+    primary = Purple40,
+    secondary = PurpleGrey40,
+    tertiary = Pink40
+
+    /* Other default colors to override
+    background = Color(0xFFFFFBFE),
+    surface = Color(0xFFFFFBFE),
+    onPrimary = Color.White,
+    onSecondary = Color.White,
+    onTertiary = Color.White,
+    onBackground = Color(0xFF1C1B1F),
+    onSurface = Color(0xFF1C1B1F),
+    */
+)
+
+@Composable
+fun LlamaAndroidTheme(
+    darkTheme: Boolean = isSystemInDarkTheme(),
+    // Dynamic color is available on Android 12+
+    dynamicColor: Boolean = true,
+    content: @Composable () -> Unit
+) {
+    val colorScheme = when {
+        dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> {
+            val context = LocalContext.current
+            if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context)
+        }
+
+        darkTheme -> DarkColorScheme
+        else -> LightColorScheme
+    }
+    val view = LocalView.current
+    if (!view.isInEditMode) {
+        SideEffect {
+            val window = (view.context as Activity).window
+            window.statusBarColor = colorScheme.primary.toArgb()
+            WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
+        }
+    }
+
+    MaterialTheme(
+        colorScheme = colorScheme,
+        typography = Typography,
+        content = content
+    )
+}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt
new file mode 100644
index 000000000..0b87946ca
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt
@@ -0,0 +1,34 @@
+package com.example.llama.ui.theme
+
+import androidx.compose.material3.Typography
+import androidx.compose.ui.text.TextStyle
+import androidx.compose.ui.text.font.FontFamily
+import androidx.compose.ui.text.font.FontWeight
+import androidx.compose.ui.unit.sp
+
+// Set of Material typography styles to start with
+val Typography = Typography(
+    bodyLarge = TextStyle(
+        fontFamily = FontFamily.Default,
+        fontWeight = FontWeight.Normal,
+        fontSize = 16.sp,
+        lineHeight = 24.sp,
+        letterSpacing = 0.5.sp
+    )
+    /* Other default text styles to override
+    titleLarge = TextStyle(
+        fontFamily = FontFamily.Default,
+        fontWeight = FontWeight.Normal,
+        fontSize = 22.sp,
+        lineHeight = 28.sp,
+        letterSpacing = 0.sp
+    ),
+    labelSmall = TextStyle(
+        fontFamily = FontFamily.Default,
+        fontWeight = FontWeight.Medium,
+        fontSize = 11.sp,
+        lineHeight = 16.sp,
+        letterSpacing = 0.5.sp
+    )
+    */
+)
diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml
new file mode 100644
index 000000000..07d5da9cb
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml
@@ -0,0 +1,170 @@
+<?xml version="1.0" encoding="utf-8"?>
+<vector xmlns:android="http://schemas.android.com/apk/res/android"
+    android:width="108dp"
+    android:height="108dp"
+    android:viewportWidth="108"
+    android:viewportHeight="108">
+    <path
+        android:fillColor="#3DDC84"
+        android:pathData="M0,0h108v108h-108z" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M9,0L9,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,0L19,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M29,0L29,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M39,0L39,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M49,0L49,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M59,0L59,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M69,0L69,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M79,0L79,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M89,0L89,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M99,0L99,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,9L108,9"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,19L108,19"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,29L108,29"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,39L108,39"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,49L108,49"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,59L108,59"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,69L108,69"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,79L108,79"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,89L108,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,99L108,99"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,29L89,29"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,39L89,39"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,49L89,49"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,59L89,59"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,69L89,69"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,79L89,79"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M29,19L29,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M39,19L39,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M49,19L49,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M59,19L59,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M69,19L69,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M79,19L79,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+</vector>
diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml
new file mode 100644
index 000000000..7706ab9e6
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml
@@ -0,0 +1,30 @@
+<vector xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:aapt="http://schemas.android.com/aapt"
+    android:width="108dp"
+    android:height="108dp"
+    android:viewportWidth="108"
+    android:viewportHeight="108">
+    <path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
+        <aapt:attr name="android:fillColor">
+            <gradient
+                android:endX="85.84757"
+                android:endY="92.4963"
+                android:startX="42.9492"
+                android:startY="49.59793"
+                android:type="linear">
+                <item
+                    android:color="#44000000"
+                    android:offset="0.0" />
+                <item
+                    android:color="#00000000"
+                    android:offset="1.0" />
+            </gradient>
+        </aapt:attr>
+    </path>
+    <path
+        android:fillColor="#FFFFFF"
+        android:fillType="nonZero"
+        android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
+        android:strokeWidth="1"
+        android:strokeColor="#00000000" />
+</vector>
diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml
new file mode 100644
index 000000000..b3e26b4c6
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="utf-8"?>
+<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
+    <background android:drawable="@drawable/ic_launcher_background" />
+    <foreground android:drawable="@drawable/ic_launcher_foreground" />
+    <monochrome android:drawable="@drawable/ic_launcher_foreground" />
+</adaptive-icon>
diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml
new file mode 100644
index 000000000..b3e26b4c6
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="utf-8"?>
+<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
+    <background android:drawable="@drawable/ic_launcher_background" />
+    <foreground android:drawable="@drawable/ic_launcher_foreground" />
+    <monochrome android:drawable="@drawable/ic_launcher_foreground" />
+</adaptive-icon>
diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp
new file mode 100644
index 0000000000000000000000000000000000000000..c209e78ecd372343283f4157dcfd918ec5165bb3
GIT binary patch
literal 1404
zcmV-?1%vuhNk&F=1pok7MM6+kP&il$0000G0000-002h-06|PpNX!5L00Dqw+t%{r
zzW2vH!KF=w&cMnnN@{whkTw+#mAh0SV?YL=)3MimFYCWp#fpdtz~8$hD5VPuQgtcN
zXl<@<#Cm<R)d19?=)E<{+g@mp0C)CAX%7ksNnpX=jPlJEkqD9%o*fC(U7iySOYHHS
zCLH@bXPyI|^Z)Mc^PG7Oc+NfBJO`d7p5;U`M53Wbd!w|M(MUoNRc2m{@^!wFKzL?&
zx_RAc-^9Azxo%DmXW^9GV0~;n_G9&dym)|QrMEx!y_F=oDunn;1y)cvAc6z{0MHiz
zodGIH07w8nF%*bGq9Gv`YHnm80|d1T$uW=u4vGV%tX#>e5f5yr2h%@8TWh?)bSK`O
z^Z@d={gn7J{iyxL_y_%J|L>ep{dUxUP8a{byupH&!UNR*OutO~0{*T4q5R6@ApLF!
z5{w?Z150gC7#>(VHFJZ-^6O@PYp{t!<r*4*+?g*sysFgiN}}d!-T(>jH(_Z*nzTK4
zkc{fLE4Q3|mA2`CWQ3{8;gxGizgM!zccbdQoOLZc8hThi-IhN90RFT|zlxh3Ty&VG
z?Fe{#<RA5Hg_mG&BZS>9RrRnxzsu|Lg2ddugg7k%>0JeD+{XZ7>Z~{=|M+sh1MF7~
zz>To~`~LVQe1nNoR-gEzkpe{Ak^7{{ZBk2i_<+`Bq<^GB!RYG+z)h;Y3+<{zlMUYd
zrd*W4w&jZ0%kBuDZ1EW&KLpyR7r2=}fF2%0VwHM4pUs}ZI2egi#DRMYZPek*^H9YK
zay4Iy3WXFG(F14xYsoDA|KXgGc5%2DhmQ1gFCkrgHBm!lXG<W<SKpuS2eaDGkL-J-
z!+&UV_N0e<DV*|igwqdC*l{lj0T&&`uq=ycU@f9x0i8jzLa(y*X(YV?PCtUw;ks}p
z4zFn7N(-OG^(9ot1ZhYISOWe9?+l%f6v41n+j<OM$_uJgNP?ZJy}hEMMH=;WiG4I`
zs(bIWwSD<LJnAN(E(Xjr4k(^UYF37F_3f{;E%%FEa&I3I0GbdH@{pD5$m+1PN5CW)
zyZ&*9o#8wstdx@^rci;0B4BP2+H4Y<KbJI5L(bXG(k@`Kp=d>8I5h*uf{rn48Z!_@
z4Bk6TJAB2CKYqPjiX&mWoW>OPFGd$wqro<oln8GL@_6LPC)kg1!M)Y!|NCn7b*0sG
zEN=&c2xMM<>a($ne7EUK;#3V<N-jQ7j($tREa0F&-HzYCQtR#fTCZMRN*ZSm;T7a+
zuxa$}zDL8R9wGYkHb$+gJoiM@z+u{u7a_VUBwtd)bPzHxH}C`W=^2PsBr`s7taBMG
zm#Ss=-o`)W8%%x%>YkXaew%Kh^3OrMht<?zOY6P}#rBhn_hrWY$_P`{#CBR9w?+E6
zt7r#NN-tjxFY{9q75P<|L<ZJBHwn4FhG(&i>jYN?XEoY`tRPQsAkH-DSL^QqyN0>^
zmC>{#F14jz4GeW{pJoRpLFa_*GI{?T93^rX7SPQgT@LbLqpNA}<@2wH;q493)G=1Y
z#-sCiRNX~qf3KgiFzB3I>4Z%AfS(3$`-aMIBU+6?gbgDb!)L~A)je+;fR0jWLL-Fu
z4)<UE%?IC0Du41FrE~F_qc8nOq>P{c7{B4Hp91&%??2$v9iRSFnuckHUm}or9seH6
z>%NbT+5*@L5(I9<vs%8>j@06@(!<!eaZcF<_le1MVaYMg=gRy*f2#IaBH-mJIpy+L
z=Gsbhd6=3>{ZI?U0=pKn8uwIg&L{JV14+8s2hnvbRrU|hZCd}IJu7*;;ECgO%8_*W
Kmw_-CKmY()leWbG

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp
new file mode 100644
index 0000000000000000000000000000000000000000..b2dfe3d1ba5cf3ee31b3ecc1ced89044a1f3b7a9
GIT binary patch
literal 2898
zcmV-Y3$650Nk&FW3jhFDMM6+kP&il$0000G0000-002h-06|PpNWB9900E$G+qN-D
z+81ABX7q?;bwx%xBg?kcwr$(C-Tex-ZCkHUw(Y9#+`E5-zuONG5fgw~E2WDng@Bc@
z24xy+R1n%~6xI#u9vJ8zREI)sb<&Il(016}Z~V1n^PU3-_H17A*Bf^o)&{_uBv}Py
zulRfeE8g(g6<I*pq!<m%e0eKLCnC;y@4a&(;z@3Oc_yGuA}p}r2a+O=6+01<KP&)j
z?X*V!2PDO_%3er*&=0L^6alyFqZiK_dhy(U3lP;LLerOI%$mpKS51g&5Mj)6#*PVe
zF_(`;R5goZ_A_QeW9~l|wn`DsB!!6;@*G4}iEuP2Ot6s0BdUVMnEezcTDX4#Y(*N4
z%PCB_a77DrWnVI8;$wcJDzUhkF$0WwCu~^;eS7IbaNIi-ro8tUGsu`9t8y&n(KArb
zV_-{Zd`}5Q__NX_45pDj6i?2BDQ58^g~1BnfGwhN=w`Zb9Jh2q7g$_Q$ABGgfGsfU
zQ%Xp}ueAZ7(7KK;B*zUoD8S$_dIs%z0xV#07bPs=!^#3izY*Sh+CVAuM|l6Flv1c)
zL>HFhk_?o_;0@tz?1I+l+Y#Q*;RVC?(ud`_cU-~n|AX-b`JHrOIqn(-t&rOg-o`#C
zh0LPxmbOAEb;zHTu!R3LDh1Q<R(Kya7{I0;4Dacb1&lp`!Jlm{pmgtAx{w^#57P>O
zZTf-|lJNUxi-PpcbRjw3n~n-pG;$+dIF6eqM5+L();B2O2tQ~|p{PlpNcvDbd1l%c
zLtXn%lu(3!<myn;X3ipg7@oW+6O}@J$7hVgi1~F#J<2qhIXme>aNK!V#+HNn_D3lp
z2%l+hK-nsj|Bi9;V*WIcQRTt5j9<byX)%{hZi!H7I(x!SO0tBzPRXWGd1Ke*j*=vy
zyQaHQRY5iP-Q*Z2C#JituSKDnx+Q<*4#r7|x$~NQt44KoTmQ*R>0A<=<I>am+cc`J
zTYIN|PsYAhJ|=&h*4wI4ebv-C=Be#u>}%m;a{IGmJDU`0snWS&$9zdrT(z8#{OZ_Y
zxwJx!ZClUi%YJjD6Xz@OP8{ieyJB=tn?>zaI-4JN;rr`JQbb%y5h2O-?_V@7pG_+y
z(lqAsqYr!NyVb0C^|uclHaeecG)Sz;WV?rtoqOdAAN{j%?Uo%owya(F&qps@Id|Of
zo@~Y-(YmfB+chv^%*3g4k3R0WqvuYUIA+8^SGJ{2Bl$X&X&v02>+0$4?di(34{pt*
zG=f#yMs@Y|b&=HyH3k4yP&goF2LJ#tBLJNNDo6lG06r}ghC-pC4Q*=x3;|+W04zte
zAl>l4kzUBQFYF(E`KJy?ZXd1tnfbH+Z~SMmA21KokJNs#eqcXWKUIC>{TuoKe^vhF
z);H)o`t9j~`$h1D`#bxe@E`oE`cM9w(@)5Bp8BNukIwM>wZHfd0S;5bcXA*5KT3bj
zc&_~`&{z7u{Et!Z_k78H75gXf4g8<_ul!H$eVspPeU3j&&Au=2R*Z<QVlXG0%J7Qu
z`uQlm{Q{cWVD7XACdR6KeMUk-Q7>p#M9$9s;fqwgzfiX=E_?BwVcfx3tG9Q-+<5fw
z%Hs64<N1NYeh_oukcz%rOcU>z)@Q*%s3_Xd5>S4d<X%6~`O&m@p+WTqnB(reB<gqb
zpaA~={ur+R)J6BZ_}KqfN1AF`u0i5>g$s>@rN^ixeVj*tqu3ZV)biDcFf&l?lGwsa
zWj3rvK}?43c{IruV2L`hUU0t^MemAn3U~x3$4mFDxj=Byowu^Q+#wKRPrWywLjIAp
z9*n}<YIhnms>eQ9-gZmnd9Y0WHtwi2sn6n~?i#n9VN1B*074_VbZZ=WrpkMYr{RsI
ztM_8X1)J*DZejxkjOTRJ&a*lrvMKBQURNP#K)a5wIitfu(CFYV4FT?LUB$jVwJSZz
zNBFTWg->Y<QdHGXO6(B7DL40#@QH~&1bt_RGfAlw%_YsP19wAkHXw%~G9G(zw;=yC
z_Wta^hs{<khF)Et{~KQ(Y!<^`L|pYl%vB@$I(;3RmQHq?VZ^(}{nUdkKh|wO|NXu)
ze|eLtM-LNkZU|pzO^)wX4?x7Y#55_{=sp>k0j&h3e*a5><wP*B;A~Y_-J8$UU=+E3
zs|^$XdARfHEBrp-b3qaNg~XRwL;d6S=>B=-xM7dE`IuOQna!u$OoxLlE;WdrNlN)1
z7**de7-hZ!(%_ZllHBLg`Ir#|t>2$*xVOZ-ADZKTN?{(NUeLU9GbuG-+Axf*AZ-P1
z0ZZ*f<D6L!WI}YtFrx~d;ZCS=O$ReN3~!sEoYV$RgCJx3D(Cp-Mie$*C4cS*q~E}&
z0BT11xQ>x+ck4{XtFsbcc%GRStht@q!m*ImssGwuK+P@%gEK!f5dHymg<9nSCXsB6
zQ*{<`%^bxB($Z@5286^-A(tR;r+p7B%^%$N5h%lb*Vlz-?DL9x;!j<5>~kmXP$E}m
zQV|7uv4SwFs0jUervsxVUm>&9Y3DBIzc1XW|CUZrUdb<&{@D5yuLe%Xniw^x&{A2s
z0q1<L&7;HiAPZm8Z=iQR8>+owDSfc3Gs?ht;3jw49c#mmrViUfX-yvc_B*wY|Lo7;
zGh!t2R#BHx{1wFXReX*~`NS-<fA!XHlF+kxYYK8u1|b%w@Tz%ELs#ab^++6I>LpSX
z#TV*miO^~B9PF%O0huw!1Zv>^d0G3$^8dsC6VI!$oK<B%_ozoN7z7_(zzYjWYY9bu
zd)NEdFua83uR-Vf-s4v#aHcT*T0qDHMRnnTV@TqU{LFRZ2dsH&3pJ!02lVAX&;IMb
z^MANDir>DKiXdJt{mGkyA`+Gwd4D-^1qtNTUK)`N*=NTG-6}=5k6suNfdLt*dt8D|
z%H#$k)z#ZRcf|zDWB|pn<3+7Nz>?WW9WdkO5(a^m+D4WRJ9{wc>Y}IN)2Kbgn;_O?
zGqdr&9~|$Y0tP=N(k7^Eu;iO*w+f%W`20BNo)=Xa@M_)+o$4LXJyiw{F?a633SC{B
zl~9FH%?^Rm*LVz`lkULs)%idDX^O)SxQol(3jDRyBVR!7d`;ar+D7do)jQ}m`g$<s
z-6tu{nP5&-otsZNY)-$k`{Pj80gwuW=4gjb+bXY>TevUD5@?*P8)vo<u;hmO(wx=4
zu#Ty4#N8dV+4db_oTh<$^Q+`f9^xq{WR#>a?kEe@_hl{_h8j&5eB-5FrYW&*FHVt$
z$kRF9Nstj<DlnDleF4(_XZ^q<)s2!0YS`L=!d-ZCs(bT}fT({j8NU<*U4dqQq?|<5
zrM4G6K$2co@=m3s4&j>%KRzpjdd_9wO=4zO8ritN*NPk_9avYrsF(!4))tm{Ga#OY
z(r{0buexOzu7+<C7l)}{Nc<qc*P;@OPvjmTK3RfnIjfpHVr4;vhpzPB(e56`ue)+^
zV<puQ4Ra`IJ1<xY9>rw8E08Gxd`LTOID{*AC1m*6Nw@osfB%0oBF5sf<~wH1kL;sd
zo)k6^VyRFU<BuKKXLDd>`)dt*iX^9&QtWbo6yE8XXH?`ztvpiOLgI3R+=MOBQ<kj1
z^+$eZoWa#nXjJMS{t(g~l-@9Ro*c@Zd2iRE?D?Zo&wSDp9cqKFwo)iB{||Ez9c*1E
z4LKsK`*%O!d#7>9<gyqCJnWR~?z%;3dw3=(Pq|GAF4ceN5fzvX+wwedai5kotW7if
w9)|ozV<th{;5oaSc=(C`Xv64I>=rMVgi<*CU%+d1PQQ0a1U=&b0vkF207%xU0ssI2

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp
new file mode 100644
index 0000000000000000000000000000000000000000..4f0f1d64e58ba64d180ce43ee13bf9a17835fbca
GIT binary patch
literal 982
zcmV;{11bDcNk&G_0{{S5MM6+kP&il$0000G0000l001ul06|PpNU8t;00Dqo+t#w^
z^1csucXz7-Qrhzl9HuH<!ckn_w-(t15itRHmqN0O$B3XH(E|jyV^QXq8=yM`Q**vy
zpEpgQd+no=J<Tlv&+_>B%l>&>1tG2^vb*E&k^T3$FG1eQZ51g$uv4V+kI`0<^1Z@N
zk?Jjh$olyC%l>)Xq;7!>{iBj&BjJ`P&$fsCfpve_epJOBkTF?nu-B7D!hO=2ZR}<p
z;bEy|mw1;}P&gp|0ssKe4*;D3Dlh;r06r}ehC-pC4mGy`3;|+W04+#B4r_x~@mHHy
z)H}bD|I2-n_L$pW;*I)~?=#N<)`92&<$3IR`#<SH6@I&FRQa6xBmQ5wPwJ2PAI(ne
z$L2Yb@JHxb`+bLk*AjR$^`b?pr|?!6=+AboIQ2D-p)UI7x(J0|5(5~ur$_+)`>C%4
zc_9eOXvPbC4kzU8YowIA8cW~Uv|eB&yYwAObSwL2vY~UYI7NXPvf3b+c^?wcs~_t{
ze_m66-0)^{JdOMKPwjpQ@Sna!*?$wTZ~su*tNv7o!gXT!GRgivP}ec?5>l1!7<(rT
zds|8x(qGc673zrvYIz;J23FG{9nHMnAuP}NpAED^laz3mAN1sy+NXK)!6v1FxQ;lh
zOBLA>$~P3r4b*NcqR;y6pwyhZ<hjKiZs6mOSFB&+cIl`GV$93-<ciUjF#*1^<p~gh
ziQ_{)r0dA7$It&Fe=obxu8n!+elxmgqxPbUL!FxW0;AOfqz@8JOz9Qbm)m-9!^7D)
z480@BoIIb<oT``+rVla8L)8fXO&6}3P9n4v$`6WG<DUNWuKb9J9rUsAn7d-_YWT^U
z{NXl@OAPIJ!>3_PiDb|%n1gGjl3ZU}ujInlP{eks-#oA6>rh&g+!f`hv#_%JrgYPu
z(U^&XLW^Q<WhKYr9rXr6*~Tmpuq6NjnD6;;NNBGIg-1ZvfACQ4{ocrwM0)?`oL2ts
zCXY5KT@`(ir63J0?%+_(-dDgf<6R$u{lCdy6Zi5d+Bf;1OXyD;xe3#Gug*&T|0o41
zD8;$|JvUv&@vsLIH&C5+S{!k&{~Z54^y@9r>X7F9Z*SRPpQl{B%x)_AMp^}_v~?j7
zapvHMKxSf*Mtyx8I}-<*UGn3)oHd(nn=)BZ`d$lDBwq_GL($_TPaS{UeevT(AJ`p0
z9%+hQb6z)U9qjbuXjg|dExCLjpS8$VKQ55VsIC%@{N5t{NsW)=hNGI`J=x97_kbz@
E0Of=7!T<mO

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp
new file mode 100644
index 0000000000000000000000000000000000000000..62b611da081676d42f6c3f78a2c91e7bcedddedb
GIT binary patch
literal 1772
zcmV<I1{3*GNk&HG1^@t8MM6+kP&il$0000G0000l001ul06|PpNL&H{00E$D+qP-j
z>Qj4N+cqN`nQhxvX7dAV-`K|Ub$-q+H-5I?Tx0g9jWxd@A|?POE8`3b8fO$T))xP*
z(X?&brZw<itFzD+K&M3~p69>({`)WU&rdAs1i<RDIiSY82S2mupC8Pt4!H6t1GTb(
zWRM~Q$%>T<R+Yg3{a%pbg++@O@<l(ulw^R7DJ5kYQ(?LhFeMn^80iDc8a#OdFhyzL
zDn(d!5nfX;MJV7nMVO%oPb#QF78@wSOhvdEwtz$5l){XK=|H&u(ZCCOX72e)L;uHO
z1tnw`glk~|XjH3U$_P_d)`A8s=1~}>a0x6F@PIxJ&&L|dpySV!ID|iUhjCcKz(@mE
z!x@~W#3H<)4Ae(4eQJRk`Iz3<1)6^m)0b_4_TR<yeHWl(T-||IU&i!Rd!TMUruU72
z<l~rLRD-qWW4hw3Q)?MQ93gOvat1wqq{JcosXwejji>Z+cz#eD3f8V;2r-1fE!F}W
zEi0MEkTTx}8i1{`l_6vo0(Vuh0HD$I4SjZ=?^?k82R51bC)2D_{y8mi_<vjH0E1*B
zfk*0C6jY|!Rf=RG!W+$uDg^D?-XzoVrR42)&Y)P6w7*9BP@dq)8yymh;%(CA$R7+o
zloov8A4l6H7NzQ3vsrJ*;3X6j#0T=toMt(L(p9c**S(b_DMgZG<-Trpa|&g3Ns}I1
zKlp(~|M0=q9!(O5aw}K0apy5RZoJ5U{>?X^=U?2|F{Vr7s!k(AZC$O#ZMyavHhlQ7
zUR~QXuH~#o#>(b$u4?s~HLF*3IcF7023AlwAYudn0FV~|odGH^05AYPEfR)8p`i{n
zwg3zPVp{+wOsxKc>)(pMupKF!Y2HoUqQ3|Yu|8lwR=?5zZuhG6J?H`bSNk_wPoM{u
zSL{c@pY7+c2kck>`^q1^^gR0QB7Y?KUD{vz-uVX~;V-rW)PDcI)$_UjgVV?S?=oLR
zf4}zz{#*R_{LkiJ#0RdQLNC^2Vp%JPEUvG9ra2BVZ92(p9h7Ka@!yf9(lj#}>+|u*
z;^_?KWdzkM`6gqPo9;;r6&JEa)}R3X{(CWv?NvgLeOTq$cZXqf7|sPImi-7cS8DCN
zGf;DVt3Am`>hH3{4-WzH43Ftx)SofNe^-#|0HdCo<+8Qs!}TZP{HH8~z5n`ExcHuT
zDL1m&|DVpI<IwA;|3z1u>y=xsLO>8k92HcmfSKhflQ0H~9=^-{#!I1g(;+44xw~=*
zxvNz35vfsQE)@)Zsp*6_GjYD};Squ83<_?^SbALb{a`j<0Gn%6JY!zhp=Fg}Ga2|8
z52e1W<DmYW|KhLyAh*AQ$=bd-79$cFL1=dC7E!?lJ(DK_A2rbd*I!fTiWjU@hO@LO
z{34r?8R+y6;5?)6c=hv86*TVD<6h<-YN#p%M+B*z{-U|t?d%$+^@~OhgQ=;&eE7WW
zQMm4(i7@Afmhf}Dnwx!Q1lKgexn~licBP}_&7QY=>U%^L1}15Ex0fF$e@eCT(()_P
zvV?CA<sp1RgQ~qYDHIC(K$HgNSDgI7aFI{AcoU=(>%#Sy08_U6VPt4EtmVQraWJX`
zh=N|WQ>LgrvF~R&qOfB$!%D3cGv?;Xh_z$z7k&s4N)$WYf*k=|*jCEkO19{h_(%W4
zPuOqbCw`SeAX*R}UUsbVsgtuG?xs(#Ikx9`JZoQFz0n*7ZG@Fv@kZk`gzO$HoA9kN
z8U5{-<bq**{p6!H-(%Tic#_E`wcN6#HU8-OK@OS$MA~<4ln|3Duf90UXNW1nMhk@X
z!<X~il$GI)0FveT${!;q6+#ptj}^6#CM6bt!8aB|<oIwiQzNU~!^v#E0ATVF@f>yY
zvV{`&WKU2$mZeoBmiJrEd<YP=_2@e1bJ|tRh6}2@09)72_kFh|s|{=Q%;lrD1V0sq
z5(|fB{Q};57E-A$Y;tLp9MPkkDs1?cxgaM#DX)SROj{lUu_=U;L%&QSd(1lwW9=M~
zPXv~y>zUZAv1sRxpePdg1)F*X^Y)zp^Y*R;;z~vOv-z&)&G)JQ{m!C9cmziu1^nHA
z`#`0c>@PnQ9CJKgC5NjJD8HM3|KC(g5nnCq$n0Gsu_DXk36@ql%npEye|?%RmG)<p
z|22C&(o0<{zD=}o7hFmrnHiNsKS+q5do@k^v7dAg(j37~!7%msUYhV9SAD*hicVK@
zd=IyocF&y5dH^sh4`7M2vQg8OP##~+Eu~vo(S~k<e%FqF9ffGv{w_F?KH5TRvvnu}
O>FJ$wK}0tWNB{uH;AM~i

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp
new file mode 100644
index 0000000000000000000000000000000000000000..948a3070fe34c611c42c0d3ad3013a0dce358be0
GIT binary patch
literal 1900
zcmV-y2b1_xNk&Fw2LJ$9MM6+kP&il$0000G0001A003VA06|PpNH75a00DqwTbm-~
zullQTcXxO9ki!OCRx^i?oR|n!<8G0=kI^!JSjFi-LL*`V;ET0H2IXfU0*i>o6o6Gy
zRq6Ap5(_{XLdXcL-MzlN`ugSdZY_`jXhcENAu)N_0?GhF))9R;E`!bo9p?g?SRgw_
zEXHhFG$0{<gIr?LrJWRzItY~y<Z<EAV-uj3XnE%(*emp8D=Y7PQV-i%2@c@D|9<;;
zH`2jMaL`24BPUPYdJ=PY$>qYOqhdX<(wE4N@es3VIo$%il%6xP9gjiBri+2pI6aY4
zJbgh-Ud|V%3O!IcHKQx1FQH(_*TK;1>FQWbt^$K1zNn^cczkBs=QHCYZ8b&l!UV{K
z{L0$KCf_&KR^}&2Fe|L&?1I7~pBENnCtCuH3sjcx6$<!T3RX}!APxoq0Pr3FodGIf
z0AK(<F%pMDq9F}cg8&c#f?65g)=yLs(=CjK<M1`M`}BX}chg7A2kI}XuSf^#uUY@|
zuTu{#uVwGqpV;qc@BVqrAKdd@@EN}QTvvI{5IcyyCGks*4qjQY^_27g{caAK)e)>c
zwqkNkru);ie``q+_QI;IYLD9OV0ZxkuyBz|5<<a3;s18CO(E<oo}Dr9FcHGkDGekL
z%8=D|QqLZ6i9<4n7z0@0Z!*y6<{tFE?Q(JSs1PS)KpVZ-UuvI<x@J>$1BH|vtey$>
z5oto4=l-R-Aaq`Dk0}o9N<n-Lxhke(>0VrkqW_#;!u{!bJLDq%0092{Ghe=F;(kn}
z+sQ@1=UlX30+2n<VrE^M(W<0s>WjkL$B^b!H2^QYO@iFc0{(-~yXj2TWz?VG{v`Jg
zg}WyYnwGgn>{HFaG7E~pt=)sOO}*yd(UU-D(E&x{xKEl6OcU?pl)K%#U$dn1mDF19
zSw@l8G!GN<gn3;8HSds=>FB3c3VVK0?uyqN&utT-D5%NM4g-3@Sii9tSXKtwce~uF
zS&Jn746EW^wV~8zdQ1XC28~kXu8+Yo9p!<8h&(Q({J*4DBglPdpe4M_mD8AguZFn~
ztiuO~{6Bx<ZU5#l%0-dq__bYvK~-`BMo2EW*Vk@0Uv@y205m+Q&aq=TSlpam*A$L@
zZ$K+cMvxib3m9dD17_p){u>?SfO~_ZV(GIboeR9~hAym{{fV|VM=77MxDrbW6`ujX
z<3HF(>Zr;#*uCvC*bpoSr~C$h?_%nXps@A)=l_;({Fo#6Y1+Zv`!T5HB+)#^-Ud_;
zBwftPN=d8Vx)*O1Mj+0oO=mZ+NVH*ptNDC-&<HMad!<Q5dhOvyth5Fc&!i0MbxZ%N
zU%|-$yCvba94#fAF;MI_OEH#`2k(1(gihK2jMyvsOoHYgzVHUqgQ68^-GY7|rOOyF
zoC~vHfip03zI!qe_AurbxIn0~<I(%>zZ7Hwho6UQ#l-yNvc0Cm+2$$6YUk2<tEyER
zK*7f=uUP>D2t#vdZX-u3>-Be1u9gtTBiMB^xwWQ_rgvGpZ6(C@e23c!^K=>ai-Rqu
zhqT`ZQof;9Bu!AD(i^PCbYV%yha9zuoKMp`U^z;3!+&d@Hud&_iy!O-$b9ZLcSRh?
z)R|826w}TU!J#X6P%@Zh=La$I6zXa#h!B;{qfug}O%z@K{EZECu6zl)7CiNi%xti0
zB{OKfAj83~iJvmpTU|&q1^?^cIMn2RQ?jeSB95l}{DrEPTW{_gmU_pqTc)h@4T>~&
zluq3)GM=xa(#^VU5}@FNqpc$?#SbVsX!~RH*5p0<xA(I&qyn@)(mw0@a&Pg3L>p@w
z;~v{QMX0^bFT1!cXGM8K9FP+=9~-d~#TK#ZE{4umGT=;dfvWi?rYj;^l_Zxywze`W
z^Cr{55U@*BalS}K%Czii_80e0#0#Zkhlij4-~I@}`-JFJ7$5{>LnoJSs??J8<Z-XK
zj&@i^7ta>kWVl6|8A}RCGAu9^rAsfCE=2}tHwl93t0C?#+jMpvr7O3`2=tr{Hg<Kw
z(MWiv`>$=HlnjVG^ewm|Js0J*kfPa6*GhtB>`fN!m#9J(sU!?(OSfzY*zS(FJ<-Vb
zfAIg<xCfqXs~;Xmq<7KOO96xsPR{hU&apj;5A)}6v`#`8fe>+`U)YaXv#sY(c--|X
zEB+TVyZ%Ie4L$gi#Fc++`h6%vzsS$pjz9aLt+ZL(g;n$Dzy5=m=_TV(3H8^C{r0xd
zp#a%}ht55dOq?yhwYPrtp-m1xXp;4X;)NhxxUp<Z`Z=zPQ_3&gbp_8a`>gP%XTLmO
zcjaFva^}dP3$&sfFTIR_jC=2pHh9kpI@2(6V*GQo7Ws)`j)hd+tr@P~gR*2gO@+1?
zG<`_tB+LJuF|SZ9tIec;h%}}6WClT`L>HSW?E{Hp1h^+mlbf_$9zA>!ug>NALJsO{
mU%z=YwVD?}XMya)Bp;vlyE5&E_6!fzx9pwrdz474!~g(M6R?N?

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp
new file mode 100644
index 0000000000000000000000000000000000000000..1b9a6956b3acdc11f40ce2bb3f6efbd845cc243f
GIT binary patch
literal 3918
zcmV-U53%r4Nk&FS4*&pHMM6+kP&il$0000G0001A003VA06|PpNSy@$00HoY|G(*G
z+qV7x14$dSO^Re!iqt-AAIE9iwr$(CZQJL$blA4B`>;C3fBY6Q8_YSjb2%a=fc}4E
zrSzssacq<^nmW|Rs93PJni30R<8w<(bK_$LO4L?!_OxLl$}K$MUEllnMK|rg=f3;y
z*?;3j|Nh>)p0JQ3A~rf(MibH2r+)3cyV1qF&;8m{w-S*y+0mM){KTK^M5}ksc`qX3
zy>rf^b>~l>SSHds8(I@hz3&PD@LmEs4&prkT=BjsBCXTMhN$_)+kvnl0bLKW5rEsj
z*d#KXGDB4P&>etx0X+`R19yC=LS)j!mgs5M0L~+o-T~J<oyc-(l&0FxfDJ)vWdrzG
zjkHRMCVIq8fJ3SsaN{G0bSezdyMc{>l!p!AJxnGAhV%~rhYUL4hlWhgES3Kb5oA&X
z{}?3OBSS-{!v$nCIGj->(-TAG)8LR{htr41^gxsT8yqt2@DEG6Yl`Uma3Nd4;YUoW
zTbkYl3CMU5ypMF3EIkYmWL|*BknM`0+Kq6CpvO(y$#j94e+q{vI{Zp8cV_6RK!`&C
zo<pW1O@mj#Ba$B1jF9e#KLC$tdVGRA(KNLm5)Z-c3uM|e{5g0;)Z=U1o}r1okeCSe
z&690M^SdF4s^BB6+fY=x1U@bvmsd$`X83VHh)V#T!DbU?^&>b$*5Q|$IZ09dW=L!V
zw@#2wviu|<#3lgG<y?|hUxpyMg6}AupvayTr}GM=TMW<L9;Z9j*(N<6al*6Nv}pBq
zNQh4md`M{`Vm9A~M}$3oY?+A^<^B<?{}o6PDK4EKtBb3wi8R-)jnxf}q~=aYj0C%v
z6V$@(vAW|xm9TnOtnNN6L9fN@aGkJpd#vs_IB9lgtah&@ZNCmaMjkgzYeS?|_54^}
zTvwWi)xbYv^}ni8L~Kgmjn&V}hKa})-h&Y069PU_u+-A`bU@-Gz>E8GEhcx+zBt`}
zOwP8j9X%^f7i_bth4PiJ$LYtFJSCN$3xwDN;8mr*B;CJwBP2G0TMq0uNt7S^DO_wE
zepk!Wrn#Z#03j{`c*Rf~y3o7?J}w?tEELRUR2cgxB*Y{LzA#pxHgf}q?u5idu>077
zd^=p)`nA}6e`|@`p?u}YU66PP_MA}Zqqe!c{nK&z%Jwq1N4e_q<#4g^xaz=ao;u|6
zwpRcW2Lax=ZGbx=Q*HhlJ`Ns#Y*r0*%!T?P*TTiX;rb)$CGLz=rSUum$)3Qyv{BL2
zO*=OI2|%(Yz~`pNEOnLp>+?T@glq-DujlIp?hdJeZ7ctP4_OKx|5@EOps3rr(pWzg
zK4d3&oN-X2qN(d_MkfwB4I)_)!I_6nj2iA9u^pQ{;GckGLxBGrJUM2Wdda!k)Y>lq
zmjws>dVQ*vW9lvEMkiN3wE-__6OWD0txS&Qn0n22cyj4Q*8(nG4!G{6OOwNvsrPIL
zCl-$W9UwkEUVuLwyD%|inbOF*xMODZ4VMEVAq_zUxZ+K#Gdqf!DW$5f)?7UNOFMz!
zrB~tuu=6X2FE(p^iqgxr+?ZK;=yz`e;C$#_@D9Lj-+TDVOrva>(#*PVbaHO>A)mhl
z07OJWCqYC60518$!&c`eNBcBW%GnfaQ*$eazV^2_AW?j)h;J1nUjN(I9=0+!RVx~%
z3@Tf!P0TE<o$!VqowG;KvFtwQM{hV`ZE0qrR<w#Ts%&9+`_$a>+98jA?WceK-}A1%
zW!K)lyKcGqy#M~})315-A#2NXQ`?6NR#Apo=S!oF=JfpX>iR*49ec{7AN$xxpK{D$
z2d%Fz&rdfSqourN$~Y^NFIMV1CZ?J*bMx~H3k&meGtH@q9ra2vZxmA$S(#jaaj-g4
ztJmxG+DLV<*q<|sDXPp$X>E)#S}Vm&sRaO5P&goh2><}FEdZSXDqsL$06sAkh(e+v
zAsBhKSRexgwg6tIy~GFJzaTxXD(}|+0e<LznCgHsU8?@8?t|e+t3NOg)4%V%QT)RG
z#(vWKzWPe^0R3j`BK^Sj0R4vax&4^<HvOypv-k`BiT}~o0m7^OSGJ$@KajnJ{#EwN
zDeve%EWSc^7qvI|&GkIv!CUKJ?z|=S5$~^*hbW!^KEXeU|8)Of{R8r6<YPXsPJiI{
z3I0p{JN=jUpWV;#UH-pt{fqxv+)or1tENbj$yb}h1W}{VuIxcdxr4O$Pk-W+vE;HW
zs>OwFDA%rn`X;MVwDHT9=4=g%OaJ9s%3b9>9EUTnnp0t;2Zpa{*>mk~hZqItE_!dQ
zOtC>8`$l|mV43Jbudf0N6&&X;{=z}Zi}d1`2qmJ}i|0*GsulD3>GgQXHN)pkR6sf1
z?5ZU%&xtL}oH;YiAA)d*^Ndw2T$+Mjuzyzz@-SM`9df7LqTxLuIwC~S0092~+=qYv
z@*ja;?Wt!T!{U?c*Z0YtGe)XbI&y-?B&G2$`JDM)(dIV9G`Sc#6?sI60de6kv+)Qb
zUW~2|WjvJq3TA8`0+sWA3zRhY9a~ow)O~&StBkG2{*{TGiY~S8ep{V&Vo2l<6LWsu
z^#p0-v*t2?3&aA1)ozu|%efSR=XnpX$lvT<i5fh}s=@+>eRdKlvM!@|pM5p2w3u-6
zU>}t2xiYLS+{|%C65AzX+23Mtlq?BS&YdYcYsVjoiE&rT>;Necn6l^K)T^lmE`5u{
zm1i+-a-gc;Z&v-{;8r)z6NYfBUv+=_L}ef}qa9FX01)<p5}9zArZ*5BNNPrYJe?q^
zoGwf&5As9!{7(Mh#&*CXqg;f?QnQ-nlaTt)rSHVHCm`47n7&FR=c_u*_Tb`8rUm3H
z0O9JxAZpoqT#O$8lO#-qLUxwg2QFpWD)MH~tWW!FJ@rL#Z3X@-EA+a_!T&{YBN@VU
z#uLh{fnX}ph>+Aaf+;xj(mL6|JUzGJR1|fnanb%?BPPIp>SCjP|8qE5qJ{=n5<?FH
zyc@=a;51oWzuVAcj6pr}S4=V^1y$yRMekrgPiZC)AMQEB*qQt?gOx<6n-Ze<xOk%8
zJlp{hn2r5lN&v>ZGw?8<T=j<kiK3k}QNf{mJrZ8{h9VJ5mymJ}tharUQVZ+A)q|JA
zP<4CV&CzPUYMZ;!LAXmAxQKNOUhvT9Hs7xDmh*<vTKo#A=V}0C%3}Bd)|`ucui<U}
zkh|*TSU#9=A^@TE$st=m>1z3(k;pzH%1CtlX50{E7h)$h{qGKfzC`e2o`*IqA#tjA
z`Fz&^%$b9F*N`)U-#6>a)Z`55`$Dd0cfcs0$d13^ONrdCu9x<t+8g+uSD5W2zwlhC
z|JVEzcPV$NtS$c<FJ8rBZ;INEx+Q=2(FJ^S342`@#MjKlFl)sF8^7THVLheX^i?L?
z&CPm=G=y+=EKRt@v8Clr<)efd)hKaE^n%ZPKLi%wwD38M$NzP!(WBLE?qcvP01sx9
z&*Nj-pc7`@jq=MVuj=Qp>cv_=n#WQo8stcz3jP9|2EvdI-RhJM3%Q%oM&!OlShM|0
z?gz?<?7WW8=Q%s)y_zh$<gKU|U@-;T<hp_neb-hC9;eMWIi~L=ZQC!2-eBW=SL{}p
z$?;Q@X@<Q+-HdRo>wHZSnm45njLtsz8PVT1S&jAlbKg5kVam$p16=EK@Sj4EP0OtH
zmJDmdc^v)x>56Qg_wmYHz6h)>kl_h$>0@J!ypv%APmjZTAQVLy6Fu50RGY&JAVNhx
zrF_qG6`x9MkT;1S<Ag2tDFJ87lYSIU+mImGAD&|@nwJc#mdqwB-2t$i{E|O|iC+rn
zTx&X1e_l?93I&#?`F=sa9qG87|KIc6S%E@vyQNP?qi0>FWo$)l{M$;3qUDn9JwE}z
zRl#E_bDRJFii61kPgBybIgp8dNW!Cc1b*^YYk-#oWLJvtM_v^hQx~9?8LD4VFFxBF
z3MlrsSC%f9Oupn*ctPL0U1fwfX<e8j*;kJ8_CN6%nCTqo2`3d9Pst}VgQjU)?(M7p
zzxo&&R>?`tRhPD{PSLFPQOmIt$mDy0SgpNVvHS+f#Do>h1Gn?LZU9(KaN>Q_=Y*_T
zvtD7%_u^^+{g`0VGzg(VZrpVQ<iSLmVH#_?Ygs~6CEv!IHC;9@ugl#8Bd(1@U8J`m
zZPR+rwS3E7Io$PJ#u@SZ7*ofWJeNkkZzfy5$#`y(gV@Mrz3MQq!<5HDiA{dy{A6&s
zm;xq~CnA00hNM6ID4qQ25IVwnMQJks`iwc)#g`8-cX!e+83#89|3i9nc;W|OlG5lT
z#`=rnOh~`2$itxg{QZs*tGy>6Ub5M=tI_p7T93R8@3Zulu3|#{iNcu!oiHxZ4Rf*(
zfmiN$$ru(*_Zqn=`Gq#OuHRTSwp7uH_SokR&|)RuW5yo=Z|_4?qU-JU+tpt>!B&Is
z@N(=SG;bpV<x5xb4+$A4;kTvxjvLCmS(Qzk7DoqV?c3gPc^$ajYmd|>c;AO@zbmMM
zScqq1)b-ZQIrs={oD}|?6y{$HNB1U0^LsBh8JI&3!GBZxOXI<}&5-$lgkAaYqhOTb
z?2vEnZ$-kk;*M_17(upJF3%+iH*s0-r{vttXVB2OUwI1s^+G(Ft(U8gYFXC}#P&E^
z>T@C^tS`Z7{6HT4_nF~n>JlZtk5&qDBl6r|^kzQYe`wq!C)n@$c>WOPA61NDFj<<6
zGW71NMMhwAl!U-yqrq2xrSFqRCI8acw7?}3j;ynxo*-b7Co;g5r%^j=H@9({PXXBf
z@r>U>>N;E)81wx`B4f%{PB~MHka_);%kBCb(d|Jy5!MqJ%2p`t&@L)4$T2j&-WHvG
zv3(uyA_gwqNu(k?jQTtv3dgPKRZoH8prxe7>pQBW5L&dpumS&5Ld2?(sCpJjvc4L5
zEnh&?91WVm)ZdTj=fjJ$<vc{WSJ<ii^$T&iG*Yv9jX<?z?mPY(t5t-1^A0*RQs5X?
zYSLjXl&MWC!|=j$?-@JVu!#TF`ZHTW&ulyLWq^6N!VAX2Xmm)BA=Yu5B~k=gi4VJ{
zIG~`oyZBm%<a3bH1xUE^?HI_r5%K8}A8v#m>pPDdgAttLXuke+?KdKxu<Qg!^11Y?
cG7e%GKbg~lPT|05mMxkl$h;o@5^?|l06hZIiU0rr

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp
new file mode 100644
index 0000000000000000000000000000000000000000..28d4b77f9f036a47549d47db79c16788749dca10
GIT binary patch
literal 2884
zcmV-K3%m4ENk&FI3jhFDMM6+kP&il$0000G0001w0055w06|PpNY()W00EFA*|uso
z=UmW3;Ri7@GcyiBW{ey$jes55b5S`|ZVZ{(x$xch{z?D+^{yErVgleVwa9qvGt40r
z42;MG=7<0QySlzE=Ig6%01!FBK^$Fsxe@Hfe6aCy?Wh2r0~}@_lQAF90oTUi0FhEr
z#(<GhM2CTE5->*;kTC<I6%bkw<P!?WpaDI10CfmFPKu1G=p;%VjNdWOy5JfR^IubB
zmWbY`5VOa4^Co4?lA*0$&a)>(r!tQk6;gxj4h%FdHAt(^M3YvYj(!tOeN)+Hvj6+<
zzyJRG?^lZfWuR#t!tUKP&(?%3v&Zd<cNCc=qEAh>$R2YN>lB(Lq`OInY48%4%yTv2
zYe1{G`3)(PDEio5Y@-I5tUf`c%%O<RR41hl;a9a>CJMtSW56g3iEg%3`$7XSJJHyA
z<|7&N)5Xrlgv~%BO24eFd;Hd;uiK%D`EdK|quUeRZDqbh9l)%j%J#0lfrZumvA<_w
zu&=AVvdChf6}eqh(bUz`(`Ue*p01{fBAcTgKyDYLs_I+YyJEk+rM@avU~>fB$n)HS
zM7pfJydu`i%gfS<LN|{i=ttzza$L{zW8L#y$C4ZoauSg-Za~HmA&1d`@TXE%P&gn!
z2><{PF94kZDv$t>06sAkheDzu40NJ$5CMW%n^Lls?8^p^QGWURbKu3ZduZQZ((s2?
zzE`}<{<HFD`<Klx_&=&1@jStQ!GBWei{=~oAN3#D|91cP9<hA@KTtaAf1`iE_5l7p
z{_oR6_8;Nf_OJGn;4k(%=}+Tjk)AMkcQv~rTZfv5m>;Zt7<$C|9R8A~DJ~@%x>TfP
zF>TX8)@v|t)q4GjRt<}5s6hLHwRel7>V@&r-O|Av(yh;Q1A{E>Ir>p+%dHD|=l+lT
zpr(Dg&>#Nu=!)6bCLr-ZS%|;h)Ij$+e@r8_{qO19QvDe=&1tmpY*0lcA^Cc-#{9fQ
z<~$*<&P$Q<_jy#<$40PMofM7aQ}C=jphI`4kLg}Z7CIN#<BNcF_3BC#1Or(Pa6X(x
zm*>26D{-4v-_CA-LiE@(%{y!BzsU%gG`Q?sjLUf%qFSl0y)2#ae*+EI><i9^|McQ6
z&u-wt#o7y!{eZ1mUVdD*Za<g;gIMqY0RI1A80-K3n!MCY=3j$fj0a3c7yP%@*`6F<
zo`Ip>s|i`d^V$Dn)qmzqRq6VJRY|{4ujsIU%#bnqU6MR&-1I_43=|5(6Jr;Jvert)
zE?S|Tmn}Tv<-??sxV5@9t}3D=>YZ0JrQe$CO~|EY=Lj9RM&4svQHPQL6%pV5fPFiH
zfXDx;l@~et{*{U*#c#Dvzu)<y{p<iKevLm%@24Es{u1>|znDO7$#CRx)Z&yp-}<F^
z`~J$vWM;oQpQO>SrD{&|(MQtfUz~n35@RLfUy=aqrhCX0M}J_r5QsK~NmRCR|Nm&L
z41UdsLjWxSUlL41r^0K&nCCK>fdR-!MYjFg(z9_mF^C|#ZQw?`)f6uVzF^`bRnVY&
zo}@M06J&_+>w9@jpaO4snmU;0t-(zYW1qVBHtuD!d?%?AtN7Plp><-1Y8Rqb20ZaP
zTCgn*-Sri4Q8Xn>=gNaWQ57%!D35UkA@ksOlPB*Dvw}t02ENAqw|kFhn%ZyyW%+t{
zNdM!uqEM^;2}f+tECHbwLmH*!nZVrb$-az%t50Y2pg(HqhvY-^-lb}>^6l{$jOI6}
zo_kBzj%8aX|6H5M0Y<)7pzz_wLkIpRm!;PzY)9+24wk2&TT{w--phDGDCOz{cN_ca
zpnm7`$oDy=HX%0i-`769*0M6(e5j-?(?24%)<)&46y0e&6@HCDZAm9W6Ib#Y#BF6-
z=30crHGg+RRTe%VBC>T00OV6F+gQDAK38<n*vA8r%O6>Ne3N9bm|62tPccBJi)5{B
z4zc^Db72XiBd}v$CF|yU{Z=M|DZ%-(XarYNclODlb1Kz1_EKLy(NSLCN`eUl(rBCL
zT*jx@wNvze0|TSqgE(QArOZU)_?qH(sj#TwzElLs9q)(0u!_P|R%Cy_0JFQxgGV>1
zz4?_uq<8_gM0`c*Hh|;UMz~vrg1gQXp{ufg`hM_qU;U>+zmvc5blCLSq@PrEBSGR#
z&8=2Z4uXN`F3p73ueD1l{s{k$WipAvSh5W7ABe?4)t;r@V?y`bNB5FvBuE|0VRTb<
zM1Hn^?DSsJY+sX@T5xW=#>T9VEV|?<(=6|ge$X6Sb05!LFdjDcoq*gM(Zq<f*af)i
zNrX<tMgmsmg+`)u<gVRy&HOky#ont<pVW|J_-$wrA`xxK6{hhd+PXR8vNn*oM*H0|
z1qYtJ28e684_5Ps?yhMANn+G%uO1h`$vWv3s;1>=t;_)Le&jyt(&9jzR73noru`a#
zN*<`KwGa^gZU3-)MSLF0aFag#f0<>E(bYTeHmtdbns#|I)-$)mJ`q9ctQ8g0=ET?|
zdO}eZ*b_p>ygRTtR^5Ggdam=Zb5wmd{}np+Jn1d_=M`~P=M67jj})fH4ztb5yQqQW
z^C|C&^LHAK-u+ooIK)yM)QM?t;|<{P;;{`p=BclzAN#JzL4jCwXkQB1Dy{=^KR`=~
zTrr)y7eiYBzSNs_DvO=4A6#EgGS-zY%Vi)N*Yb`U;6o}KR}dq{r9pT5wqZ@3NOE8-
z9-(}D|Nc5732CSYQbL)!gPQ#RbD8BhK3dl{sUuPvei0tkvnJBxDE<YT0)IF6ZR)Bk
z@)a0nBbA1w8SkQ(D#i5&8jGNWcVh3%MMH8Vt0#Cqs{7rj9lAfnOxdi%ON~J_Lk4Vr
zr{*Y)igLGP+Xld7jyNiw*|X1cmPqh_jE+%>AYTesU8H$)g(Plra{VH(v3u^CO1~(+
zU0O7#)jaS4{NcwA+LuSm&VBcX2#Im3xg)W}ySNw%->orn1taZ&+d)}8gJTqA!u|5P
z{yv?zol_3|(1(%M(EVU=cp?L`{Pi|ixk<Zz{d_OJ{%(afPiA`kGm0)dQ`ag~77r|Y
z&C+7i1_BU!*UJRd(^@b?4zBGXgdZlcPU8~&SFU-ec*eK#s8l5P4x$+w-ol8WnhVHs
z<8AXv+lumqmDSsBEq_1%nCKJHKDdY<XS%xm_eRL@MHf03BP@ZPs+4efWYQybye<P;
z!YgDeDt`-=e#48=xgFFnb3ip6+;21bca6@PSyeFDq6U)Bi{elQF$F^{M8$^wE9+h9
zp|0OT-Yl*F^H*Gl@RJ6Ygk#_Hwne|c{O*=S8hR2WOY7QEb^oD<fAVQQx1i_#15%F~
zSB12atfnDt>{U)*guFML3P!OSlz;zGA#T+E@8@cgQ_mv1o7RSU=Zo_82F?&&2r;WE
z@wk}JHYEZ9nYUc(Vv~iTCa3u8e4q(yq<29VoNbKk<beKrau_(DO?g1SPxl?tXR8kl
zm@B7yS{4nzYa-BC)B<s3ZV|tCLVRY=S6W|%ltS7#@=YN0E{Q~^h`zp6^Ds5_kY-c@
znjlqvzdNqVg-)ddJh>|`mq%I6u)My=gPIDuUb&lzf4`M<g#L>EA9^g8u<af%@W-r>
z)vp8|$$HE9m_BTV?lOosIGa4jud=jIbw)O2eCMfyw2*S8?hjWw^nqws$O*M$3I1)x
zR0PWFb3$ySOcGTe1dz%N0l;RPc`x%05FtT^f^j{Y<u?Msf@VVK=mBY*;G{h}T6alh
i;_JuyfJ;~Um+rnc{a6{0b-ci|^HsjhJK1mm0001WTfUJ1

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp
new file mode 100644
index 0000000000000000000000000000000000000000..9287f5083623b375139afb391af71cc533a7dd37
GIT binary patch
literal 5914
zcmV+#7v<<uNk&Ez7XScPMM6+kP&il$0000G0001w0055w06|PpNFxXU009|=ZQHgn
zNOpH4`X7PzLlv+kr;~&u5J=ize1wQdZAZ0jE!n|crW7E6^)EmH0x*}~^%g*95fd;0
zmbPs>CP}*Q=lvp4$ZXrTZQHhO+w%wJn3c8j%+5C3UAFD&%8dBl_qi9D5g8fry}6Ev
z2_Q~)5^N$!IU`BPh1O|=BxQ#*C5*}`lluC515$lxc-vNC)IgW=K|=z7o%cWFpndn=
zX}f{`!VK<lX!FLkOBT(Yy<iOh1h{fQLjvrOQ%*nbfL+k;)$>02_kU+Q5a3m37J;c}
zTzbxteE{GNf?yLt5X=Bzc-mio^Up0nunMCgp*ZJ;%MJvPM3QK)BryP(_v@ei4UvHr
z6<pZ&&fAXylxW0w%M}QG@yfnPZdDXqIP=jPCIit7o$6lTs5icHCgh=N0unBI<zgT*
zptDoN<h$R!7(%ELhKSSOx&nxS==cyKaPf0zAd!_(MC|v$-B2pfoh+ho#bf&+hH7Al
zeEH1*q<}6i9Juxt0<3_LaK_h8L<~fCTg1Svrz2X|nM!={Hli82o)&S!Wq@^;;Sc+N
z{~W{um1qFY-5<^_GW?+R&A}Lmstggwo`S@#F#weeVu1=JFn9vNE-}DWg2+&<6b|VD
zyTX820Y3(!YAs>+sbCifQaOkL6-;5fL8$W($zZ_;CZp305C;~$hhRquZr-r)jjd1z
z31%ZK{-(`P#|Um_Sivn@p$-vz46uqT>QG0B1w9znfS9A8PB2LaHdzA|_)yjXVR*l{
zkcu3@vEf7bxH0nkh`q?8FmoO_Ucui*>_a~P?qQrl<J8L#kWCf%Wh%yn(Y}gU%LfuZ
zDk8@t_|u4e$m`1t<6z}J_rs7?FJ3+<S^J2$5qt6i-|juKZJ|8nXar<dxa}M-+9i7p
zv6dS|d)2&6A)dFrYRGP(%Pv>Z9@+D7%MTpSnztpylXrt5!-k8_QPB?YL8Kx_On8WD
zgT+111d(Op$^$&KLAN5+@?>f7F4~wFi(8TL8+szgVmcMDTp5l&k6~=rA{Dt}!gb^r
zSWY<)M7D|Z2P0cEodj6E42PV>&>DFmQpgt)E-|#sSUU@uKed+F680H@<;-x{p|n<l
zp8yXzqfa@7p%sObAm$9h$>uH4!_mn85rx>wz;0mPi2ZkL#k6;sznu?cXh!T0S>{w6
zL^gvR05NY64l*<+_L>On$rjx9!US;l;LX6@z}yi#2XHh)F@Oo+l)h%fq$v}DNmF2>
zfs^_t0)3N-W<9-N?uedVv{)-J0W5mh#29QM5R5h&KuiRM=0Zvnf#lF=K#WlCgc#9c
zS;qvh(P$<N4^0H>!_a8JwyhI^ZJV2k+B6Z^64?w|1?5gyo6y{}923CRZfYVe1#?F%
z7h2SUiNO3;T#JUOyovSs@@C1GtwipycA=*x5{BpIZ_#GCMuV8XK=x;qCNy{d7?wA~
zC+=vjls;ci&zW=6$H~<UuGL>4^K%v{p}Ab?U%C6Z4p%eC<3ExqU$XR<<U%Vkem)I3
z!`%PIvLz&ze?Zp%vCR@%m16n3hACIF^0#G_T7epA#z)8(rvE?Hg_ap6_uYP)Tb`)h
z2GK)|(Rz!WpFyU@LzjyfQ_<i5^lr&=M4!BSy(W%@)>}LLF67A$Sr20DR_pJ3yeBa~
z^sw{V0FI5;UpwXsScYuhbqGQ`YQ25;6p6W^+tgL&;Ml;>S3CGpSZ>VrTn0m1$y$HU
z&65)I!c?oREz};c=nLCliriqQX->4uivHTgd${GqeAlf*!P^B|jkU|*IdNP(&6C>4
zqOW$)Nw9nvjy^&`?E|gotDV{JmJ9Q~vuhy<<G@fboDAhcI5Dsk#+9Mh1`k6v58TQ6
zTpAxMdW*w$2XjE|dQ{O{2;)nJq6kNrSbdZo9zqeu3!nwqzHn9@9s3%Bu@kHycSZ(x
z2&|bA;|GSCg#oDAgn`0}Ky&~=v%vnTsQAhK3}!@Ul4k5Hs;%f_FcO_g5=04B6;XmB
ziOvB@G`0e&A^~64K@uGVfFy@D!BjmmY#Jg-bQD1l@^zr9M#Q=#5Jcz8rNs%Ao0hm-
z=t_Aq%~%e4bvUtd2FypO;{-_wnmGsNRpExY)16Tgh|VXVky}6f62Ys$1HSxdlSZOT
zCCO8y{_zPY?=~0l+26%7xde3w04W9Q!Is)V1LkBGNde1$zrfW<NfNr*%|ZxRzT^JA
zmcTCY6tLybe{jSY-O=SD&Deu-#rFFZ=3p1N3e^Al)6K5on3B|W0L{#5+PrG&o;8D$
z9VJJ=wm<!FVPYf3<lcP%LC|1@)-Aw}12hTj5D5k>`^C4XIUDt|j4o6rK^e8_(=YqC
zuaR<q<0Od&Y@7Cjug_237^*j7a#aQ~lCq#UYtH7{U_qlC0^1@%Gyuc1|NWO)TNBEm
zMx%@_RIC6AfxhnJIcv(^$)J&PfGr82VdUyLpZ<4xbZB^JxZa4#RXG^p?q<@OkN-Dg
z>6TRVf@tUFHB079o4MBIh{M~4>WwnGgesQH<tZ7VZHqtr^FKfQeBMqw3{0x^1cRqW
zV`%gGwJVk_Syh(=#d=wm^?D<@gsPV0$vs99^0;ZqG|-B^-cjnq$sLjec-e@tEK@9#
zOQ>*3?w(RA%hCZ*7)b!aNV=yOQ%o_Y=<Y6|>Lt0Sl*(9^jfRnC210Om$=y>*o|3z}
zAR&vAdrB#mWoaB0fJSw9xw|Am$fzK>rx-~R#7IFSAwdu_EI|SRfB*yl0w8oX09H^q
zAjl2?0I)v*odGJ40FVGaF&2qJq9Gv`>V>2r0|c`GX8h>CX8eHcOy>S0@<;M3<_6UM
z7yCEpug5NZL!H_0>Hg_HasQ<COXdOkdH!pu@0dU6f8zgSJ>GxR`rY&Z{geOy?N92Z
z{lER^um|$*?*G63*njwc(R?NT)Bei*3jVzR>FWUDb^gKhtL4A=kE_1p-%Fo2`!8M}
z(0AjuCiS;G{?*^1tB-uY%=)SRx&D)pK4u@>f6@KPe3}2j_har$>HqzH;UCR^ssFD0
z<L=e_ckEC4xZ1K8&yCddum>7h+VLO4o@_Yt>>AeaZKUxqyvxWCAjKB>qjQ30UA)#w
z&=RmdwlT`7a8J8Yae=7*c8XL|{@%wA8uvCqfsNX^?UZsS>wX}QD{K}ad4y~iO*p%4
z_cS{u7Ek%?WV6em2(U9#d8(&JDirb^u~7wK4+xP$iiI6IlD|a&S)6o=kG;59N|>K1
zn(0mUqbG3YIY7dQd+*4~)`!S9m7H6HP6YcKHhBc#b%1<GJDT!^vq^Fhq9+GQ)rw<7
zX>L}VIisp%;TckEkcu0>lo@u995$<*Em;XNodjTiCdC%R+TX|_ZR#|1`RR|`^@Teh
zl#w@8fI1FTx2Dy+{blUT{`^kY*V-AZUd?ZZqCS4gW(kY5?retkLbF=>p=59Nl|=sf
zo1Pc|{{N4>5nt#627ylGF`3n>X%`w%bw-Y~zWM_{Si$dc82|=YhISal{N7OY?O`C4
zD|qb}6nLWJ`hUyL+E>-;ricg9J@ZNYP(x(Sct&OI$Y!QWr*=^VN;G3#i>^1n4e#Je
zOVhbFbLpXVu*16enDM+ic;97@R~u&kh__kgP#!R`*rQEnA+_dLkNP~L`0alC|J;c;
zeiK=s8;BsLE)KbG3BD&Br@(Ha@SBT&$?xX`=$;eeel=|R_dIr6-Ro?=HEjnsJ_b`1
zK6Yg^-6;^2aW!xeTK)A~3Rm|L^FCHB_I>jIju7ZGo&N_1*QHkxH2!<tj^laH{Fyx_
z{&=J_f2vo<Z;k$M1Ir~ug1#5Ga52L4CpFf22cxv6fws^ma=KG?212=Y!jNISS!|Lb
z8x-AF@-9``d4}WvRUse6F;u?af>!%@o4iZ?vntS;&zJdPe1dH#04YD93A44o-MpfD
zP{rn_aq>U%RDvC2+bp;xPlsOzauIi3*Lf42`jVKK<K1>ZCRuKdYhi>FDuL<yU&41y
zW;YPPNe&8L>2l=v{$BCN#<T4EqS^BZve&iW4$t~r2^LU29B#Olvb3z==K0V~Xm$T^
zTEZQM|D{j?1st_dU8g^<gdhmv$NdVXjg~&|9i!p3%#sZ~>Q6796s%r-AG$Q^t(3c@
zD?w0UhYr11@feiyl9kY_@H8~|xlmO<8PfQmj1!$@WieW@VxR@Psx<u&LsG06B}sH+
zIY;3Fh+6GQ0@)pP#J1>fe-v9WCi1+f>F4VL?0O~K7T?m4-u|pSkBpUJZZe*16_wAp
zSYZ@;k`3;W3UHKUWc8QeI}0jH5Ly=cGWQPw(Kr2fm=-5L(d`lcXofy8tJY3@Tuadz
zYWXR{mW7XT!RF#RVCe%}=tM*O6!AD3^(!8un~opNI%Uko7$5t@<8+?<JTSi(aPRTt
z&Ml{N#KaBO+?nu~`4Q07^34=s`MzQHq<x4YOM_H9N_$hsJ2<doMH*MCk}b~+4UINa
zTwL7@3kg)_0*#Q$wrCkv#2-q6kYzsssFc?p^mKPeVprz0gBMiUOMbNTyj3-qRER>;
zTxDys(MyyGsUjtSu9$+|_-t!U3fVb1dkK?l`17<+jfl=hrBHnDSV>^R1=TnQeyqbW
z>ov#l%!1|S!1>8UUxIdhQq`_klcHVx0{?#>K3#$4GlXncwldt!g17TcvKq-jo_996
z>oA=tH9CqRl6Yw?Uc`am!V?lHJbizOJaVaScf1U<ZufBT_PTSYy1Kz}Ee^oj{1{Jb
zPK-`C@vPnz@)Il&x&GuPat%?B<3q8e7{hr^F{nmmxEn(YMpk7=cxlRqBY%WZC*EF)
zEGiQ`?WYSV^pF##Wu_nvJYxLapR?xP7cc)>P5e7Dbgabq=b!B~T&_F6?ooU>w%x0A
zH~&MHJ=q`fCH{U<7MDXE4SD32cDZA)WJeWkllJ`UspWaS#eDe^kg^oU_A14UE9<xI
zdNHNo$`_W}M5oe9+e37~{LsytH8U$kdU4k6Wd+jywdyHFMcdx1=~?-!S7R)G4c`N&
zcWK7v+_<;uHA$qDdzdA<PssWlx07Z!S%-(-yIKQguM<#>zG-a^g{xaXf$})Wik>gT
zl#dkzGr(;h0JZDuFn(+k8wNq?PZ5grQ<+sM?wBGt@JnH6v0#or-5wBQWKU~(S_&GT
zkE!tc*ZJ1Y&*p(xX84POb3cClR<n^{&58_5a3*@tLK%RDE@eA8<N0urSMl|?a*z{{
z|I<QGAb!#~MTWAsI&lS{s3^f%*Cq>Md!^qJ#CAZfIepEj-<`VURS_yCz0(?*Ixcj4
z-!zV1_QZhpm=0<;*(nm+F>T=)o?ep@CK5I%g^VAA+RB25ab?7)A~z~egru=I1S|@v
zH7tXV!0wmGS^qj#e+MY;C5eUjEAp$Y?LDkS^QPZ}8WN85?r$u<-Epi;yZ1|J2J`se
z$D6DpH~2F=eI0B&=UFAUnJvZAmClJlK)sutJ?M>xpZiWV&0=G4MZP+x+p>EX=HbCz
zxls%Mw?*u^;LbHWIWCyq+yi)`GmFn9J112CZda_u@YIP%i;srFg_paU02Ifij*7}l
z&CF-<n3E$I?p-QiuZUl*H!IK>(3|>*a|+vbNR`^RP=9G?ymEJ0Z~)d&c*UE$UMepZ
zcITr{0WqhxkjUnM15js_gW=e3Uh|y6ZReaXHIz-=p`x5VvB&rH9y>Amv@^WmXFEw)
zQXYrk3feir=a{jM<dzdx8r#aUNr$FCI&rFeKEw-pN(>Q+wDIkkFnZ$k{sJakHn*?u
za%4b!00ev8NVLM1TY=cl?KB&55BY_MU-sg?c>=Dbz_W{(Z~c?HJi*XpYL)C6Bd8WH
zt+v-#0&o~@t4qESi*)+eW%@VD0|o^yF)n0hM<gQ+PU=IAxwIpk4S6|%K*&)iTbk{k
z!-s&ZD6V|2;CyHbFJb|~GXX8L?gXzy++xyz?IYV^U-~qRctg`i7PNG+E%K%rj1#lA
zgkh1rUqL7W9)e)2c+*k8wKU)vO;P^cyrum5UZ1j=T*)ids=e&KO0D*dbQW0q)Dh%0
zC(oDisK58>E$UtXF$*Lvh}7sso{`|pn*JDIy5^Fm3s$5*zEE=?u5<=l8FJc3r%+H}
zdfoNl2J0^~!-*mOL5o-x32|e0Im*E!yY7F7E5N)W3>+v_LBydlEx?4$RL5f2oYRD#
zaR0wv(-p~wO0eLDl3K=%`{5+0Gd$ktO=W)gWlGZJ0`K<b1^|j#Ha<G@=c9EaeXzf;
z&txt4&rY=X-H_Lvj5eR1K_rweFrPjPyTDZN(Ek|onY%1(4Crv_P7LnIj49do8Wd;~
zCSHo|+D@^-(re?Y49f$%^lZgf&fe1}InrGRWcnc_=giCTrGmDRo?m;MF<+&2oxsg>
z$_RNA=ckrfa;H0KA~dR^p&#0(p-{x$&=IACIfoAR<Nvvu>!za)F-^da-t3#0Dycnp
zwO~NVXwXCl;jE<}>%@<pC<8PLRXJuvO4y>xz|=8fIJAB?>+E{7)|4l${4ngA3G|=r
z2Dyv;VVWSgZx9Wj>qUjleGl3Ei9K4>h!(lPS%8VOG>Xu0%6VDz^O=bjJmuP7>DeUv
zrbI}MlHB^^d?{zv6d=@_ZD2lg1&G7UjnVN{1}9WkaM3H~btX0GtSzB+tZ^qRgWo4m
z!GmimlG$=wgXCnr6j@m<1gAL46#T~5<Pf8lV3fF#Ppu>Bnm=2{^@>|t&`9mkEPddj
zAvG~@Tv~TAm2i%VW}R-g(Z0)z-Y|szHr@rk>4MAyG*Ma*7Yh#H7(!-5>DZ@8r;_dx
z{prSe<>~099F8vsYd2xff7uAS%7{S)f(|@me3t2$iy&NEc7OUEchp@<t56P}GC(ea
zcuDo~%vU?vTqAIp#flPNMI2`5;t8&98^HQwmsAsoPNK_fx^Bggb1WVe*N(RqH8#4x
zCQN_rN*!W39u4A)O3%B(eX4-oO~1PlFj1j%A~@Wj>9A|X;;IA>8!oX+y(BKJ$EzV*
znR$z;!L$s7uy@{OT~nG#B!NRraT8(X##Ho!0r_o@gg0CA-9H^;-uE&?$2$nHv_00o
z%cbuUc-tCx$Uh&EZ4Nf4Zgqv)Y6>usG3>GeQnxx_Z6+PcbX-+ysbt1hQ`K1LDpOE?
zrAhIZhSN9yVIAOa22gn577tbc&i3|3V8NWy&!tw##`}9*x}gtI^h1DzZRA>UuaJG)
zaZ7j)dq!O}{?#8Y7~7i6fHh4{`<bqO>pL?>-18|p!S75Y#^DM>-S3)vuZG+Q7l@ek
zQP~#cBpWgg#mApc_sPYjpw8odQuRokmTkzcNl`^CcKB7e&;zViV;{Y{o^Y$%7i0m#
z62%#1Lq!RC?}lK>%mp}T!3Xv;L*0v*>USLm``N%>w>@fwC+#T&Tx2bN4w(20JB}oU
zuSa6v^kXi0x<PZ)*(STjdw~o@Rw-Or2Ax?U!if0^R7qyS!JDYN`R9pBUrvi5{Lgqu
za0I&Zd*A54UPC}|lJz-2f!1VYM+A-ElFU{V`W)8LtCk5Mx|)Il)$QH*gA63%JZFt}
zGq@eFEWXJ~R>Ps?pbaOHnyiqq6By1EZY9OZ^^QA>{q-Hsd&m`pbQ%8121aWG-F5xf
zlZ%;B{;C>X19|`^_?dVyCq>n+41w7|!tUS!{9rHlbhX=SZO5CQ^;!Du_E7*`GiR^Q
w)2!4MKjfSAeN<ek%udeE*tim(T?PyxRjZ^lIknLzS6Fbvpe_110000002i;2E&u=k

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp
new file mode 100644
index 0000000000000000000000000000000000000000..aa7d6427e6fa1074b79ccd52ef67ac15c5637e85
GIT binary patch
literal 3844
zcmV+f5Bu;^Nk&He4gdgGMM6+kP&il$0000G0002L006%L06|PpNQVLd01cqCZJQ!l
zdEc+9kGs3OD-bz^9uc|AA8?1rA#x4f-93WH-QAt;uJ6U6Yp<>o!9>IaV6aUZ*?W>}
zs4%E?srLW`CJh0GCIK@hTkrW7A15Iu<z{NI>%N<!nR<>&?Q^$0+!{Tv&|t^Y@u%!L
zglTg&?Q5q#ijZ;&HBQ?FNPp;k3J5!&{^+SGq<pNwB|u%pA^-t3!%mrgTx*^S#Zw_4
ziE?C>?AX~SiOM9jJMRpyP?RCr@z38AQyy&WRMaC;n4una$~nJKSp?q|s8F00c9?Q!
zY_ovvjTFm+DeQM^LXJ#v0}6HRt3R1%5PT*}W!k8BEM;Jrj8dIceFo2fhzTqaB3KKk
zGlCLI)gU25(#u6ch6GeB1k@eHq7l{EHXv0n6xE#ws#ri}08kkCf8hUt{|Ejb`2YW*
zvg}0nSSX1m=76s?sZhRY$K=3dpJ+y*eDULGnL2}4>4nvW^7_<~wIM_5fjvwt4h1|g
z)g0Z6ZFq9j<~9~b8((~TN{Z?ZQfw|is&Xp~AC61sj;xItKyCHdI|tCMC_LbXF>~vR
z=w6V3^H=W4CbAgR4#xw}ETTwu2guW~=Crl@SMXv85jQ=%y!s^?m4PI0My7MWICO;-
z175jm%&PcPWh8QdOU(#8bp4!N7ET-+)N}N2zk2)8ch|4Q&lPFNQgT-thu053`r*h3
z_8dI@G;`zn;lH$zX3RzIk`E8~`J=BBdR}qD%n@vVG1834)!pS1Y?zVkJGtsa(sB~y
zNfMYKsOJb%5J(0ivK8d+l2D2y&5X!cg3BG!AJ}910|_${nF}sC1QF^nLIhzXk-Y#x
z0)&1iK!O;Og0Ky!;`<M509H^qAWjSb01!F=odGJq0Kfn~F&2tLA|W9aSuDUH0|chv
z><Y$E!fyEcTj8Iz{FnTW`OLS!v-~mg2YDxGX7|*O>b~v%b$`S4E&fB)1NB4v@8wr(
z&+NX4e^&o)ecb=)dd~C!{(1e6t?&9j{l8%U*k4)?`(L<U4S(5x*nimdWB<W>3;Qjw
z#w7FS+U(94MaJKS!J9O8^$)36_J8;thW#2$y9i{bB{?M{QS_inZIJ!jwqAbfXYVd$
zQ5fC$6Nc9hFi8m^;oI-%C#BS|c8vy+@{jx6hFcf^_;2VRgkoN(0h!_VSGmgNPRsxI
z8$rTo0LaYq-H5i&<W~DEeA0J5Ejcr=NoEuyBL(OVLAcB}X_8cB_uJ!s7cp0dRqVBe
zsUE`ZT_vw`#PhJ3GZL&MgceBX?CZld6L?=CALkxMG)wd*K}0qB5G);flh~+*<#sdk
zHVpiyxmjf=)gVwD(Othch%-?7mJ-JFN@GgN5H*j<vXzv;;EgH@{<`xp`bGWxdTuF9
zVfPw2|Mb0|{SR@<coJRz*Ldo7C8_WV2F~CA|MCG$;<8+wMv2K&bEOiLe$h{|mYTns
zmq|q&A*1?q+ixKWAASoVH!ZEVh`i*LG6iiJkbnUG@aX^m02AN;)E{3iDq9o+QQz{^
zE>gtj81=&xU?H-Y2==G@uQV7E`@+2E9XQW@{&j`?EOktk|Ho{HU>ZqDzvgjwBmdex
z&uZNd2C1h{{}2k6Ys9$*nFP3;K%u!MhW`uZy7Sn`1M1zs@Es&;z*Z>Gsh@-3Fe6pE
zQD2@cqF((NrRevgvLsvM_8;;iNyJ5nyPyy?e!kvKjGj`6diRFBEe49Oa7wwkJFV7Z
z$YT&DWloYu-H?3<0BKn9L&JYDT-SK~*6c5pi18P26$JESKRYj{T7Zk6KiRJcbv<El
z9J+CwC&)JZ>OO*{P56Q6s8msbeI3>|j>K9}Q9UBeq*inXKemCm`-<5|-$ZyN4u$(3
z&HcvqehFD%5Yrmykg-^d`=BSa8(i=>ZoC77^mWY{evp(km@aHqhUECBz76YiR+VYK
zY_avFC~V3$=`6C4JhfHAQ@DZtUOwH`L;oYX6zK0-uI^?hS$ALfq}A7evR;ohJHij}
zHSZdW?<e{2-WHa_?U=it9}&7kqMpjq1mSDIef>EKv9U1s4oD*<(0oQ*;MaQ6@cvGL
zuHCPgm_NhVsgp^sfr*ia^Db}swo1?O(_Q2)y+S$CBm+g=9wCOUPbz(x)_GbaKa@A7
zuI&!ynLiZRT#V%_y_-D`0Z5lT*auoe{(U5NylTzFSJW()W-#F6*&A`LNO1bV#Y;QJ
z<awv-I3PIiWGHhTy$}zF2Y)1sqQ<os%Ovgx8Kp1IIYp8yKG??*Ss|3D&_gso#&bcG
zAOx0jE$6M4Ta>SbLBnp|B^dtK|KIWC|No>JjWBWE@n7O)x{&^E(WMeMvp57#qA8m*
zeTow*U@_86B#Fm*rxyYu5<KF&LxRTn#b#-=V+wrM90aLp;^z%k__(dWQ)AGshK?G2
zG_7TEuE}qQ1p|pu9cXTCVY1=}eY&5#0^oi_6WJzXND#Il2{P2*Glja>PRWaWHx8y>
z*qmHEp(AMDl0v)ij(AY8fnH=~ZwwjVAbu*m5;xPf<qJX_d*%rb0I5H47@IVnb7S0o
zz2PY$`9p9<?MI}^fsvg}<5vnkl@iWSyJE|RKd<CD3n(U@+9y@s<I(?>idh@ov6d8g
zfJsi&!QyK53Es%sC39ts;54V68koALD4b|%tNHW0bIkZAJKa=W&FomJSEDT>W1xIX
z1x%Z>AvNIsSPLcn3RTcHXb@KB?cuM)=x6fcIx>&(GxqZ8w3p#jJ(GVgc*`c0HG}dv
zIop&Qim!K1NFwic%07KcjWgHBPUkq7f~lj;TPqVGTiT#cUeim>;nY`>h@a*S{qQex
zQ`z62WK|Mj)Y{tfF{;T4<U2X{`x?}US~MrE1C|_1&};NNy=Xd=->P;c8$Q|KU?Joh
zIk<oAxu7<8J8_((U}1AcLhLHd#;6?=ujo!ltdCtw#~hyreNq0TmvSJC6kvD&I97fd
znpE<a3v3nA{>A^z%X7z|r>4aTh@|StTi!-r1D!g=zb#3d#{{&K3CqE$Iz-UH<%37c
zRfkO`&uM%#AD3PHv`g5t0e^O%nVL0d{Xlx^EjEC3#skF@`zl-7PF^0oxW)1!C!JxR
zWvuAHH?)61FKA1QeT*_sY7;_Id#!GmV4n`<w=^Ck{Y6qCCnK=crd>MO{~sv}VLSK`
zXRw=Y=Clz*00B(5y^K;gCZMAzjT5+c3IC=)l(9VIDdatpxj3y89WwI|bH&$!ZEvp`
zPR!T@#!(|KfI-w?!&+7$N3F6>tD{YO4Qg$d_`nNEdfVCha9vaPn0jI0`)`@*72hq!
zp<q2y@kKfVrSfb}8vmw$SopDtXNL>U5ND^P*RoEkbD5o#az(-g=Y)L>HH>O<qeopz
zUN9W@%YIO|oPuhw|3vc#<KCMY=x6o1bq4B(<v$M-V#@J4x8rW0u2vp3d;J)Q>c%}$
zT3Rs_ih0;4+Lv4Y;@Iv(;fUbQ=i-G(#>vghec~*j(I#r|5mqFiJBpzi&hzEcD{u$<
zRsm0BVYn=pT;0>R(itW|*D&;O%bOc7et9ACaH#J>z3A<mlHC6`?wC3cPj=a+0L!KJ
z29dbN4hGxn(vG|*nDvH_Gu%A>1A~6fdP>pmbM%xzm4>|;c_?B+%sl;Qs2{t!60$^u
zH1t@9^6>;?!FuusnISi$f5CL&;z?EqJN$FBuWDA#D5`cy_UvCFIVvf{c?4N0teh;d
zET$7aVbj08KTQS!x?Nd1Is8q8qFzs}a=!@nJ;7FSfCY^T@D-gpw`w<6e#X3+;O}1h
z$%I!M)0bg|EKUA04Qjn@+x{Rj8vt6Wn!R|3A92z}^$KfF5(#CWr4y#~re1CN4i4w0
z#GsypBR<e;sgowNDv$gUgnDd>{xA3Er7sgAi(|}1-W?s~n$7?K|9WL8kpVfw-;#b9
z+mn;=e<xV2z&$aXbbB^9!5xN=DIomsyx0q9u03Cg{>p!162U5R>_t}fOt~tE?s#m(
zO-S$7>Ay6*hHdZ)7_oU915WYYCIX;hFI-U2EWYX!pllONr@Q--2o~`!<G<U!Wm!i6
zcOe$Xm6I0E(yJ$r-ME}i2`)znbXd1p52N%TOsuKK&9}G3_UznkOzVC5f5D;nCf)Z+
zj#uVX)+?#DL<kaNRk~0wN>isi6vTPLJ4@(|o=<RrQ3C!v$5WYUUCW7tGYI}Ga=@S6
z#oVDLA^DrRJ><U3UOnQXJ$?>%NHYjo0_S&q*UQIROw@*N-By@P<Aa>aQ&;YxFZ0aR
zX&}LeOEz);#m~Hwm^VAY8DK}b$F4bo{jMN?d!lxKPhNklzr^Cd`0f4oJr^z=I|l`*
zm8AHm*fPV`0=lF3Pnnp}&J0N1X@}-D94YvmUabFrLGSnTz7Mu^21F#O5tN#CuY9Vh
zUZBH=ez%h*wkf0hBtXJh1SN3d+IF{gzT7lp)j}n?03lt;XSQRAh7qd&v;RwTYDuQ#
zbI2*r<>?x-G0@hM{;%{VBD7nLKt~D`T~-HAt5;h%i0_=Ifs=yHma5dhJ+QMG?Ux(a
z|E?1CMy1!~oA`FP!k~iG=t&5#>bVdz=peT8HMB6Y)#7PpETtNryT^+Rv3vpJaF^zP
z{H}0-LyV9Fu21ID%wO9f1IKlFr1p4c{o-?03vyB-tr5duk^&L$;m_|f$vs`^Sl{j2
z95}oY{LlY+=ZS%J+tZoXCd0*sSU7w^gjovXn+g7uyra5{cU49@yHf#Z^Jl-$9cIfo
z+AJuxH$VLb=#+uBbVmUjn<pB8s2*J`I5CyYgqeYUoxo|zGhX;tyDo1a#27aF@cZj$
zgh*)qH$l}mt);}{RwPfX7p=vEVccsmWhYwNX6Is75w5D@Tj;I~X$WiCH;n&HX9}>x
zxb1pZ@-O9=AIk4@S)m6fJ2?{HrNYwwnL3a45muuNjr;6$O`bGEM0T4A2_S$t=86*-
zcO+0mywg*j<MP8}9*qyfJ7GqMnvW0dCHIXpIOyq&xVwY1Hj?9}nQ4)L0000000000
G0001O&w8c+

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp
new file mode 100644
index 0000000000000000000000000000000000000000..9126ae37cbc3587421d6889eadd1d91fbf1994d4
GIT binary patch
literal 7778
zcmV-o9-ZM*Nk&Fm9smGWMM6+kP&il$0000G0002L006%L06|PpNM;KF009|=ZQC}G
z?WFVnhub3}`X3k)f7gJdHv?Xy!R81AlJ*B*AtF+%2T777MNUTbu9%sbnHg^^{r@jg
z*GbiFHdh@YCSU?QVcWL6ZMJROew>#A4mU}enR_!cGmIYQ;qwfchWtFEXL)AK%*;=j
znYne+hS4EMy3S)C*mZ1KI>!+)0V@9!N<vFAw%bSx)5&s%!VB9)5>6H$Y}~MJ{rYuf
zz^KljIWvFi<cP&X*lv%IdKPZD;Oa}RxZ=WXTQ_f5SBivP>-?#?V@LPR&c6Nn{!=XM
z>}-h$S76;$H{E{Y%@^zlmOl^efBwa%UU+jJD9UVukQ3ti_kH-?H*RC0?M1W%FCvMB
zM_+v6fk$6X2sx)-p~B3&Kl{nscK}pNLM*qjtpaf9>AU{-iPKQZR8yCg!TY}Qg*(;)
z)gdvCcB%kppZc$VdvsK@)3l1{&DG!d_6OHOS`y=ITLEVu`unSKA2E%JD*DVX{LJ}K
z9l>hMRDqxQh0lnpGHpVYneX}eA3Pt|2v%=q;rt)``R|#bDyB)OXY&vI_@|*}h}G?^
z@aZ4_!7cQPX`!fW_?{oT1NTwHs#l5L-0`E|y@48<3Q^HFf8=Idi<poq)!h6e-w-t>
zpJYD%1MkII!~|7I^WGo)IF=?{>ACnjJ_WUi39C}!Q{QnheVJqeKKqq5^o5CBde(g9
zvw$X6^jz_^E2$wSw4!q5*RG(C2_^XO$HBn_55vbl44OnTTRwRaePP0vo{K)U1#99&
z<>rq7V&V(<&@I%MFoN5zrY}sz=(*-L&}1QQ*a%`u25h{cFj===17eB_uGuzG&byQ<
zrm8BJZl4r_E$3k|Wo6FW0-6M7>qac5uFQsQcmkLWGfeH74S3Z_rJ!jgN++!@i=HW8
zkyjI(oPH-+-N#Qc^-mpNO`bc6r=2-<%&Wy5K1vfFJB(L_IkpS6fY^NmuL8qsgj>MD
zn~BHH9WM~32_3vd=W&B)k7F9q%stJx+b_L_X-4zr^LVUMCmyCTA3sWtkvsmME?Xiy
z?xOSfB=_$oY06~J-HcCq&)qcW{j;uP;?Dm}=hkq?zh&n!;m((-G-u_t|6x399Q;>A
zgNpxoJNj{u|MFDH7Rhq@FCAl0dE|ddnl!oh9{Lq?@JDoR6L<VshF8r0_5hVetvvR3
zUa9QP{tlg6#T|cqYLF{a{Z~(rG;8wQAGxkbcBg-f;&yT2caC>;C941IK`ISfdE$4S
zE0AUQ8+2|Ncl_q5QkSp#AODp~(^mfP&%Au@@|TBQwoP`UU+V{6u8|)6ZA{~uKmQ*M
zmrMTDU8S~8Eqi{^v0Ug&5Upcm#y7Z1(RbgZAG8jB$eRwCspQ)>5;U)oGZ&E5aeR*K
z8Yt`Y0$G))Yd(Y3KH}tA4`-_QmNke5hU_|nq=xtyjwW(_o<J(bXz&TLG*KqE+J2b|
zzGMf@yloAVGVyLu8$qUB0*aL7J!IELCX-VpLrK)~9;`MJCx<$?q(odYLqjiF1(aQ#
zL@ODYw5>?itz>B>WM&^63bNdQ)k@-IgDHW*RW$Xo9#R<IvZbwNj6)I=m!3rJ1R1ab
z2r2SX+N#$AB#3}6!qHGpW<lbPOR(BWoXkKL%kIL~nqp#++Ky;w$go6AM8rlKdq5Y2
z(2QEE+W<&V$_+GEA2Ij~w6?iAbps?Q2F=yh2@>zrTrCn7L2H{9Amq|qNg@#eZY=|P
zCoI?2s+L)zsM%WX(NbVEY^`C>lFjIBYmJ6@DKJ0ZT4&F&WHW!dwa%QzOG!?jY_2(S
zDcEzZbz*2Q!43|z))9yOP9X1Xt%DXzwY(3tl-TR=Qb_MbZYRrooh;dYYmS!U<ZgRO
zPVYNRQ_syhy#$k<o5k&9_8xKKcLFP4qp4@lDp|7eON3j=!K=ngvNK;As+}}?A#E=O
zoNvBGL+^hj&C*@-@GH2L%&xby`W!OyNy2U9;JIO(gR%4JUah41RARgoaLwm;(ad|F
z%xacy_j&lKc6#Zp9NA05srmrn7IN_DDAJr$pN|}*jW~LL_UB~W*EgSRr5B&8BjcrE
zSL&UF+sDEEL#oZWI%~cEfLcgL?yQ;STy6LL>_as1(=YVB?Q_A|tNu5Ut&_q3jbfDM
zoFxT^uEuH`nX3*sB%K?GuHUkweYReBwnHqh3P)~`+s3+Tj!rDA1e)8vuBv5J*IsxC
zkd^~b(aGzArj08{>cnzOuy04C+C`}gb|Yz-1avxeWzev3NzcHb<pG3uZzt6%N_M`H
z63Z^ZKqoGZc8Lo{3_x10h0fhGO0|hnn_f$^(nSX^2^uxdKSsxjo4qPli^yf&E(~ZT
z1mV|r$Sq=R+vNgcB?V-eJF|f%of#c231}t2Hhy-ks#-%;>z_&4W@QCr$z3~w=8Ua-
z`;vfG1~BP8CyLb=F7t1am~ph_#|O%$khSJ9%Vtcn)YmpgQxF?xM<vI^;GRAEI=6(o
z!@KAW9tUBYeDbWUR*=;{nzD_?0kAXj(FnJKLyxD@W^C=OI{Dn1XoVQOdR%qPoISf=
z9>^_Vb+5fnpB^W0I`f%X8gb9#X{Q-yJG0{Z56aWeI&zPxnf5pdJA38bM`cYnS#x)%
z`n1tFf$i)W-hGm(f9mde^=X@NcV_lFb=P`4&CI&H=IArijGwdCk&X@uQ$5xmj!~^?
z#$ROCI)V-~t%L%GS#wo@U27ddR`4`3)WoB{R-4snfNrfee|kI8^bu#yDgYqOwas9#
zmcb`3!kRJ`Cr=_tq)8aMt{aGtUZsqwVlj6DgCGre>AEt&x8H_in!x@uwgExIh|-mA
zjdaC(29~CTVSaaF7HPbql&*9Uo8P@f)>LqCXclr}peS7_1BQ28u9PO8Eq1@`l3q9o
zkfKCaO2?T?ZyA6loW<#9_c^O=m<&h}CA!ineAD@=(gbq`vyT|tiJ6#^B1$P;;qax`
z55k&Q?wEh#87niLo*+n4L@65J(Nz~=Ya%7^(miLb(E>A3B@|Jjl;FU&D>o|9#7PJH
z?|a<zSu;Ip07(%g)WPBHm#+z16D28}dg#ALW>go!o;WC^h=|T7PVBg(DAB}72cyUS
zb(f>Bwbr!F1eTCO5fpj<{PqhY5>143p?~5ZA5H40);=@M#MYvrB6gqHbU_!GSY??i
z%s=>-ciA4*zOOZHds0a(kWewZ4h(k8h(ua7HX)Au&mY~H8KY6(_cb$_<O0w_RIGh(
zj5b~uP$jJb+Xd>&fA@QjIW-*heP3%$d!m5^AdnT}`12qA^c@!g3DOwZ5WwE2?)-yU
z!)Vx#Mtxt?FzFTwK!77sy7)sMzUd->w4^bxtpM2j!b1<f<x~!bqtR&8*R*Y>pjgyk
zGKwWGeb4)^zjy{9Es&PU1}gwg?|J#L$KJB7ett9@4M%-nGtIQr0>Fl@8-yh`-+1ed
zS6r}(MeSvgSoFmH*_WPu@i?}!AB~2?;i&IxrkNg~cQ9Som98tcq)k^|eeER|Zl77t
za-TVUc;DNvzVXJ%w52+#weN?+;i#{f#!Oc&z?81*N>^e~ltRS%ZI@lR{rs()HmqG!
zx*}ZrI-EZ}ckJMiy>A^oofwDfC~IH)z8{VHKGT@#E5I(Ll&+MnMCl>~AV7+>Gi%mF
zkU1QlKASdR0B80!YhP<$Ywi0?W2Ux45oPfxv9QolWzJPD^weBfvo4<Lv~8xkBt=At
z1tlUBk`xLcfCSQM+v&`#3$kXW7iH=TEsRjnVxh%BfWeFBVy@2gLQEqHp@pGPNU;b4
zVK9rNold70VoXyCgwUc$LP9JwHn#Di7=vk2fj|g>SONxP3<lG-Vxd@6fLYWmG!qwA
zP&gpY5&!^@QvjU-D!>5106sAmh(e+vAs0GboFD@PvNs)jNPvarhW}0YliZEg{Gazv
z+JDIpoojRVPr<*C|BTq<`6ga{5q^8^!|0cxe=rZ!zxH3%f5ZO0cQ*Z<^$Yt2{|Ek0
zyT|*F+CO@K;(owBKtGg!S^xj-Z~rga2m6nxKl9J=fBSuNKW_dLKWhJKeg^-Xe`^1?
z`TyJj)8E!#>_3Y?uKrwqq3LJ#SGU>AzUO|6`nR^u&3FNN_j<GeeqH_3zoS&&2>GOc
zw)Nw`wr3yIKhgcee6IaN=ws>M{6677%)hPwx&HzC(f&u~&)6@b2kNRzBDQAP0*H73
zq%McOmRk{B3i47qRe=DA*$&odrbEJZ*pV9XXa&p@wlW~@Yfs>V{yiTtplMhgM*-Bz
zsSnlq&pG;z0OUN%$~$<ZO!D9T#`!1$`I`)uEDsTp3AbG(+{8$XAm|$7F$y3bNSK&o
zhMQ9>3=g1UF+G*>+17eRbBf3=y79J}KR8owon@$1Z7MIrvvWWH)34nK2SD)GsrJ{l
z1Cl#oVo3A8qY3e=aF)qzms~FG#2$LzT=gs&aVMOj>(%{y<&O0cG!nCiESl~x=^dF{
zKvj8F1K8Ng171wwM5Fh4KoQw`_c6#y$(5cAm7e}~nJ#A*fx+c9;y#&W!#VukR)ugk
zK<lHF5iU?+a7q%LIY(gu+6HC@fZla2JM0Ile!_1KZv9N%EWfH8UHOSr(*_6U#b-Cb
zai)>p3=+;Ut+IYn%m+r4d*<`L2h%aDnX5}^!5R|H;(34AoVWjRx(msBZvk;rCI*|~
zdOijqI@9Z{Vu!~jvHW{lBa$rnl4+!s_5sfK3bCGk-B%iDe&@-}<f8H?NUz%;&9H88
zKeI&VsF;x;0RI0CWD-A=n<aDIbr2zA<Y!3Wi(DHhnBH?R)$`P~*0>+%fOKU|(9?V1
zHE8&@<R$bW%n4d_;X)D(J`BN4--OoA!GW*A7BtPjaSmp`zgPw*Oe`>4z)Kx!RAvAs
z!Wic9=o#(bg?kc-G68-m(jZ`^=XGUXb)}t(%&~sjFnV^sEX%hSy6UKC4iOhgV=BHV
z2w`4g7Y=s#Vu2B_?#VQ|hP39@eArgfX>-0S+dd&^mx0*wp}>)x;c4RUgxz%;oNe?&
z-7-lJ@Y^2^C;=qJsxx5|xF)*pTGhch2B&kxtn;f!7=gznk}I3}Dh}(CoMX<eGe%cp
z=v9i^xLO*DOYAZWh--Ne8Y1JFpkNLk|K_#vEpqOoMnt%@<hp8sD_<1p5We4-TpTv=
z@dBVR@NqKZ79EWW+IW3m@25-^MwFGYc|3Iaf{t{r;5BIY87t(~JYkd-!RZM95t^|g
z07?EzPs4Z1gIL&LXZM}_wC~D}fm!$9AF#Z|NLd2|?&*W35Smz$R&Hh=C8hAKESEx;
z7UL1wsQ2@>gA5-p&kS2<sXj@I%7<}I553&2vzZWIw);>02!l?!fT3t|HG*rIP~mS*
z$Wjo}jq3}z$Qq!9yrtd3fM0N629ZM?<L02(oRsk|cKnS1tXi7sM+ObQ;AZLyiGDYy
z1RgK8pSjl}{cQh;nYY)=9K%s6{tG&%9FL;!g~bmGX~a4g!n&7zzE^gC-I1bT&W``}
z66$KuBZCs7b+dQQBIP@BJSdX=5219?|NB>LU$nv@Tv9b7I;D|;0H2dsA~g7Z7zp1|
zB)XmrkMgF6OQr|R)HHD^TE{Y#j!~SR?b`Xt3Qs`B+x<<kW!i9<O`?sx%JHr)b{N_2
zsIq=l(WQUySmI-3X^7>hxexYeAjMUWdZ-*n9%(1)Wb(n2U<><7&9dwGJmrob)4%H?
zlQ%z+L-^$dFhhH|@u$%97Qz?*Ynh2VG@q|?8vY&L74&fs&_b&3$x&Oyjl~LQDRRap
zJU4U*R+(2Dd!G+lh8!V{<r1^+GAeYtGH~*MH@9IPqULc;?zD%ZNz2PCP@GD{4SECK
zPY*^?z2ea0Y)plNuqxlsmeQ^&V)zAS)RXazR|EI17g$lgY~r6eW5A-QFMHbn4F^J8
zK?Z#1jQ&ia6vN5$+;lZLMvOdX!IncZ+^BZpbtA`^!X(k2teqsW>pT_UJn+^1Qg6$`
zqkNm(a#hWyc6SP+p5=C4HL8-m`pO`5o~`-LI?_h5CsH?F_%?nDodmz&pWR20WTpJE
z?N|wSzLjMUK8E)a2tI}Lf<e1!ycmj;OhldY>;+;*M|h3Y(U#>)g1>zk9|Hd}oZAa2
zLYBWBoSW!Ts!RwXr^8h+U*@{9{zqS^iH)O<vJb;bYH<NbE9~U+1jXCB%D6D6++2OF
zC8hT}ItR8a8Ks4QSsg8TAvp2qTg7+tOXd=rH`PP_B@#$Ony(BV|E}YZJ0sKl#WIN9
z;n_@S>p<;r`Uw~nc}<^$V~_i%$GFjaG?X1@E|M`h)nekvFKt`Dh-f>@|0-`Xoq)o`
zx;JmzDfOV9qCx|EVpogEe0LK~tGS?5$$L_i6P$P6wIsCQaP_;d{{N=iV@+8LI}o#(
zvo*Ejy=IIn{rdIQh1&q-{EuohpVOjJ^Q3lD*YTp37$^RRgn8ihpdu5{Ct%5-KO!VL
zcNB6dUajXI9jkm-P|i3~GB-A(X`P1Oqqb$tcku<Vg(&6)R*R}%pmBmf#me#Ed}K@H
z8>)UJw0w3GeUijb__#QT4j%64z%EeB7S?jlWwx_7&+EEvB|6N=kV}DwnyAlX=?j`)
zmU#!$*^@NIu#n_d7;WoJV@*Fbv9|yJO4;n|BNF2xy(54RyB>t~8lUOUW$&2%Nwi1y
zx6JxW88>U2$#qhl^6KUbtmg9}D0o5vYDT7kWJthLGkpGnN4T>{St^_EU>4;DmLF9o
zr|LqsA8_MoNLQ=}w?8u!ziSZ@PC#Y<#9uJFo-ozVo6D;<8j^1$c|qAE3ZTE5i~zmE
z$BU5lw6l=EWsg^y^;8>r9qH{xfL|~PZYK#md$zZ0?o11gV<*WSW~cgy2GYGQir%wf
zt4iW8D+;s*;RGrmd(-T<@2&j(Cb9xhV*l-x`TpK`xq|7p?5R%5*s!69?2c!cC*VY*
z2DE^9pvOPLU!1e}wA8S8opcTJ3`NB>hY=JQnL~QFXR4K8A$BqJnoEB$wn-%u@E6Mh
zCfMF4kusv3N!(aHC}4)Xs^xoOwXd%e^6pi5|DZo=Q25j+6HlJ^7FodH6y1bMROR^q
zGu6)fopS`h%Sw<;ZH%TEPf+#81-#_v+@8nlR0jLcIDKQtLleOC)6yLZgC!D9X3GgS
zohwU{v$jl=quD#Go^hB{`@Qw*a%`(^jyT~=q^bWgGzRj;|12J55HWdCWV}EB|K=%N
z3Nq-qxJJ`>^|1MNN+q}zTB&ooE3j==AgK@^UW<^oSbeALa2peF)Th6{@sj0KyMNHZ
zksk1+MXN2tv+22A%cQOGpS9)77(uP9mh+!5T5ERLvF@b}$+WvXM45Z?-kCa)fb~f1
znVbTD$Gx-0Zxc`0D@YgHakge6SL0H`-vN_x?AP0>iGH0_EE&=v83hMJgaKAI0jJXm
zVxVz;X<$v6WW7}fxROO7vr#YLP;;lij5VrX{;>7kK6TtOH&6|Ar^xo>00%+u$C4@#
z>!jOt6*3><171+WxoZnKDTzJtDRw+T030;yI}~uV@9fCnei^I*j>Bp&mzP2d=FPb_
zCM*l_+$LDR3B*a!A$g#>xsrZvw0lckxmMg>0aQd7tPyN=t{dgXb;Ie+T8{fZH=gdu
zM7Rg9c(kg(Jg0?ARRRl=AONFKrvFj)lTY$KfT%6^6s`mk*ABGhsce*LsoD>K{z_M2
ziPpnu+lw22PfF!CoId^6n*G4H(Ix+#+N{C(da7<o)nCVrQ%K)QqP`yFXo7PsA<-DU
zVMn^-y!SU^P0>t1BYMGEaE#PdpOLxsVD5riQXHp@OX;`S`8VnpM~)I920w~<3|mo0
zf8~Az`*?2?H&gZ&*K&bRkV@qzvMlRHXys8*Ze2+1c?5o!^+$&MHxB@4Ee5cke52R!
zmn7AZtY6ST%ixgU5)%$<dO~q_W%Rzmn(4tRfE<xMHx$P1`u}U6@H!GZ8tEEf&cv?)
z2u#O+2S1%b{)tq(t>%QcwHj7Es-Qu^kLAPwy%7pGBw_4Q9#da^W2$}axNHr03)_nw
z5?yuNmXrI5HgS46)c5&}B)Tts49oU92>3xBLLy}FMUW=84DQbVq^;7_e7|(Sdz|&J
z73N+M`rc2rt*oSWu#7S{*s~nH6HRHJS1SmzeXk|;CA)FI4bat3<%}nkB<VHA4gqfj
zl0c&fw1Dm2e6sUf&4R3pS7y>%;;?=F>B7ms9QSxv#@+69;@>QaR?RE<L$*e~^=r_E
zM6(YEnz4sUr&1M;q>YX4&)=itG>rM{<{A79Rmk)`5ON#GL`*KX%}Ihk3w(RtM-WLt
z?f&FLF}4N^yE!(pZ&Yj&Bc`~K0@4_}*0Om?wN|}4WJ>WL;G^H2*QpgEkGA~OET-Km
zkwz|5{6dnz1U<2Pe9DNL>3g5FEIvp1jzP&2<zv~g6q4yB4PSXe1Yq;eeDSaCI$tYe
zd<>K#z~j%g6!7B;^zF+o95?fV{3mnB8*RMhCDNp>Am-3e@jNfMj?jHV$MWjk!DDKP
zkAz$Y?Sr)!GUOX}qTQ5aMh|wq1uq}~joWyKl=b_LboM#w<m`%Ex?PAOCx}KyqH|0m
zMm>i{CMuz5x6BKlA<Gnnv$B=BB8%!h*H_i-Tweiu!rKyF(6w*ztog$E7?Dn;Fsr}3
zwL`Q@oV!vslT%h4VY@}nshA9|>-<piE(ABvkYO1QD9p$yEigj)f0Cj)(&2(rbxw!V
zM%K+Ek6bSac+S_7S3O;ceo@ZQD*wDR2Tdkd<OJ+c^*EYsqI1UL^Zaq0<O)p`PIMLK
z$1kyCgIO}nO`jTwAU=at!sp{m4~1u%tP8UWy5ibk$HVQF2OM{>qy++cM01D3b7`uD
z#l6M4pI;JCypO8<S|y?OHJ-^u$MQEUXk0j9S7^e0R+yzxu2rgvqnc)8!Jfj(0GJ|#
zfKI96iqjA9&64W)LsvsI)xDh5KN*z0vDJ-~+G=~=<hD=9tEx-(&J83f7aO9jLLwyc
z;)4VHlpQ`2zPH@0X%*RsWbnz+<jsLc$^=v`tAFMl7Ri{#5|T|4UeNV&U@X@+G+gki
zfR-9a$JT8f!5P4x41Tc%J^4K-;T$xK1`JU-Q{7rnzr@AVEUhJG=PT@Pep_x+ESPlz
z0tx?tzq#;5IlYwr`sZ)IA1-}@5w1dCdU(X7bVp3{CgA;vt3_>JZ6?U&wNxR!{4oB_
zlV!x9+-&Qy6{%MQ{~yoZGkKiTSC`YS_j22~G;xUV855g2&C(zm^V!(wpcm@zn{%!g
z4}JGo(s<W9*jHf`0Z`sZNImo*zS9^}e$Hhx6?SOff0@ASakX~#!(k|vo}w9fd(?cy
zwAK`)3tyun^cNZw)rZ*mX~fh|mazC{&Xr^!lQTy`eUQx>GZ1O~to-}le<P>Um<p!Q
z<gGQ5FG|(-vlFWdETkYksRqG0&L`FE-FQ8}8w0Km*&aVL&VPE3Z_R*=0!8ED0m=#v
zHm`a~(XYG#7=I=)B-;aP4B#qGPKdDR=l}rFl{hVhe};PI53gQSx3a&9v!900Va<9R
z={~tB8-KUBmq5Ncp~B2(Z_K}=b7a=UI4je&_uXB0(>Y2RIYtNPVDpE$%vda+HD#3m
z&VuXJ{BK&Qe+rBa7eq}Q(bq|tn(RrJAk|ztj2(i{d>nmQnM?;HF2k&9sA6up5tmjl
z7lySlzMbifH17-m-Lwa_F&e7nO<lMXsPt#CNgKF%HdwG@ztDK#niqC%M#bR!wQc6I
zA52LFM%an*93hR1a$6-Q5Y3MEutAX4S=G&3@BbBIaUu5=j(<^FKOPJ4u~mgGD`9GY
z#;IN>H?ESi3#ckR3tsM+jsck3`oG!uMS}|eAwVXv>}qxwq?QY%QJ0}r@^;fhuUA9W
z*BVl>TGo&N004@xSiwDUXUvp51sVmqO3m)=B55aPwf@0=e}cN+$-BdKxY`YrT_4)0
z_d10#i44Q*rFr<T(^i|y7FsZ?QiUH5fV)rQ^pCDAt`%;DE`N^_wDGgG|9V5D{T+0f
zLdvJGflLYa)DxONTTEv{RtDYn&LmiVPZ7_9xNeE>8MC>*)v$EJvz``(pb{e&*6k+b
zsMz%($|1+8hn8c2?P(l@;Rb&CsZeYoCI3?2!LqjbwPXW3z4G$Qfj=cT5Yb%vY0(AX
oeb?AaKtwrnc|$|zzw9vfv<y6>n^aJJ!zd)XFXqqy0000001=f@-~a#s

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/values/colors.xml b/examples/llama.android/app/src/main/res/values/colors.xml
new file mode 100644
index 000000000..ca1931bca
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/values/colors.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+    <color name="purple_200">#FFBB86FC</color>
+    <color name="purple_500">#FF6200EE</color>
+    <color name="purple_700">#FF3700B3</color>
+    <color name="teal_200">#FF03DAC5</color>
+    <color name="teal_700">#FF018786</color>
+    <color name="black">#FF000000</color>
+    <color name="white">#FFFFFFFF</color>
+</resources>
diff --git a/examples/llama.android/app/src/main/res/values/strings.xml b/examples/llama.android/app/src/main/res/values/strings.xml
new file mode 100644
index 000000000..7a9d314e2
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/values/strings.xml
@@ -0,0 +1,3 @@
+<resources>
+    <string name="app_name">LlamaAndroid</string>
+</resources>
diff --git a/examples/llama.android/app/src/main/res/values/themes.xml b/examples/llama.android/app/src/main/res/values/themes.xml
new file mode 100644
index 000000000..8a24fda56
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/values/themes.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+
+    <style name="Theme.LlamaAndroid" parent="android:Theme.Material.Light.NoActionBar" />
+</resources>
diff --git a/examples/llama.android/app/src/main/res/xml/backup_rules.xml b/examples/llama.android/app/src/main/res/xml/backup_rules.xml
new file mode 100644
index 000000000..148c18b65
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/xml/backup_rules.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+   Sample backup rules file; uncomment and customize as necessary.
+   See https://developer.android.com/guide/topics/data/autobackup
+   for details.
+   Note: This file is ignored for devices older that API 31
+   See https://developer.android.com/about/versions/12/backup-restore
+-->
+<full-backup-content>
+    <!--
+   <include domain="sharedpref" path="."/>
+   <exclude domain="sharedpref" path="device.xml"/>
+-->
+</full-backup-content>
diff --git a/examples/llama.android/app/src/main/res/xml/data_extraction_rules.xml b/examples/llama.android/app/src/main/res/xml/data_extraction_rules.xml
new file mode 100644
index 000000000..0c4f95cab
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/xml/data_extraction_rules.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+   Sample data extraction rules file; uncomment and customize as necessary.
+   See https://developer.android.com/about/versions/12/backup-restore#xml-changes
+   for details.
+-->
+<data-extraction-rules>
+    <cloud-backup>
+        <!-- TODO: Use <include> and <exclude> to control what is backed up.
+        <include .../>
+        <exclude .../>
+        -->
+    </cloud-backup>
+    <!--
+    <device-transfer>
+        <include .../>
+        <exclude .../>
+    </device-transfer>
+    -->
+</data-extraction-rules>
diff --git a/examples/llama.android/build.gradle.kts b/examples/llama.android/build.gradle.kts
new file mode 100644
index 000000000..50ebc8211
--- /dev/null
+++ b/examples/llama.android/build.gradle.kts
@@ -0,0 +1,5 @@
+// Top-level build file where you can add configuration options common to all sub-projects/modules.
+plugins {
+    id("com.android.application") version "8.2.0" apply false
+    id("org.jetbrains.kotlin.android") version "1.9.0" apply false
+}
diff --git a/examples/llama.android/gradle.properties b/examples/llama.android/gradle.properties
new file mode 100644
index 000000000..2cbd6d19d
--- /dev/null
+++ b/examples/llama.android/gradle.properties
@@ -0,0 +1,23 @@
+# Project-wide Gradle settings.
+# IDE (e.g. Android Studio) users:
+# Gradle settings configured through the IDE *will override*
+# any settings specified in this file.
+# For more details on how to configure your build environment visit
+# http://www.gradle.org/docs/current/userguide/build_environment.html
+# Specifies the JVM arguments used for the daemon process.
+# The setting is particularly useful for tweaking memory settings.
+org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
+# When configured, Gradle will run in incubating parallel mode.
+# This option should only be used with decoupled projects. More details, visit
+# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
+# org.gradle.parallel=true
+# AndroidX package structure to make it clearer which packages are bundled with the
+# Android operating system, and which are packaged with your app's APK
+# https://developer.android.com/topic/libraries/support-library/androidx-rn
+android.useAndroidX=true
+# Kotlin code style for this project: "official" or "obsolete":
+kotlin.code.style=official
+# Enables namespacing of each library's R class so that its R class includes only the
+# resources declared in the library itself and none from the library's dependencies,
+# thereby reducing the size of the R class for that library
+android.nonTransitiveRClass=true
diff --git a/examples/llama.android/gradle/wrapper/gradle-wrapper.jar b/examples/llama.android/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000000000000000000000000000000000000..e708b1c023ec8b20f512888fe07c5bd3ff77bb8f
GIT binary patch
literal 59203
zcma&O1CT9Y(k9%tZQHhO+qUh#ZQHhO+qmuS+qP|E@9xZO?0h@l{(r>DQ>P;GjjD{w
zH}lENr;dU&FbEU?00aa80D$0M0RRB{U*7-#kbjS|qAG&4l5%47zyJ#WrfA#1$1Ctx
zf&Z_d{GW=lf^w2#qRJ|CvSJUi(^E3iv~=^Z(zH}F)3Z%V3`@+rNB7gT<C4E+e^X1+
z079LInxq~UYf-kNla?M9Qw5`wqM;O{-8tPk0sfaO{=LZmzBQ1)zwMpO|F66HKXsu0
zsblVBXkugf|5Qc(cU5;MLk9;_r~hk73h82L2Ot0dCNKa1{eNB}WN+`{?DBWLtf8fy
zvWuaUi>VU{Bb~90p|f+0(v;nz01EG7yDMX9@S~__vVgv%rS$+?IH+oZ03D5zYrv|^
zC1J)SruYHmCki$jLBlTaE5&dFG9-kq3!^i>^UQL`%gn6)jz54$WDmeYdsBE9;PqZ_
zoGd=P4+|(-u4U1dbAVQrFWoNgNd;0nrghPFbQrJctO>nwDdI`Q^i0XJDUYm|T|RWc
zZ3^Qgo_Qk$%Fvjj-G}1NB#ZJqIkh;kX%V{THPqOyiq)d)0+(r9o(qKlSp*hmK#iIY
zA^)Vr$-Hz<#SF=0@tL@;dCQsm`V9s1vYNq}K1B)!XSK?=I1)tX+bUV52$YQu*<ra!
z!v%7ZiKpO7g;NmE(;dSwu}#Qr14TWb<rzbgaS}{2FVDKaeCbmIt`T?_&=oa1ox)Gi
zqwS3lX?Fkmj%*6-JQ8ia`$(tFUJ#ol59+HHQxhli%Jb#vc@r`6ZP-EsfP2S!rwy#d
z;DP`C{cFdu2M4~`pHtE1KsUc1?BTR?&fOjbQh0|PcMiZgx_Kq$bLD%;`Ig2_LE~#`
zt32~lMNxY>0%fnWEukW>mxkz+%3-S!oguE8u#MGzST8_Dy^#U?fA@S#K$<FiPyhgu
zzq^L^|GyXf(+AWxl#$gjesG=F>S@9msUiX!gd_ow>08w5)nX{-KxqMOo7d?k2&?Vf
z&diGDtZr(0cwPe9z9FAUSD9KC)7(n^lMWuayCfxzy8EZsns%OEblHFSzP=cL6}?J|
z0U$H!4S_<U3#TVDkQ!s%Ox_BnFc2H6iNU0q=!|+Z9mk`Nbw?whndl6tI(Fj=$tl!^
zIOq<7BPlTvwG$fSu#@_%M(FvF2tpewu1-c35x~(IN{;#g5`-28n}V56vUKDyHalgc
zVFs4DD7(uszamXg!+b}p?!s)SZXGtIED*Jww1@^#7%op*kD~rw8S#!ebx(C|Oi-ci
zN~c@b8rVJSYHe*Cyn5uEa+-wenYQT6aAn!pd*%?%r>TVjj<`6dy^2j`V`)mC;cB%*
z8{>_%E1^FH!*{>4a7*C1v>~1*@TMcLK{7nEQ!_igZC}ikJ$*<$yHy>7)oy79A~#xE
zWavoJOIOC$5b6*q*F_qN1>2#MY)AXVyr$6x4b=$x^*aqF*L?vmj>Mgv+|ITnw_BoW
zO?jwHvNy^prH{9$rrik1#fhyU^MpFqF2fYEt(;4`Q&XWOGDH8k6<goGkkqwiLLr*g
z5z6x8$sF`?<e^h`j@COy$E+qY=Oj=v!b*KAnIYc*AP7qC0C#dwjl&srabEh<e-#E9
zv&sl%zw~Grb~0?}V1_A9--Q~b&NxawFDL54EJtT<qO4Z~5p7>M=%@fics4ajI;st#
zCU^r1CK&|jzUhRMv;+W~6N;u<;#DI6cCw-<Kad;hI?@=R3TXw!Cr}=BbI5m+uEl-w
zqErRfdJ>otsc@IsN3MoSD^O`eNflIoR~l4*&-%RBYk@gb^|-JXs&~KuSEmMxB}xSb
z@K76cXD=Y|=I&SNC2E+>Zg?R6E%DGCH5J1nU!A|@eX9oS(WPaMm==k2s_ueCqdZw|
z&hqHp)47`c{BgwgvY2{xz%OIkY1xDwkw!<0veB#yF4ZKJyabhyyVS`gZepcFIk%e2
zTcrmt2@-8`7i-@5Nz>oQWFuMC_KlroCl(PLSodswHqJ3fn<;gxg9=}~3x_L3P`9Sn
zChIf}8vCHvTriz~T2~FamRi?rh?>3bX1j}%bLH+uFX+p&+^aXbOK7clZxdU~6Uxgy
z8R=obwO4dL%pmVo*Ktf=lH6hnl<C=NTH%5`gfxBHGJj`iNz(A#<w&(&isR(NdjRau
zf|I4<<<u=|XXZxd2JFGs>z_5k3cG;m8lgaPp~?eD!Yn2kf)tU6PF{kLyn|oI@eQ`F
z3IF7~Blqg8-uwUuWZScRKn%c2_}dXB6Dx_&xR*n9M9LXasJhtZdr$vBY!rP{c@=)&
z#!?L$2UrkvClwQO>U*fSMs67oSj2mxiJ$t;E|>q%Kh_GzzWWO&3;ufU%2z%ucBU8H
z3WIwr$n)cfCXR&>tyB7BcSInK>=ByZA%;cVEJhcg<#6N{aZC4>K41XF>ZgjG`z_u&
zGY?;Ad?-sgiOnI`oppF1o1Gurqbi*;#x2>+SSV6|1^G@ooVy<y93LDm;wIOTj=5c$
zC-QhzvAl0y_;%{)gWRy`;Bf=?TtTe*SY!MP@9W|edu{l86Kr1<Cmr((Tem3gt4{#y
z3D<WVud@WzG8_>@fg?wyf@0Y!UZ4!}nGuLeC^l)6pwkh|oRY`s1Pm$>zZ3u-83T|9
z<QZfPf@8vDvzJ5u4A^A<#$7q&=f7lp3&n5W!oLxA;ja+?=SVAJ?`~&fZ)ozb9P1k`
z3pL1q5VB*z+Ct?<9|-*itS69vS4hVra5Z!lDKSySn;jjmUpRtte+Bax7QXjI?`90S
zA4?c)l!1W6+}k;06I}~wRC@!%R<xI9L>GaKJIV3_x+u1>cRibsaJpJqhcm%?0-L;2
zitBrdRxNmb0OO2J%Y&Ym(6*`_P3&&5Bw157{o7LFguvxC$4&zTy#U=W*l&(Q2MNO}
zfaUwYm{<Z#B+303&8&j|1AYBZQ1ef~@-GzfzfBY|H8XUzarxJ|f|I?ulc}?_jHR=S
zshz3QKN3ud>XtILD$3864IA_nn34oVa_g^FRuHL5wdUd)+W-p-iWCKe8m_cMHk+=?
zeKX)M?Dt(|{r5t7IenkAXo%&EXIb-i^w+0CX0D=xApC=|Xy(`xy+QG^UyF<x+1@YN
z-ZMAue9y-N{P;V-w=mmGh-1)C76Xu!U?t<}ByuZz$q|bl4S<l@37Jh~Glu1WLl}(l
zthb2~6ne4Q83FI<<Ay4c4`8D(It&am2#(!Ony)app0o62Q+)KCL_LQOA)tF@50mMJ
z<<yYk$({rlv3BkbtJ+SuacOk#dTZz@Qrop4gQ~UO=spb=-piAtn0x3U*bMbJdwI>e
z+#J6h_&T5i#sV)hj3D4WN%z;2+jJcZxcI3*CHXGmOF3^)JD5j&wfX)e?-|V0GPuA+
zQFot%aEqGNJJHn$!_}#PaAvQ^{3-Ye7b}rWwrUmX53(|~i0v{}G_sI9uDch_brX&6
zWl5Ndj-AYg(W9CGfQf<6!YmY>Ey)+uYd<oGrQ)76s0B|$EDBcWW*bGf!XzhAcke;%
zrh(E+RSxuWf~{)Nc=B!JS!X`$2DA4*+k72=D%60mnI%QDRarAyE}--rmK`Z^V+Yae
z&vDIckd3^cYT8wuAh(uIoqyu^LYf=&(`lZI>_J<fvopl!qQMd6Z#ZYajSPWAp}`qN
zjM~mJhn3N452gfOT)3x7am8MT`AA+VSSTZ}v@@8Ef-MrsX1vV|`Qx}Yh}~|pD;qlf
zbIL{96T!~st%_2?)f30K4Wp&xR_;627NS?#a?ONO1)C^TbaxNEe^u!aMYPy7;-Hq3
z8kFD}u19!l+a?p#Hl9<7(I=E6f85Gdi%U@U>NXH=>|`OH-CDCmcH(0%iD_aLlNHKH
z7bcW-^5+QV$jK?R*)wZ>r9t}loM@XN&M-Pw=F#xn(;u3!(3SXXY^@=aoj70;_=QE9
zGghsG3e<X)8z!NOUQ-K7Tg*M~$*Lq1YL~>kq#N||u{4We_25U=y#T*S{4I{++Ku)>
zQ!DZW;pVcn>b;&g2;YE#+V`v*Bl&Y-i@X6D*OpNA{G@JAXho&aOk(_j^weW{#3X5Y
z%$q_wpb07EYPdmyH(1^09i$ca<gJ(BN8tBcV!2)N5jxRqNZX(-f?Oe`25b>{O<}7)
zRWncXdSPgBE%BM#by!E>tdnc$8RwUJg1*x($6$}ae$e9Knj8gvVZe#bLi!<+&BkFj
zg@nOpDneyc+hU9P-;jmOSMN|*H#>^Ez#?;%C3hg_65leSUm;iz)UkW)jX#p)e&S&M
z1|a?wDzV5NVnlhRBCd_;F87wp>6c<&nkgvC+!@KGiIqWY4l}=&1w7|r6{oBN8xyzh
zG$b#2=RJp_iq6)#t5%yLkKx(0@D=C3w+oiXtSuaQ%I1WIb-eiE$d~!)b@|4XLy!CZ
z9p=t=%3ad@Ep+<9003D2KZ5VyP~_n$=;~r&YUg5UZ0KVD&tR1DHy9x)qWtKJp#Kq#
zP*8p#W(8JJ_*h_3W}FlvRam?<4Z+<U)YVE|@-H{P0L;lgD=4M+K83d`J=;XZ6cLXJ
z^Pb^7ai96hX*$t^`}X?;F@T^K^_U|s%%#VBR44vL@CQr;#z>-H77^$Lvi+#vmhL9J
zJ<1SV45xi;SrO2f=-OB(7#iNA5)x1uNC-yNxUw|!00vcW2PufRm>e~<GrW@0xg&;k
zL?iysf{CN}V{o=!B5cYHv{+Y1t=R-R>toH;M0Q85MQLWd?3O{i8H+5VkR@l9Dg-ma
ze2fZ%>G(u5(k9EHj2L6!;(KZ8%8|*-1V|B#EagbF(rc+5iL_5;Eu)L4Z-V;0HfK4d
z*{utLse_rvHZeQ>V5H=f78M3Ntg1BPxFCVD{HbNA6?9*^YIq;B-DJd{Ca2L#)qWP?
zvX^NhFmX?CTWw&Ns}lgs;r3i+Bq@y}Ul+U%pzOS0Fcv9~aB(0!>GT0)NO?p=25LjN
z2bh>6RhgqD7bQj#k-KOm@JLgMa6>%-ok1WpOe<hhSU`7#vu9^7%H57&(`pARdXnZM
z;eION-Jf1cVd1-MCh?1mFC&L}n&D)2PEzUHkSiE_t+j1!E~K#qcYI-`hc5a+<MQay
z<1}>)FS^XOU{c?d5shG(lIn3GiVBxmg`u%-j=)^v&pX1JecJics3&jvPI)mDut52?
z3jEA)DM%}BYbxxKrizVYwq?(P&19EXlwD9^-6J+4!}9{ywR9Gk42jjAURAF&EO|~N
z)?s>$Da@ikI4|^z0e{r`J8zIs>SpM~Vn^{3fArRu;?+43>lD+^XtUcY1HidJwnR6+
z!;oG2=B6Z_=M%*{z-RaHc(n|1RTKQdNjjV!Pn9lFt^4w|AeN06*j}Zyhq<HLE#a+A
zJ{@Y`FHfEI5bv;8o$ZEGCqCOp4`6-5eeTUyKzzSKd~k>Z^!-=cyGP_ShV1rGxkx8t
zB;8`h!S{LD%ot``700d0@Grql(DTt4Awgmi+Yr0@#jbe=2#UkK%rv=OLqF)9D7D1j
z!~McAwMYkeaL$~kI~90)5vBhBzWYc3Cj1WI0RS`z000R8-@ET0dA~*r(gS<U)h`aD
zLE6Dn5Q+3O*c2B9elF29G1VrakVK+eBtDr^Gf}-iF&NZnZ$t<sha>iCJmQMN&4%1D
zyVNf0?}sBH8zNbBLn>~(W{d3%@kL_eQ6jEcR{l><V*L*c8i9aN^J3k5@LC4D{kHc*
ztiPGM*MYdYK5-%KXLXp`;*IAihHn*4Pn*cYN8uZA$oHJE-8(VkYwPa2amx3wu)oxf
z;@K?9yEDA%g1*No{aF{@JJ45(#kW@dSB%B?ig4GNH`G^-kpE2q*E={oZNZKA>C|JK
z(R-fA!z|TTRG40|zv}7E@PqCAXP3n`;%|SCQ|ZS%ym$I{`}t3KPL&^l5`3>yah4*6
zifO#{VNz3)?<E9P2g)v<M^QTEwJXCARf%0EK}z-2B8QN$*7-J-mD@w9d6fzs+FVv4
zrJ6y#^xsh;O&ToHBFf~MC54(~Sb2j}sCotWQt($NS~kJz72WH?<eo8;ekt}!o;CT~
zFlF~#@UZt@DeNAn7u||SULov|N<ot`RP2<Gm?2rbIJ;;m_J~Yu)ZL<c+N#@?_iQO`
zRo%izIfJ9RLf%!Awp2T%_jV~S<vv}L?&aO;G&bt(!h60lS`2vX)v@WhGFXzOtRK;c
zyr@kt8OV3(72@wS&Pz;*6SWo#Z2jYhpl6$h+sV9U!&ep>ZL$be;NEaAk9b#{tV?V7
zP|wf5YA*1;s<)9A4~l3BHzG&HH`1xNr#%){4xZ!jq%o=7nN*wMuXlFV{HaiQLJ`5G
zBhDi#D(m`Q1pLh@Tq+L;OwuC52RdW7b8}~60WCOK5iYMUad9}7aWBuILb({5=z~YF
zt<!zJa=c$qO%0sJrsowK;!t%<S=X7Y&Jxn+nGDY&f(=_-CL}c~4K|D7ux0QbLt&^f
zSr24bBRsz?=*UjQu7gZoZ09|ewuhGV`hI~v^swhOT82+78;Tj2xT>?*Jr<A3W3T-3
z<1&Nz$ui_+E7n$IbPK2Rrimn|(;7EPx1h6Fi)-3?>5NG+WadM{mDL>GyiByCuR)hd
zA=HM?J6l1Xv0Dl+LW@w$OTcEoOda^nFCw*Sy^I@$sSuneMl{4ys)|RY#9&NxW4S)9
zq|%83IpslTLoz~&vTo!Ga@?rj_kw{|k{nv+w&Ku?fyk4Ki4I?);M|5Axm)t+BaE)D
zm(`AQ#k^DWrjbuXoJf2{Aj<e%564WZLSS(Vd#ROb?}iRVX%`%#SRY^TSh{u=kI84e
z3Ow&PB(uhOFrXgEsH|%j)I-Ug(1-{oj6dcgiDH#{Xclz$DM?xphY_S^6Z_M-j<Q{M
z3m5`Vg`@HsKN#jbtysG34Af*<`=E}BAe7{&&983YEEQfM{J8#P!otJO%!Ug4=|diY
zavSVyOcEEa#lU9-1SEeUY5cJ=<Ds##5c*EUsRi@fN(`2}Y5bh+k`b9uFL%rwlbF%M
zdSW_2pP9rtReb`cMyUFd*B2fwpi645^+_wI6EzB|mef<{4KFd8`yOM9scKF$vrUM|
zCoNlGyffVj5PnZZrY&Y5n|uW=PHnHX<YXTSdl@8HE*;J7fEIy#D!|gwR}m6V5SwWq
z(a}Z41Prc!Wk&AE!j3s3VBd*3Lw?gbM3E&oKBI9kltN8F1T6wcC{`?P4s@QUU>^KT
zFb1zMSqxq|vceV+Mf-)$oPflsO$@*A0n0Z!R{&(xh8s}=;t(lIy<Pl3uQSpJ4S8Y>
zv$S8x>m;vQNHuRzoaOo?eiWFe{0;$s`Bc+Osz~}Van${u;g(su`3lJ^TEfo~nERfP
z)?aFzpDgnLYiER<rBlW-W;>sKPu<X=38r|I{0&8J8r0^`tKAAKA8jUsvBhopXc14T
zgUW&TYxF3i;mZl;v5E+2qh>|0tq4l2wT)Atr6Qb%m-AUn6HnCue*yWICp7TjW$@sO
zm5rm4aTcPQ(rfi7a`xP7cKCFrJD}*&_~xgLyr^-bmsL}y;A5P|al8J3WUoBSjqu%v
zxC;mK!g(7r6RRJ852Z~feoC&sD3(6}^5-uLK8o)9{8L_%%rItZK9C){UxB|;G>JbP
zsRRtS4-3B*5c+K2kvmgZK8472%l>3cntWUO<l-B0`McQr%h#X$@@JrQb8#@ppQUG9
z)`zI{d#E;id*aB>VHxB|{Ay~aOg5RN<wtviA9uMGSHMzxbdkCR`SfK34yQ^bEpjN~
zPf^Bz$`=daT8!L25oqT&4i=+dRE^=E#-D#x^n<}e%YqXyp_!ut)H)b@E0UPjTon!Z
zEyxi=e>;{PJgeVD*H%ac+y!h#wi%o2bF2Ca8IyMyH{>4#{E_8u^@+l-+n=V}Sq?$O
z{091@v%Bd*<G_U#2l9N@Y9~(ygCl-h&9i8Yj9_3cV?DM!lxf11kY_LYnfWL$a@7F3
zk0$$mP<ka_iUf&lMdUX}1{nJ-EWCBw#je~5Smwg)p`tEv>3pk0^2UtiF9Z+(a@wy6
zUdw8J*ze$K#=$48IBi1U%;hmhO>lu!uU;+RS}p&6@rQila7WftH->*A4=5W|Fmtze
z)7E}jh@cbmr9iup^i%*(uF%LG&!+Fyl@LFA-}Ca#bxRfDJAiR2dt6644TaYw1Ma79
zt8&DYj31j^5WPNf5P&{)J?WlCe@<3u^78wnd(Ja4^a>{^Tw}W>|Cjt^If|7l^l)^Q
zbz|7~CF(k_9~n|h;ysZ+jHzkXf<NcXr7lv4BLC0%brrpzF{-obsvQr%!_>(*O*@5m
zLzUmbHp=x!Q|!9NVXyipZ3)^GuIG$k;D)EK!a5=8MFLI_lpf`HPKl=-Ww%z8H_0$j
ztJ||IfFG1lE9nmQ<au0Ml%cF(=R}3nZqu!gn~ulcq{4`|I}2tuFXsdOn=eq9Qz{LW
zP^cfInuhA7QFIUcH%13E<=O+Vy|WwD8Y7tHV5z9{b#53O<eDk6+LD+O<*lHC*Sh-<
zQQ*d{QylAFdIrjkp%D|@{O_AHu^XIETj$pt=r^Iy${cPPEHt@N92tqR6yn>0+jPQy
zCBdKkjArH@K7jVcMNz);Q(Q^R{d5G?-kk;Uu_IXSyWB)~KGIizZL(^&qF;|1PI7!E
zTP`%l)gpX|OFn&)M%txpQ2F!hdA~hX1Cm5)IrdljqzRg!f{mN%G~H1&oqe`5eJCIF
zHdD7O;AX-{XEV(a`gBFJ9ews#CVS2y!&>Cm_dm3C8*n3MA*e67(WC?uP@8TXuMroq
z{#<dYp{{+xCUwQw6)w9(r^`^0<F6$P{m>w$%z@CBIkRM7?}Xib+>hRjy?%G!fiw8!
z8(gB+8J~KOU}yO7UGm&1g_MDJ$IXS!`+*b*QW2x)9>K<JF7b%^NJe_9Fs0_(5~kP{
zjLlh(Gb|iDD!%2k=};N1Ay-P7$Wu_KP|<Nv<px9DnR*Bhp`@^|=#?&n9#0)w-FDC^
zYYwq=gkfiuN_5CjVN9Ao)scz%pogt!6S`BY&a-LBZ0vR8Hs`uYWwn}-oY<_#H(3U4
z+C^OQlGq~g8s<Q&bZc>~Y*E&bYMnjl6h!{17_8d!%&9D`a7r&LKZjC<&XOvTRaKJ1
zUY@hl5^R&kZl3lU3njk`3dPzxj$2foOL26r(9zsVF3n_F#<NB6ex9*oBW}l?xfbxx
zL%E!#nXs+R%q=x9ZnSy_@kOHMCrx3@5&BuHZd_<OOwDuA?g-PK!m67qWTCZlm0wKU
zv#i7p-7isYt{yv)OR8I5o;!7xop*Zm6Rp~J_EdjQW~sUFuwoyaW1U&NaV|M4U1?nR
z@)EUG9kIA_lYfhiyvI%HWy;Ge%?fKvOXHCrMb%0xeab5(rJo188D&|PUo17GFJ{-P
zs?t#M)7w#0RHw{KaZQP;u<ZWKm`x0&oP9ITEjmgxyDYbPrK*U@Usmmw0c2<WSXv-G
zvb6FvJhqfc@v@g(G}B*mRq{2pTwn0X@yc3sxqx=5S4D&>v)s5vv3<FgSd?p+)!kNm
zTv$3*E<J6vu2~)lp%4nMdtWE@=#AViY%^W&8R2_<v(+Qa%Mf{$Tn~RH;PHc?^uW;B
zl33*<G2>@dgs|lP#eylq62{<-vczqP!RpVBTgI>@O6&sU>W|do17+#OzQ7o5A$ICH
z?GqwqnK^n2%LR;$^oZM;)+>$X3s2n}2jZ7CdWIW0lnGK-<n4*9i9?XprV!b-2yz1y
z82;&r=GhqHm&zBz3e~(LKRa(8ZwPt%3TYt5`IuWWTX+)iO!EB5qiR4W$RDCd`g{-j
zZuyRBvdZnk><gEOlAyZgBQiT7{>b#EG01)P@aU`pg}th&J-TrU`tIpb5t((0eu|!u
zQz+3ZiOQ^?RxxK4;zs=l8q!-n7X{@jSwK(iqNFiRColuEOg}!7cyZi`iBX4g1pNBj
zAPzL?P^Ljhn;1$r8?bc=#n|Ed7wB&oHcw()&*k#SS#h}jO?ZB246EGItsz*;^&tzp
zu^YJ0=lwsi`eP_pU8}6JA7MS<SY@ZXteVGu2D;{e_dFnzL(OQbWI#IL1J(@}vxKG~
z{>;9pfD;DsSsLo~ogzMNP70@@;Fm8f0^;>$Z>~<d_jp#pvX)$*IF^kRoeC(j+t)Ur
zPx*FO4h)ZG=iqY;QK6W@ekh%ILEiXbr05ou^~><kZopq=@-e)e2pdVj4ZJKNAd~~=
zIWD3^k4g6+ly88l1dUP7-<u$%X=hfWc{o8z>}GWRw!W5J3tNX*^2+1f3hz{~rIzJo
z6W%J(H!g-eI_J1>0juX$X4Cl6i+3wbc~k146UIX&G22}WE<iA2ePZPFMn_vlC3@`(
zC;LX5HFTJ@i$J`Hbx0sRaj^uiUmU^s{mchNU>>0ga#WLsn9tY(&29zBvH1$`iWtTe
zG2jYl@P!P)eb<5DsR72BdI7-zP&cZNI{7q3e@?N8IKc4DE#UVr->|-ryuJXk^u^>4
z$3wE~=q390;XuOQP~TNoDR?#|NSPJ%sTMInA6*rJ%go|=YjGe!B>z6u$IhgQSwoV*
zjy3F2#I>uK{42{&IqP59)Y(1*Z>>#W8rCf4_eVsH)`v!P#^;BgzKDR`ARGEZzkNX+
zJUQu=*-ol=Xqqt5=`=pA@BIn@6a9G8C{c&`i^(i+BxQO9?YZ3iu%$$da&Kb?2kCCo
zo7t$UpSFWqmydXf@l3bVJ=%K?SSw)|?srhJ-1ZdFu*5QhL$~-IQS!K1s@XzAtv6*Y
zl8@(5BlWYL<vlUo^g==w+bwzmrMIfe1OGTk=p=b_r!$VPWnqI4+K-Q0Kn&YY_INtV
ze5P(@!H@m0AM%dGQN5FSnZ`He_+j(1bNGu}ee2>t1yAWy?rMD&bwze8bC3-GfNH=p
zynNFCdxyX?K&G(ZZ)afguQ2|<I|ToEQs5dh02L^JB6>r;XoV^=^(;Cku#qYn4<V}b
zA&FlaPRwlltsTXe9~a58?uH)L#hLx*;|`>Lus`UeKt6rAlFo_rU`|Rq<F_mt<XG?>
z&G?~iWMB<P-m~>io<78of-2X(ZYHx~=U0Vz4btyXkctMKdc9UM!vYr~B-(>)(Hc|D
zMzkN4!PBg%tZoh+=Gba!0++d193gbMk2&krfDgcbx0jI92cq?FFESVg0D$>F+bil}
zY~$)|>1HZsX=5sAZ2WgPB5P=8X#TI+NQ(M~GqyVB53c6IdX=k>Wu@A0Svf5#?uHaF
zsYn|koIi3$(%GZ2+G+7Fv^lHTb#5b8sAHSTnL^qWZLM<(1|9|QFw9pnRU{svj}_Al
zL)b9>fN{QiA($8peNEJyy`(a{&uh-T4_kdZFIVsKKVM(?05}76E<BOuTKNZ>Ez?#W
za^fiZOAd14IJ4zLX-n7Lq0qlQ^lW8Cvz4U<X+>KkV9~P}>sq0?xD3vg+$4vLm~C(+
zM{-3Z#qnZ09bJ>}j?6ry^h+@PfaD7*jZxBEY4)UG&daWb??6)TP+|3#Z&?GL?<NWw
zpo!n^_Mnx`#l`Mtp<H71ndq}LH(ZXJlzYuNAHINUglm=WWXJLtuPRY2?rIWUs=h@;
z1r9;ol65y`xO*SzvRa8o!D+_s;~<Bb-h5u71zg5ymFjIboenFkEakiwkF#l0c-sut
znM=(poO3Xq&}pnDV!O6|+_sscTBqqe%{pj$E^ku;b!#-zj>1i+280CFsE|vIXQbm|
zM}Pk!U`U5NsNbyKzkrul-DzwB{X?n3E6?TUHr{M&+R*2%yOiXdW-_2Yd6?38M9Vy^
z*lE%gA{wwoSR~vN0=no}tP2Ul5Gk5M(Xq`<W>$nw#ndFk`tcpd5A=Idue`XZ!FS>Q
zG^0w#>P4pPG<EP*xHW9WCTGn>+*NC9gLP4x2m=cKP}YuS!l^?sHSFf<ssGXChcP9X
za)*XSF8xSAX6O&AkMIyXtB<sR`J2gFTR8ck#bl<tOy9ZU%W%Jooi$>tZy{4CoQrb_
z^20(NnG`wAhMI=eq)SsIE~&Gp9Ne0nD4%Xiu|0Fj1UFk?6avDqjdXz{O1nKao*46y
zT8~iA%Exu=G#{x=KD;_C&M+Zx4+n`sHT>^>=-1YM;H<72k>$py1?F3#T1*ef9mLZw
z5naLQr?n7K;2l+{_uIw*_1nsTn~I|kkCgrn;|G~##hM;9l7Jy$yJfmk+&}W@JeKcF
zx@@Woiz8qdi|D%aH3XTx5*wDlbs?dC1_nrFpm^QbG@wM=i2?Zg;$VK!c^Dp8<}BTI
zyRhAq@#%2pGV49*Y5_mV4+OICP|%I(dQ7x=6Ob}>EjnB_-_18*xrY?b%-yEDT(wrO
z9RY2QT0`_OpGfMObKHV;QLVnrK%mc?$WAdIT`kJQT^n%GuzE7|9@k3ci5fYOh(287
zuIbg!GB3xLg$YN=n)^pHGB0jH+_iIiC=nUcD;G6LuJsjn2VI1cyZx=a?ShCsF==QK
z;q~*m&}L<-cb+mDDXzv<F>vrRsybcgQ;Vg21P(uLv5I+eGc7o7tc6`;OA9{soHFOz
zT~2?>Ts}gprIX$wRBb4yE>ot<8+*Bv`qbSDv*VtRi|cyWS>)Fjs>fkNOH-+PX&4(~
z&)T8Zam2L6puQl?;5zg9h<}k4#|yH9czHw;1jw-pwBM*O2hUR6yvHATrI%^mvs9q_
z&ccT0>f#eDG<^WG^q@oVqlJrhxH)dcq2cty@l3~|5#UDdExyXUmLQ}f4#;6fI{f^t
zDCsgIJ~0`af%YR%Ma<z>5VQq-p21k`vaBu6WE?66+5=XUd%Ay%D$irN>5LhluRWt7
zov-=f>QbMk*G##&DTQyou$s7UqjjW@k6=!I@!k+S{pP8R(2=e@io;N8E`EOB;OGoI
zw6Q+{X1_I{OO0HPpBz!X!@`5YQ2)t{+!?M_iH25X(d~-Zx~cXnS9z>u?+If|iNJbx
zyFU2d1!ITX64D|lE0Z{dLRqL1Ajj=CCMfC4lD3&mYR_R_VZ>_7_~|<^o*%_&jevU+
zQ4|qzci=0}Jydw|LXLCrO<tY^5#SIaz%jR-X`&*7)+bTVSc1BE2BLn%vVH`Mbz1R-
z>l1_P6Xf@c0$ieK2^7@A9UbF{@V_0p%lqW|L?5k>bVM8|p5v&2g;~r>B8uo<4N+`B
zH{J)h;SYiIVx@#jI&p-v3dwL5QNV1oxPr8J%ooezTnLW>i*3Isb49%5i!&ac_dEXv
zvXmVUck^QHmyrF8>CGXijC_R-y(Qr{3Zt~EmW)-nC!tiH`wlw5D*W7Pip;T?&j%kX
z6DkZX4&}iw>hE(boLyjOoupf6JpvBG8}jIh!!VhnD0>}KSMMo{1#uU6kiFcA04~|7
zVO8eI&x1`g4CZ<2cYUI(n#wz2MtVFHx47yE5eL~8bot~>EHbevSt}LLMQX?odD{Ux
zJMnam{d)W4da{l7&y-JrgiU~qY3$~}_F#G7|MxT)e;G{U`In&?`j<5D->}cb{}<s=
z)!EcZ!rs<Y#@^oA)#0D)Gvy7r4FLpRXcD*RfTd(<C=@A5lICI1^#y3rLIVLJon(Px
z9#Jw)(4Yq5v4TSV<tUJH3ExZMzKTk&i(qL2_(Map=flfs&WkPnAHQ!Ph9FQ-#b`+n
zGGm<qkbNX1D53P^JDqBMk-0!hNJ&trQIk`mzGOz)`{-cJ&~H;?Q%CleBz;+Wy0Yj`
zyHSagKdo#qV6?6Vcv+pcT%^1Q-l@v({R}QyX*+WEw%BJmI(}Q@j4m`CawF`x{MPHL
za$h&MF~4nMuyWgQrt}RgV#pg|Y^CiI*j!3z!tB-Jp4;1uuh(==9iU5dSb3$ZFATE!
z>{T(4DF0BOk<QYr>-=1195KB-E*o@c?`>y#4=dMtYtSY=&L{!TAjFVcq0y@AH`vH!
z$41+u!Ld&}F^COPgL(EE{0X7LY&%D7-(?!kjFF7=qw<;`V{nwWBq<)1QiGJgUc^Vz
ztMUlq1bZqKn17|6x6iAHbWc~l1HcmAxr%$Puv!znW)!JiukwIrqQ00|H$Z)OmGG@=
zv%A8*4cq}(?qn4rN6o`$Y))(MyXr8R<2S^J+v(wmFmtac!%VOfN?&(8Nr!T@kV`N;
z*Q33V3t`^rN&aBiHet)18wy{*wi1=W!B%B-Q6}SCrUl$~Hl{@!95ydml@FK8P=u4s
z4e*7gV2s=YxEvskw2Ju!2%{8h01<bQ)d%wXmu1Zpj%~-&jsiWxq+->rx-3`NCPc(O
zH&J0VH5etNB2KY6k4R@2Wvl^Ck$MoR3=)|SEclT2ccJ!RI9Nuter7u9@;<u*AgOz(
zn=C13+36e?Wp?2OIME<hmtjrXzJ<Zd*?$>sWf-%um;GfI!=eEIQ2l2p_YWUd{|6EG
ze{yO6;lMc>;2tPrsNdi@&1K6(1;|$xe8vLgiouj%QD%gYk`4p{Ktv9|j+!OF-P?@p
z;}SV|<w!@0*CQ5@xp9@`8c_*)IC@^rAGd{(#wPf?$`(^V&!%1qI&#?UztvBAF!4M;
z_oxBXB0!;X3yhd^D}+Xx4sUHZH*0n|si;UgfM!*1c|d1h4nY076_94CJP`FR$D}_!
zDgwP#mZV0tbmF7vmG7Log$Afqr(GuMl<urHsSR(EhO7^7wNPIUT%rDwjj%sGil746
zDLtAZLp-7)K|QJh+bT3@0I$b@q3|9LuBZk*!Xn-Gb?+~>oIK)iwlBs+`ROXkhd&NK
zzo__r!B>tOXpBJMDcv!Mq54P+n4(@dijL^EpO1wdg~q+!DT3lB<>9AA<tf}r`cy*Y
zjhdtI5OMNT6H0#L@X?3Sm%kGA7Vl5JMh4bZuEy3uPM@!CETCEPH`bN;-XzRi=Uj<*
zy1%%&-XKAU$eorwmA2>NSe!T1XgC=J^)IP0XEZ()_vpu!!3HQyJhwh?r`Ae%Yr~b%
zO*NY9t9#qWa@GCPYOF9aron7thfWT`eujS4`t2uG6)~JRTI;f(ZuoRQwjZjp5Pg34
z)rp$)Kr?R+KdJ;IO;pM{$6|2y<ydqUT>=k_siqvp%)2||cHTe|b5Ht8&A{wazGNca
zX$Ol?H)E_R@SDi~4{d-|8nGFhZPW;Cts1;08TwUvLLv&_2$O6Vt=M)X;g%HUr$&06
zISZb(6)Q3%?;3r~*3~USIg=HcJhFtHhIV(siOwV&QkQe#J%H9&E21!C*d@ln3E@J*
zVqRO^<)V^ky-R|%{(9`l-(JXq9J)1r$`uQ8a}$vr9E^nNiI*thK8=&UZ0dsFN_eSl
z(q~lnD?EymWLsNa3|1{CRPW60>DSkY9YQ;$4o3W7Ms&@&lv9eH!tk~N&dhqX&>K@}
zi1g~GqglxkZ5pEFkllJ)Ta<O6(=j@N_YA4+m(2{`eegY!ZQ2^vALR<RgI6}@oJ;#Q
znz8Q+-c~%`4rP2NC%pl75Va8USgVLiZL*d#F+vsZ>1I^c&Bt6#r(QLQ02yHTaJB~-
zCcE=5tmi`UA>@P=1LBfBiqk)HB4t8D?02;9eXj~kVPwv?m{5&!&TFY<knKM4n)!Ph
zud#tQR<C%y^0~@DM`a6)LueXb{y5yQ{QdB(pAh_Nx5%(@`(@LGcfv~*Wnh>hu>3=_
zsGmYZ^mo*-j69-42y&Jj0cBLLEulNRZ9vXE)8~mt9C#;tZs;=#M=1*hebkS;7(aGf
zcs7zH(I8Eui9UU4L--))yy<O?Y9%&u#eL}xwSxY`c_7W7JcW|iGpGkga)aNR1NtnV
zsQ!z$?wFhYyP2W>`&d&$In&VA2?DAEss4LAPCLd>-$i?lpXvn!gu^JJ$(DoUlc<UX
zkfN^{^6VUfk4qkMm$Kwn8iV7sNct*PLMd9WvC5vZDv+`U&Q-ZGnQrAf`3aDr3@9ll
zcH!Spxa>6wE98VLZ*z`QGQov5l4Fm_h?V-;mHLYDVOwKz7>e4+%AzeO>P6v}ndPW|
zM>m#6Tnp7K?0mbK=>gV}=@k*0Mr_PVAgGMu$j+pWxzq4MAa&jpCDU&-5eH27Iz>m^
zax1?*HhG%pJ((tkR(V(O(L%7v7L%!_X->IjS3H5kuXQT2!ow(;%FDE>16&3r){!ex
zhf==oJ!}YU89C9@mfDq!P3S4yx$aGB?rbtVH?sHpg?J5C->!_FHM%Hl3#D4eplxzQ
zRA+<@LD%LKSkTk2NyWCg7u=$%F#;SIL44~S_OGR}JqX}X+=bc@swpiClB`Zbz|f!4
z7Ysah7OkR8liXfI`}IIwtEoL}(URrGe;IM8%{>b1SsqXh)~w}P>yiFRaE>}rEnNkT
z!HXZUtxUp1NmFm)Dm@-{FI^aRQqpSkz}ZSyKR%Y}Y<Q&WCZ#r6E1BX>HNzBk)ZIp}
zMtS=aMvkgWKm9&oTcU0?S|L~CDqA+sHpOxwnswF-fEG)cXCzUR?ps@tZa$=O)=L+5
zf%m58cq8g_o}3?Bhh+c!w4(7AjxwQ3>WnVi<{{38g7yFboo>q|+7qs<$8CPXUFAN<
zG&}BHbbyQ5n|qqSr?U~GY{@GJ{(Jny{bMaOG{|IkUj7tj^9pa9|FB_<+KHLxSxR;@
zHpS$4V)PP+tx}22fWx(Ku9y+}Ap;VZqD0AZW4gCDTPCG=zgJmF{|x;(rvdM<f^n;1
zK8nxqYR0vKH0j_lDlpPKZ&MGwB{uMn`Y?-pX_!Y3B)-jRTo&O^8pthSIE0&!0lK-e
z!icB)w$6c%`*wHe1Fz?U<?TZjyK8JuG0UC$ZCke^hB6%|%?F1X$1uXP^O6o2PCxRi
zb_5>|2|9a}cex6xrMkERnkE;}jvU-kmzd%_J50$M`lIPCKf+^*zL=@LW`1SaEc%=m
zQ+lT06Gw+wVwvQ9fZ~#qd430v2HndFsBa9WjD0P}K(rZYdAt^5WQIvb%D^Q|pkVE^
z<umx#lsJRB+%c;<9rnPd8EZLPtdXF8y%S2H@ouSr1qOSZk)i$^f3$XJg9S<yWHU+7
zyJJ%uaeqUu9^B#_n2Ir_nIYz}G3SZ_C|5(`c6Umw#_yBgS{UuH<_&oA!pzsBV%g@=
zbOzA1`OA)5@fUbFgE?}SVsAWLGVW?bLByY#LXc_UE*PeNYf={+;y-vKh@$+ue#tsu
zvCKGNIefnaDUw~m>te$&#~zmULFACGfS#g=2OLOnIf2Of-k!(BIHjs77nr!5Q1*I9
z1%?=~#Oss!rV~?-6Gm~BWJiA4mJ5TY&iPm_$)H1_rTltuU1F3<q3^BkZ&_=uH2O^K
z_@P51{S(`LSn2c$B(%yIo=USrBfbZd`d}}bPg&0tq#f2!iBZc9Z>I(qTQ^U$S>%$l
z)Wx1}R?ij0idp@8w-p!Oz{&*W;v*IA;JFHA9%nUvVDy7Q8woheC#|8QuDZb-L_5@R
zOqHwrh|mVL9b=+$nJxM`3eE{O$sCt$UK<z2Kci>^2@L$R(r^-_+z?lOo+me-VW=Zw
z-Bn>$4ovfWd%S<yVaGCiuR4}{b|0tZrdZYdj>PY`ab-u9{INc*k2h+yH%toDHIyqQ
zO68=u`N}RIIs7lsn1D){)~%>ByF<>i@qFb<-axvu(Z+6t7v<^z&gm9McRB~BIaDn$
z#xSGT!rzgad8o>~kyj#h1?7g96tOcCJniQ+*#=b7wPio>|6a1Z?_(TS{)KrPe}(8j
z!#&A=k(&Pj^F;r)CI=Z{LVu>uj!_W1q4b`N1}<u;U_b=ty|*S;DXz*F-)!)F0Pv+Q
zRm=!T^zTn*A6)$bH1cl>E(i%;BWjbEcnD=mv$FL$l?zS6bW!{$7j1GR5ocn94P2u{
z70tAAcpqtQo<@cXw~@i-@6B23;317|l~S>CB?hR5qJ%J3EFgyBdJd^fH<ao26B*v)
zGUaiB1_W^rk+d9W+h~_tj2D}FfPY~B-BL~)lzp|oFVck~{r8sIIlCCz*!+vHo}=OE
zgW`_*^W8W`lLWY+AcSs_rDfwxzeg23BqYRWi$p*e3{sqP3719K#C&l{6X2y_TO;0c
zk>Zu7AzHF(BQ!tyAz<BOKd)9J&U=CXtSstlZ^pj1MMKG$H~T%~{<Zzl`|=?>^L0`X
z23S4Fe{2X$W0$zu9gm%rg~A>ijaE#GlYlrF9$ds^QtaszE#4M(OLVP2O-;XdT(XIC
zatwzF*)1c+t~c{L=fMG8Z=k5lv>U0;C{caN1NItnuSMp)6G3mbahu>E#sj&oy94KC
zpH}8oEw{G@N3pvHhp{^-YaZeH;K+T_1AUv;IKD<=mv^&Ueegrb!yf`4VlRl$M?wsl
zZyFol(2|_QM`e_2lYSABpKR{{NlxlDSYQNkS;J66aT#MSiTx~;tUmvs-b*CrR4w=f
z8+0;*th6kfZ3|5!Icx3RV11sp=?`0Jy3Fs0N4GZQMN=8HmT6%x9@{Dza)k}UwL6JT
zHRDh;%!XwXr6yuuy`4;Xsn0zlR$k%r%9abS1;_v?`HX_hI|+EibVnlyE@3aL5vhQq
zlIG?tN^w@0(v9M*&L+{_+RQZw=o|&BRPGB>e5=ys7H`nc8nx)|-g;s7mRc7hg{GJC
zAe^vCIJhaj<rU9d-Ny$?&}6qolDp7MDNB|ftJI`wi&EhaNhHyV!qQfb_Q>mm7C6g!
zL&!WAQ~5d_5)00?w_*|*H>3$loHrvFbitw#WvLB!JASO?#5Ig5$Ys10n>e4|3d;tS
zELJ0|R4n3Az(Fl3-r^QiV_C;)lQ1_CW{5bKS15U|E9?ZgLec@%kXr84>5jV2a5v=w
z?pB1GPdxD$IQL4)G||B_lI+A=08MUFFR4MxfGOu07vfIm+j=z9tp~5i_6jb`tR>qV
z$#`=BQ*jpCjm$F<t%->0+F)L%xRlnS%#&gro6PiRfu^l!EVan|r3y}AHJQOORGx4~
z&<)3=K-tx518DZyp%|!EqpU!+X3Et7n2AaC5(AtrkW>_57i}$eqs$rupubg0a1+WO
zGHZ<ibB|u&Jk?tUGE|?~tl5Wk^jlF-{lPR;A5i_2TUJp0F;38(es)rx!d-0-m4P-!
z$~|tV-l!W$kj%u&D~eY>KLN2L0D;ab%{_S1Pl<uJj0^JDir_rTS5CizT^_%RU3Cwc
zfrHnUz@7T<9U{4O%SD*qhHiuSo|}zv3Hju=+>m|hx8R?O14*w*f&2&bB050n!R2by
zw!@XOQx$SqZ5I<(Qu$V6g>o#A!JVwErWv#(Pjx=KeS0@hxr4?13zj#oWwPS(7Ro|v
z>Mp@Kmxo79q|}!5qtX2-O@U&&@6s~!I&)1WQIl?lTnh6UdKT_1R640S4~f=_xoN3-
zI+O)$R@RjV$F=>Ti7BlnG1-cFKCC(<Cu6MUX_IBo_X35UX_<48O%CsD25V#~38R@v
zrtYIkXvf1CPBiwCyX%=srhV;xX%`i(ICDDA6?ULc$>t|Qjm{SalS~V-t<tWc<BV1;
zl_jTzT9WFjJ~QeHdyK~l8BJDMnzlnWR?wVpES%e_)7o6AC7w~n8DW>X#+2ekRhwmN
zZr`8{QF6y~Z!D|{=1*2D-JUa<(1Z=;!Ei!KiRNH?o{p5o3crFF=_pX9O-YyJchr$~
zRC`+G+8kx~fD2k*ZIiiIGR<8r&M@3H?%JVOfE>)})7ScOd&?OjgAGT@WVNSCZ8N(p
zuQ<bOx+QY5+%zcISi3vNjYInP2!bhID0f~o9vf+-rhLL{z1^Ck(#`qsgx*#5swyXT
zW>G~76GE3%(%h1*vUXg$vH{ua0b`sQ4f0*y=u~lgyb^!#CcPJa2mkSEHGLsnO^kb$
zru5_l#nu=Y{rSMWiYx?nO{8I!gH+?wEj~UM?IrG}E|bRIBUM>UlY<`T1EHpRr36vv
zBi&dG8oxS|J$!zoaq{+JpJy+O^W(nt*|#g32bd&K^w-t>!Vu9N!k9eA8r!Xc{utY>
zg9aZ(D2E0gL#W0MdjwES-7~Wa8iubPrd?8-$C4BP?*wok&O8+ykOx{P=Izx+G~hM8
z*9?BYz!T8~dzcZr#ux8kS7u7r@A#DogBH8km8Ry4slyie^n|GrTbO|cLhpqgMdsjX
zJ_LdmM<k-VT(PJj^o5EU9N}k~N8WtTC-IGZ`@y!j$sb^<J0;G@Qw2FyCS!3I_NA$4
z1f*^zNnJ-2I0{osRtQts^V?d(XVo8U_KxuKrC4_(t;qw3T5EIb1gaTiFo3yT&HyFL
z)1@c>#I&4LqqsOUIXK8gW;V0B(7^$y#h3h>J0k^WJfAMeYek%Y-Dcb_+0zPJez!GM
zAmJ1u;*rK=FNM0Nf}Y!!P9c4)HIkMnq^b;JFd!S3?_Qi2G#LIQ)TF|iHl~WKK6JmK
zbv7rPE6VkYr_%_BT}CK8h=?%pk@3cz(UrZ{@h40%XgT<XHMp$hv;h9Ymj#=odkk`i
zIQae`_=X#DiyU&N400<DsTZGknpd*i59`VZ(hlvyF={mwt^rqfpTI2&<T<g_&ag;)
zGmdv3$7_Umm3*d1pC<tD4&^z@fB#qeFKt~J{6LxaG(+}hgg|*Exu--o{)yT#JuFbk
z1;57xTR<^av=7#O$H&Fm{_yrRCu{yYJoJ}c0L{yx#X=<de!)a1JlNVp083z$!k1?|
zV)^ps1(-LWXGsXjQ$H~1;~fb%PNBhFP?}VP6a;UunlMt9FKF=Z7zjl{Vm5wAE|rFq
z1-k1JTp~NHKqro%r!pzCx_v|wc?QzZb;2~gF~c_e(7kI?gTf@Sl;&2zZGHBdXMu10
ze@7*&Fo`5Se_O_wf2XJaP0LvFpQz-YM_~(7W9xrFv6N(8d1OJ9&>hP*-Oeo`T0eq9
zA8BnWZKzCy5e&&_GEsU4*;_k}(8l_&al5K-V*B<vk;3_5Y5GsaQRXITZ7En5+1cr}
zoEOfUwTvx(fB#P)g+XFq$3s&MIR%RGTY*w)u1F^x)_4KQF{~iPr`KL;JvOUA!)Bl1
z9p@=a4SF1Po>FM=O~;MgRkYsOs%9eOY6s6AtE*<7GQAR2ulC3RAJrG_P1iQK5Z~&B
z&f8X<>yJV6)oDGIlS$Y*D^Rj(cszTy5c81a5IwBr`BtnC6_e`ArI8CaTX_%rx7;cn
zR-0?J_LFg*?(#n~G8cXut(1nV<GD8u>F0Oka$A<Xrs+39Fcy_UX4+wJsyL;Ad#|W_
zoZzvm=HX_}HyBFXw08LR4`!>$1FGcERU<^ggx;p@CZc?3UB41RY+wLS`LWFNSs~YP
zuw1@DNN3lTd|jDL7gjBsd9}wIw}4xT2+8dBQzI00m<@?c2L%>}QLfK5%r!a-iII`p
zX@`VEUH)uj^$;7jVUYdADQ2k*!1O3WdfgF?OMtUXNpQ1}QINamBTKDuv19^{$`8A1
zeq%q*O0mi@(%sZU>Xdb0Ru96CFqk9-L3pzLVsMQ`Xpa~N6CR{9Rm2)A|CI21L(%GW
zh&)Y$BNHa=FD+=mBw3{qTgw)j0b!Eahs!rZnpu)z!!E$*eXE~##yaXz`KE5(nQM`s
zD<pRzTt9Ga7_+7V*(vgeNjPLq#T#Hzh4oMyk4m^&mDHa-;LXM`BMlpNPVXZiWB!7-
zsrLYk0v?{Pinwui>!$vW9XH)iMxu9R>r$VlLk9oIR%HxpUiW=BK@4U)|1WNQ=mz9a
z^!KkO=>GaJ!GBXm{KJj^;kh-MkUlEQ%lza`-G&}C5y1>La1sR6hT=d*NeCnuK%_LV
zOXt$}iP6(YJKc<sy3IAHEj2Y-R)3MS?rC66If(_;`nr~Onw70}P1hEBm+!itUy1C`
zNpHpki6_MC$7{&PcGt_M^XxtUNv`)v*iXj|1|scVAGjs`iL^4oZ_EXmgi;5b%!&n+
ziIZl66eo#;Grax0|H0Th2FKohS;Mhy+qQFJ+qP}z#I|iaIk9cqPEKq)Z~pVlJTr68
zJXP<9U-g%+tE>9j-Fxq~*ItVUqljQ8?oaysB-EYtFQp9oxZ|5m0^Hq(qV!S+hq#g(
z?|i*H2MIr^Kxgz+3vIljQ*Feejy6S4v~jKEPTF~Qhq!(ms5>NGtRgO5vfPPc4Z^AM
zTj!`5xEreIN)vaNxa|q6qWdg>+T`Ol0Uz)ckXBXEGvPNEL3R8hB3=C5`@=SYgAju1
z!)UBr{2~=~xa{b8>x2@C7weRAEuatC)3pkRhT#pMPTpSbA|tan%U7NGMvzmF?c!V8
z=pEWxbdXbTAGtWTyI?Fml%lEr-^AE}w#l(<7OIw;ctw}imYax&vR4UYNJZK6P7ZOd
zP87XfhnUHxCUHhM@b*NbTi#<F%^Z|eYk;z2_lk&RVh7s<F5xO2t#&+J(_Y0MwZqrY
zicy(G9z6&?{09>(-8|wcv%3BGNs#zRCVV(W?1Qj6^PPQa<{yaBwZ`+<`w|;rqUY_C
z&AeyKwwf*q#OW-F()lir=T^<^wjK65Lif$puuU5+tk$;e_EJ;Lu+pH>=-8=PDhkBg
z8cWt%@$Sc#C6F$Vd+0507;{OOyT7Hs%nKS88q-W!$f~9*WGBpHGgNp}=C*7!RiZ5s
zn1L_DbKF@B8kwhDiLKRB@lsXVVLK|ph=w%_`#owlf@s@V(pa`GY$8h%;-#h@<I6M_
zmQBeLGoxDbjVm1w<bC@g^^f(C!^JL|q^~JGpu4s;6B8qK+5^Y5Qq9@Z$yi1AO_ivY
zEj2e_U?RYgY($+y4MYyPw@#Znh;BBeSCVn{gx>TsO|Y8V=n@*!Rog7<7Cid%apR|x
zOjhHCy<N%=9>fbIt%+*PCveTEcuiDi%Wx;O;+K=W?OF<?k#V0K7HZbJn*d3I+eoH#
zh7<l$XDBh2e3X@S)0xR1RL*X3v}anZNmTRLP100Vjg6lZsj8M4WV<xI)QGou52W;l
zuhA|fVCliO=-1Jr275Sx<Y+M^NR?|nFbw_$)30P*nj@^6+|ZdgJZ18l!}qu2UJUSH
z1R0|V6_~e{(m?E}wM3qUG_Q$t&XB<F<iQ0B?7yghy}e3`PL?Jnn_FH7zs7FvV}ZNC
zg-k1nj;~OaAd0bVj&A17jIs@8j;@`1Da87%aS+QW)3w-0g!{V<Z8Jk0K;t-e-m_}H
z#$DpTS=ZJ!079fqU)_vsqe{Hxz{t*PFrYaR_X{Idl~pteGXU-!HXH+n-Nm@~!Nn|q
zjK^|6Bi^u9r}}mzbV)k27Kxg2T1S^j^-m&I%;>UV%)%~6;gl?<0%)?snDDqIvkHF{
zyI02)+lI9ov42^hL>ZRrh*HhjF9B$A@=H94iaBESBF=eC_KT$8A@uB^6$~o?3W<f9
zYtSca#$B|92$2MdaDc&Gb6a^9xjGKojcdPg>m5t1OIaqF^~><2?4e3c&)@wKn9bD?
zoeCs;H>b8DL^F&>Xw-xjZEUFFTv>JD^O#1E#)CMBaG4DX9bD(Wtc8Rzq}9soQ8`jf
zeSnHOL}<+WVSKp4kkq&?SbETjq6yr@4%SAqOG=9E(3YeLG9dtV+8vmzq+6PFPk{L;
z(&d++iu=^F%b+ea$i2UeTC{R*0Isk;vFK!no<;L+(`y`3&H-~VTdKROkdyowo1iqR
zbVW(3`+(PQ2>TKY>N!jGm<sD6AnGF=)0Re*7@0HIc_nG=Iw;>Go7oeoB8O|P_!Ic@
zZ^;3dnuXo;WJ?S+)%P>{Hcg!Jz#2SI(s&dY4QAy_vRlmOh)QHvs_7c&zkJCmJGVvV
zX;Mtb>QE+xp`KyciG$Cn*0?AK%-a|=o!+7x&&yzHQOS>8=B*R=niSnta^Pxp1`=md
z#;$pS$4WCT?mbiCYU?FcHGZ#)kHVJTTBt^%XE(Q};aaO=Zik0UgLcc0I(tUpt(>|&
zcxB_|fxCF7>&~5eJ=Dpn&5Aj{A^cV^^}(7w#p;HG&Q)EaN~~EqrE1qKrMAc&W<U9$
z+^)SZeHD%7BgO}J?hdzGer@n(wj6EYe>XIE;>@<&)5;gD2?={Xf@Mvn@OJKw=8Mgn
z!JUFMwD+s==JpjhroT&d{$kQAy%+d`a*XxDEVxy3`NHzmITrE`o!;5ClXNPb4t*8P
zzAivdr{j_v!=9!^?T3y?gzmqDWX6mkzhIzJ-3S{T5bcCFMr&RPDryMcdwbBuZbsgN
zGrp@^i?rcfN7v0NKGzDPGE#4yszxu=I_`MI%Z|10nFjU-UjQXXA?k8Pk|OE<(?ae)
zE%vG#eZAlj*E7_3dx#Zz4kMLj>H^;}33UAankJiDy5ZvEhrjr`!9eMD8COp}U*hP+
zF}KIYx@pkccIgyxFm#LNw~<U{`Via^B{gMVX)!#|jCh_xM6ikcrvnXbWa+dwBKVSS
zK-4G%y><Bux_?U*2>G&`;o&5)2`5aogs`1~7cMZQ7zj!%L4E`2yzlQN6REX20&O<9
zKV6fyr)TSc<e)zOh9v0_vMgh>JPPzNTC2gL+0x#=u>(({{D7j)c-%tvqls3#Y?Z1m
zV5WUE)zdJ{$p>yX;^P!UcXP?UD~YM;IRa#Rs5~l+*$&nO(;Ers`G=0D!twR(0GF@c
zHl9E5DQI}Oz7<JMjFG_&ureTdMV@FLw-eP(lgkhig$ECHD*ei9Xc()#hw||V#@>4n
zfKP>&$q0($T4y$6w(p=ERAFh+>n%iaeRA%!T%<^+p<H-<uJNd-sy)}Y@~CN#3u5W*
zjI&ROZ_Rx}0(a!_QCcJ9_!&~Uc}E0JQd?M`bT;-C9-Z?5E@UMa&Fe8b7GLT8{XPb5
zyf%i|Q0Xl+SI;QD#Yg>g?M)@ucY<&59$x9M#n+V&>}=nO9wCV{O~lg&v#+jcUj(tQ
z`0u1YH)-`U$15a{pBkGyPL0THv1P|4e@pf@3IBZS4dVJPo#H>pWq%Lr0YS-SeWash
z8R7=jb28KPMI|_lo#GEO|5B?N_e``H*23{~a!AmUJ+fb4HX-%QI@lSEUxKlGV7z7Q
zSKw@-TR>@1RL%w{x}dW#k1NgW+q4yt2Xf1J62Bx*O^WG8OJ|FqI4&@d3_o8Id@*)4
zYrk=>@!wv~mh7YWv*bZhxqSmFh2Xq)o=m;%n$I?GSz49l1$xRpPu_^N(vZ>*>Z<04
z2+rP70oM=NDysd!@fQdM2OcyT?3T^Eb@lIC-UG=Bw{BjQ&P`KCv$AcJ;?`vdZ4){d
z&gkoUK{$!$$K`3*O-j<Zi{jTPO?eI9$VNEqKns5`DQpeK-2u;%g&U>yM1~<ZC)5l@
zIkr@87e@uhA6MH$K<@Z>p-7T*qb)Ys>Myt^;<CgA4AYJg%~h2T>#1&a%O@x8A+E>!
zY<A2w=yT8AF-EkBw>8=eD`ZG)LVagDLBeHg>=atOG?Kr%h4B%E6m@J^C+U|y)XX@f
z8oyJDW|9g=<#f<{JRr{y#~euMnv)`7j=%cHWLc}ngjq~7k**6%4u>Px&W%4D94(r*
z+akunK}O0DC2A%Xo9jyF;DobX?!1I(7%}@7F>i%&nk*LMO)bMGg2N+1iqtg+r(70q
zF5{Msgsm5GS7DT`kBsjMvOrkx&|EU!{{~gL4d2MWrAT=KBQ-^zQCUq{5PD1orxlIL
zq;CvlWx#f1NWvh`hg011I%?T_s!e38l*lWVt|~z-PO4~~1g)SrJ|>*tXh=QfXT)%(
z+ex+inPvD&O4Ur;JGz>$sUOnWdpSLcm1X%aQDw4{dB!cnj`^muI$CJ2%p&-kULVCE
z>$eMR36kN$wCPR+OFDM3-U(VOrp9k3)lI&YVFqd;Kpz~K)@Fa<T;^H>&FRw}L(SoD
z9B4a+hQzZT-BnVltst&=kq6Y(f^S4hIGNKYBgMxGJ^;2yrO}P3;r)(-I-CZ)26Y6?
z&rzHI_1GCvGkgy-t1E;r^3Le30|%$ebDRu2+gdLG)r=A~Qz`}~&L@aGJ{}vVs_GE*
zVUjFnzHiXfKQbpv&bR&}l2bzIjAooB)=-XNcYmrGmBh(&iu@o!^hn0^#}m2yZZUK8
zufVm7Gq0y`Mj;9b>`c?&PZkU0j4>IL=UL&-Lp3j<Z6B(#r?G=5OTIVqppQV7;$oUZ
z_*bfNYVfkU+>&47B5pAW4JceG{!XCA)kT<%2nqCxj<)uy6XR_uws~>_MEKPOpAQ!H
zkn>FKh)<9DwwS*|Y(q?$^N!6<o$@lIm`~9<Ur$vhpA3&|(gF`!3*mR7y8%R>(51O0
z^JM~Ax{AI1Oj$fs-S5d4T7Z_i1?{%0SsIuQ&r8#(JA=2iLcTN+?>wOL532%&dMYkT
z*T5xepC+V6zxhS@vNbMoi|i)=rpli@R9~P!39tWbSSb904ekv7D#quKbgFEMTb48P
zuq(VJ+&L8aWU(_FCD$3^uD!YM%O^K(dvy~Wm2hUuh6bD|#(I39Xt>N1Y{ZqXL`Fg6
zKQ?T2htHN!(Bx;tV2bfTtIj7e)liN-29s1kew>v(D^@)#v;}C4-G=7x#;-dM4yRWm
zyY`cS21ulzMK{PoaQ6xChEZ}o_#}X-o}<&0)$1#3we?+QeLt;aVCjeA)hn!}UaKt<
zat1fHEx13y-rXNMvpUUmCVzocPmN~-Y4(YJvQ#db)4|%B!rBsgAe+*yor~}FrNH08
z3V!97S}D7d$zbSD{$z;@IYMxM6aHdypIuS*pr_U6;#Y!_?0i<Th%7C5rAWptzT;+8
z^q+L14xti*_qZw!_5cZk%7|55dKPFjM~R1_=~fye=uVS6nDqhi)PszECfuz7L#WMP
z!r+BR+k)}Z4nk;GJ*^~}T(1)9OIq4cSY?CPrYrC|{k3|*qwa%HsQZRC>|&yU*@16l
z*dcMqDQgfNBf}?quiu4e>H)yTVfsp#f+Du0@=Kc41QockXkCkvu>FBd6Q+@FL!(Yx
z2`YuX#eMEiLEDhp+9uFqME_E^faV&~9qjBHJkIp~%$x^bN=N)K@kvSVEMdDuzA0sn
z88CBG?`RX1@#hQNd`o^V{37)!w|nA)QfiYBE^m=yQKv-fQF+UCMcuEe1d4BH7$?>b
zJl-r9@0^Ie=)guO1vOd=i$_4sz>y3x^R7n4ED!5o<f!284U3o`_8>XL3@5**h<Hcp
z@=-jpH@#|<>(xr%Hv)_gILarO46q+MaDOF%ChaymKoI6JU5Pg;7#2n9-18|S1;AK+
zgsn6;k6-%!QD>D?cFy}8F;r@z8H9xN1jsOBw2vQONVqBVEbkiNUqgw~*!^##ht>w0
zUOykwH=$LwX<q*Afk5R&Mctbiz>2j&nLy=@{hr)2O&-wm-NyjW7n~Zs9UlH;P7iP3
zI}S(r0YFVYacnKH(+{*)Tbw)@;6>%=&Th=+Z6NHo_tR|JCI8TJiXv2N7ei7M^Q+RM
z?9o`meH$5Yi;@9XaNR#jIK^&{N|DYNNbtdb)XW1Lv2k{E>;?F`#Pq|&_;gm~&~Zc9
zf+6ZE%{x4|{YdtE?a^gKyzr}dA>OxQv+pq|@IXL%WS0CiX!V<r8QLM#*MRM`MfrL^
z$oozeKu(Nq#u9g*IQ%|9o}%#$^xUC(KN(HGggqLu?YF|WP9UQsZcqciD65u58u(o>
zm$fCePA%lU{%pTKD7|5NJHeXg=I0jL@$tOF@K*MI$)f?om)D63K*M|r`gb9edD1~Y
zc|w7N)Y%do7=0{RC|AziW7#am$)9jciRJ?IWl9PE{G3U+$%FcyKs_0Cgq`=K3@ttV
z9g;M!3z~f_?P%y3-ph%vBM<NK?r_G3UHy@sF~h!`(w;SXKRHdJ%HcvAvAAT4Qk5qo
zMgavqbI6bm$F8YQRbu-Q60TKKDjBrG#E!{k)-_O9;f|9i`Bf)1frR1W)S8zwM~^w3
zgTV=3ki=7ABU#jjo~7zOCe&Y02bU{vhUnu>eS@p7P&Ea8M@97+%XEj*(1E6vHj==d
zjsoviB>j^$_^OI_DEPvFkVo(BGRo%cJeD){6Uckei=~1}>sp299|IRjhXe)%?uP0I
zF5+>?0#Ye}T^Y$u_rc4=lPcq4K^D(TZG-w30-YiEM=dcK+4#o*>lJ8&JLi+3UcpZk
z!^?95S^C0ja^jwP`|{<+3cBVo<?ZqT?*63G^-kLfa79`_!t|3XN?#4oHPT!4lKDlS
z&|`dJ%anWz)cD{Izk)|S%W;K%rEUv+1C{y)t{{-Ax56UAAwT>g$(mRdQmadS+Vh~z
zS@|P}=|z3P6uS+&@QsMp0no9Od&27O&14zHXGAOEy<!&qoA;BoYe5Gd6XaoKZz3O>
zh~OKpymK5C%;LLb467@KgIiVwYbYd6wFxI{0-~MOGf<ue$0Kq0FKpj=H%&N{0wz|#
zv3H@VVff^)dR)T~U^4qB#5GIMnxWTKM_`%1WhbW1RX5wx$kbFST`O+(C#(QYZ1qYr
zH@an#>Tq$nBTB!{SrW<fHPfZ<Irz+G=%+OcFrK=FGzu_OrmZYz+$?F}3ABX2OV4Ts
zsD|%iq->mL9H<Tr^2xz>s}C&l&l#m?s*{tA?BHS4mVKHAVMqm63H<|c5n0~k)-kbg
zXidai&9ZUy0~WFYYKT;oe~rytRk?)r8bptITsWj(@HLI;@=v5|XUnSls7$uaxFRL+
zRVMGuL3w}NbV1`^=Pw*0?>bm8+xfeY(1PikW*PB>>Tq(FR`91N0c2&>lL2sZo5=VD
zQY{>7dh_TX98L2)n{2OV=T10~*YzX27i2Q7W<Ntl33%|cp7@_XQ%`!?JFZGy3th+(
z9`69=1diKn`u=0X>86M4$?gZIXZaBq#sA*{PH8){|GUi;oM>e?ua7eF4WFuFYZSG|
zze?srg|5Ti8Og{<A|-y4P^Vw-HZz`B)6iIj60J8RmoI9z56$O?KklBRh#A8wwXksP
zIQ{KLcY4jo=L=|_hbacrV%Qq6!Kf@BT^Q2N;#Sl~J=F}P(ian;6LK=Pia-a<t*`EG
zsvh#5MX1PG_H?pP{~kcN#Yl5&$-{v%{fm#~)VR&aYV*QjB+l!bVIL2ZIK*(o)mg>O
zeFx<XJQO~(UxB;12D)?%fz2kHQ0)}w{#l!~t+-NWg^{L<(tn&kN>uw9!U+zhyk?@w
zjsA6(oKD=Ka;A>Ca)oPORxK+kxH#O@<!C?nm{!PyX#FIJ5Fce^7Glx51z4q!IF-9T
z(}u{s9F?k?->zhC!!XS4@=swnuMk>t+JmLmFiE^1aX3f<)D@`%K0FGK^gg1a1j>zi
z2KhV>sjU7AX3F$SEqrXSC}fRx64GDoc%!u2Yag68Lw@w9v;xOONf@o)Lc|Uh3<2<E
zaTlN?EK~WL>1ctTYu-mFZuHk*+R{GjXHIGq3p)tFtQp%TYqD=j1&y)>@zxoxUJ!G@
zgI0XKmP6MNzw>n<q>RxK$-Gbzs}dyfFzt>#5;f6oR27ql!%+{tr+(`(>%51|k`ML}
zY4eE)Lxq|JMas(;JibNQds1bUB&r}ydMQXBY4x(^&fY_&LlQC)3hylc$~8&~|06-D
z#T+%66rYbHX%^KuqJED_wuGB+=h`nWA!>1n0)3wZrBG3%`b^Ozv6__dNa@%V14|!D
zQ?o$z5u0^8`giv%qE!B<jmp7|q6krbi_G>zZ!3j;BlDlJDk)h@9{nSQeEk!z9RGW)
z${RSF3phEM*ce*>Xdp}585vj$|40=&S{S-GTiE?Op*vY&Lvr9}BO$XWy80IF+6@%n
z5*2ueT_g@ofP#u5pxb7n*fv^Xtt7&?SRc{*2Ka-*!BuOpf}neHGCiHy$@Ka1^Dint
z;DkmIL$-e)rj4o2WQV%Gy;Xg(_Bh#qeOsTM2f@KEe~4kJ8kNLQ+;(!j^b<yk)2Ga*
z`Da!i<uGd>gJMcNhvklP5Z6I+9Fq@c&D~8Fb-4rmDT!MB5QC{Dsb;BharP*O;SF4&
zc$wj-7Oep7#$WZN!1nznc@Vb<_Dn%ga-O#J(l=OGB`dy=Sy&$(5<R7(thvp%XRU@s
zvD!RD|Ap4-iUR15Zc?p`W5PfYcwl?pqP=!?Ly8YQJV-_=LA<5F14$ueWH9bhJ%_)w
z7V5=v(sMJKH%(v>-n3zzu%d7E#^8`T@}V+5B;PP8J14#4cCPw-SQTdGa2gWL0*zKM
z#DfSXs_iWOMt<QB?xsXtvjwXo?shW#x+{vW<m&Lb0}Ev3ll-+Cv)H2yx72H2vfbao
z)&}#dlu-#QrCe#mI_dwQZ8aV{yR-kuQpUpp0FeFvxZ;1G7pk86VlScc8cAV!@aPt-
zLbAEaXYm_LG-m+FWTqvpGKKfn>)0*+Y>Lkd=LlyoHjublNLefhKBv@JoC>P7N1_#>
zv=mLWe96%EY;!ZGSQDbZ<UL<`wO?;KrMew|zF)HM0NF!*C)o2Xb2C86^s8tmic{Un
z9|@ov+&qEN=O7wL#??)8qe;19z|E|XhPC!>Wb#;tzqAGgx~uk+-$+2_8U`!ypbwXl
z^2E-FkM1?lY@yt8=J3%QK+xaZ6ok=-y%=KXCD^0r!5vUneW>95PzCkOPO*t}p$;->
ze5j-BLT_;)cZQzR2CEsm@rU7GZfFtdp*a|g4wDr%8?2QkIGasRfDWT-Dvy*U{?IHT
z*}wGnzdlSptl#ZF^sf)KT|BJs&kLG91^A6ls{CzFprZ6-Y!V0Xysh%9p%iMd7HLsS
zN+^Un$<a&ma`v<=_o?VGrwUGJw=O>tDV)T@i!v?3o0Fsx2qI(AX_$dDkBzQ@fRM%n
zRXk6hb9Py#JXUs+7)w@eo;g%QQ95Yq!K_<hHRNS7N*hWtqjDKgW@A%kX02U4KOU=O
zvdAz#;kT}}XkJ!QzU;bHk8JAq+{_%?pw^sYL|x|PeydT9?IGU9Y51^30i-q1#6m%s
zrcB|gv?mw<4R{=sU0@i6UC;{u0I&8wO=u?6r+b>d=z{0dGS+pToEI6=Bo8+{k$7&Z
zo4>PH(`ce8E-Ps&uv`NQ;U$%t;w~|@E3WVOCi~R4oj5wP?%<*1C%}Jq%a^q~T7u>K
zML5AKfQDv6>PuT`{SrKHRAF+^&edg6+5R_#H?Lz3iGoWo#PCEd0DS;)2U({{X#zU^
zw_xv{4x7|t!S)>44J;KfA|DC?;uQ($l+5Vp7oeqf7{GBF9356nx|&B~gs+@N^gSdd
zvb*>&W)|u#F{Z_b`f#GV<S^Bd2~Pyg*5%eHlTUz}@u5PV*7l^~{G24{Qqrx`@++o~
ztboMW3urCbjTB~&;i*a|(eC0qz31>tQ`pYv3#||N{xj1NgB<#=Odt6{eB%#9RLt5v
zIi|0u70`#ai}9fJjKv7dE!9ZrOIX!3{$z_K5FBd-Kp-&e4(J$LD-)NMTp^_pB`RT;
zftVVlK2g@+1Ahv2$D){@Y#cL#dUj<nH(n~Edj4!|Skt}P+Crb$TxBzD_v2O|_fXDf
zyOhDlptC3q?wEW8BD4DYDy}&OXN4=Gin3Clovb(?fZbl&a4@HKR2*IMYDM~#=eox}
zr@@VgW<o#eD*<OYTh-Dp*Y9Q~ulW+qlsH#Z{CRf%{8T(JHO1!sg=2L#MeC>9*&%#6
zd2m9{1NYp>)6=oAvqdCn5#cx{AJ%S8skUgMglu2*IAtd+z1>B&`MuEAS(D(<6X#Lj
z?f4CFx$)M&$=7*>9v1ER4b6!SIz-m0e{o0BfkySREchp?WdVPpQCh!q$t>?rL!&Jg
zd#heM;&~A}VEm8Dvy&P|J*eAV&w!&Nx6HFV&B8jJFVTmgLaswn!cx$&%JbTsloz!3
zMEz1d`k==`Ueub_JAy_&`!ogbwx27^ZXgFNAbx=g_I~5nO^r)}&myw~+yY*cJl4$I
znNJ32M&K=0(2Dj_>@39`3=FX!v3nZHno_@q^!y}%(yw0PqOo=);6Y@&ylVe>nMOZ~
zd>j#QQSBn3oaWd;qy<AH3+eYXyF}9Xqs&`j)Js;7`deezzO@(if{@wtUbaD%FKTU}
z5w@ZwEG`RDwK>$&5(5H$Ayi)0haAYO6TH<z?6tyk$+6-5SvI&DI1q3_)(4fdZ5Se9
zBbX2Qt9$@_(SU-uL9$`(89L9b^PH`sRc(S;t{a+9w+wmDT>>FR?rhqHmNOO+(})NB
zLI@B@v0)eq!ug`>G<@htRlp3n!EpU|n+G+AvXFrWSUsLMBfL*ZB`CRsIVHNTR&b?K
zxBg<HRwQH-Pd9_*;+0`Ns@&d6=NP#dv8(oIe;Q1jESZF+juys7ECR&=R-98_-67uO
z3$~aVQBexDk*n5GG0IbgeygBsp6J9Ak`^z@J>sN0BjfB>UVcJ|x%=-zb%OV7lmZc&
zxiupadZVF7)6QuhoY;;FK2b*qL0J-Rn-8!X4ZY$-ZSUXV5DFd7`T41c(#lAeLMoeT
z4%g655v@7AqT!i@)Edt5JMbN(=Q-6{=L4iG8RA%}w;&pKmtWvI4?G9pVRp|RTw`g0
zD5c12B&A2&P6Ng~8WM2eIW<Jfbv|J{U>=wxd?r7A*N+&!Be7PX3s|7~z=APxm=A?5
z<Ct?-npDR`F>t>xB4WG|*Td@VX{Rs)PV0|yK`oI3^xn(4c_j&vgxk_Y3o(-`_5o`V
zRTghg6%l@(qodXN;dB#+OKJEEvhfcnc#BeO2|E(5df-!fKDZ!%9!^BJ_4)9P+9Dq5
zK1=(v?KmIp34r?z{NEWnLB3Px{XYwy-akun4F7xTRr2^zeYW{gcK9)>aJDdU5;w5@
zak=<+-PLH-|04pelTb%ULpuuuJC7DgyT@D|p{!V!0v3KpDnRjANN12q6SUR3mb9<-
z>2r~IApQGhst<D{{Vf5G$!J9FyJ-E-*A*EoNl_C#=MT@p=Xle}_jo(=^L>Z!3*?5V
z8#)hJ0TdZ<kpQa~V_Tj9(@ze|2#~^ENp?G7Jt@Eflo`qt*qnbcmXouacBm8O7C@Os
z{Du2beNd5?Za)HL!ZE%zpwg!kyVuKIF9N=(m23`w*$5>g0M-BK#nGFP>$i=qk82DO
z7h;Ft!D5E15OgW)&%lej*?^1~2=*Z5$2VX>V{x8SC+{i10BbtUk9@I#Vi&hX)q<DD
zB<)Sw)<~oUy)n6wE>Q!LwySI{Bnv%Sm)yh{^sSVJ8&h_D-BJ_YZe5eCaAWU9b$O2c
z$T|{vWVRtOL!xC0DTc(Qbe`ItNtt5hr<)VijD0{U;T#bUEp381_y`%ZIav?kuY<v6
zX67v2C6cbIOG)5k!|qC-WsWgOfACm-q$QibFt!z9&vuM4Ra<@t5?aicE~iV+3AvjG
zIg1RVRV7=ipka+$YiU^iebt34R#X_ZP(5j+8#<1x%dt8dn8LWW*;+`tjJbED5GP+(
z;WzH$-`*O}!b)z&?E7%4&3EtV-<sTj?CD<=L+<P?A5!5CSPGi)w4A<JPgzq;6rR*O
zXNdyY$eTaKl$~z~drmdabQ@3giDtf2>G{iyYdEBPW=*xNSc;Rlt6~F4M`5G+VtOjc
z*0qGzCb@gME5udTjJA-9O<&TWd~}ysBd(eVT1-H82-doyH9RST)|+Pb{o*;$j9Tjs
zhU!IlsPsj8=(x3bAKJTopW3^6AKROHR^7wZ185wJGVhA~hEc|LP;k7NEz-@4p5o}F
z`AD6naG3(n=NF9HTH81=F+Q|JOz$7wm9I<+#BSmB@o_cLt2GkW9|?7mM;r!JZp89l
zbo!Hp8=n!XH1{GwaDU+k)pGp`C|cXkCU5%vcH)+v@0eK>%7gWxmuMu9YLlChA|_D@
zi#5zovN_!a-0?~pUV-Rj*1P)KwdU-LguR>YM&*Nen+ln8Q$?WFCJg%DY%K}2!!1FE
zDv-A%Cbwo^p(lzac&_TZ-l#9kq`mhLcY3h9ZTUVCM(Ad&=EriQY5{jJv<5K&g|*Lk
zgV%ILnf1%8V2B0E&;Sp4sYbYOvvMebLwYwzkRQ#F8GpTQq#uv=J`uaSJ34OW<hi?`
zP14gpnLmL&7H`p4L%8;nC1<^Sv9p;w<Wt?|_#g6*20U1pM3nHRfS>ITeSGo6+-8Xw
znCk*n{kdDEi)Hi&u^)~cs@iyCkFWB2SWZU|Uc%^43ZIZQ-vWNExCCtDW<BUzywAz*
z=(vxHv4d&65KOllAq-#$@U{@M^Tie>jqHs;;tWf$v{}0{p0Rvxkq``)*>+Akq%|Na
zA`@~-Vfe|+(AIlqru+7Ceh4nsVmO9p9jc8}HX^W&ViBDXT+uXbT#R#idPn&L>+#b6
zflC-4C5-X;kUnR~L>PSLh*gvL68}RBsu#2l`s_9KjUWRhiqF`j)`y`2`YU(>3bdBj
z?>iyjEhe-~$^I5!nn%B6Wh+I`FvLNv<KrLHuL|CB_`07?lkcY;`F}7N|9w?h$j;W(
z!pz0d;6Gc;=tP?z1|!0VS^mTNfuvL}h&K?b1^iwS6ciDpxQaBY5Gc}49BtNL@wSAH
zN-`fR84|MY8{n7xC}ub4B$LcEGUf*6``pjVtH+rgy&k|kpb4%YlJ~9w&{2Xuzeu1M
zq`UMUPdX@*+$axeLs?$}*bD{+cnrR~Y#}m-O=_R~Wti_#iWT_s(=ymH^VTEl)dozx
zf?Q;WOl1sf3*~dyaUWrzpj(B{DD^$`<1{1iWu+66OaaANG(C3>auve~eX<+Ipl&04
zT}};W&1a3%W?dJ2=N#0t?e+aK+%t}5q%jSLvp3jZ%?&F}nOOWr>+{GFIa%wO_2`et
z=JzoRR~}iKuuR+azPI8;Gf9)z3kyA4EIOSl!sRR$DlW}0>&?Gb<F6G*f~S_Xmd?&<
zcTp7FM31gIFJU_kMTKVvYfdLV&qNi%10o_cXO}T8c0JA~S7IvYJW{2Mc@QfYQPg<B
zM^$g#zoTZ3_m+CrHFauLo}15?s=HP)?JB{H_0#&&1l#xC0PUJEil&(!l3&4$4Oabl
z(t*+m#j)cGDV4roRR~6gh^^UTE<ORsZfX<yztwQw%!x#Q2cf-H-Ei*R_oEBu0*M}r
zaR*fDOfo;PPv!PbT7OdxPFjDUG&0BrIox?s;EQMl0XBMs85AAF!I(xegz)j_*&}F&
zVxM4&whWS?c~-!t@$rAV{-U7+LEraxy0yVT-$%W;DztTaLwSHk;hmJtJCfzd$8ZUU
zM49WF9Y$VtKqVmyl%^86>gPojmj<IO9KHY)gcrh=qY_}jG!}())PpS;BXBA!e*hSR
zilZbQ&4Xd81)(e#05gdbS)_Rc7=w(fM<O8%<WUPqvy2OZsgKBL!XxkiWU2;{7$;C6
z9R+3;R|H$*pUT7|00m@1wlw|z2Pi1^Q3j9tQwHGtFdr%Y_fp{BLtn(*#K`48rPtM-
zeUXnbzjJ6`4-eFtz^q{qhyCKLVL%|Li&oS2mxY?F!w9Q6rOe*>mnln;cTqCt=ADbE
zZ8GAnoM+S1(5$i8^O4t`ue;vO4i}z0wz-QEIVe5_u03;}-!G1NyY8;h^}y;tzY}i5
zqQr#Ur3Fy8sSa$Q0ys+f`!`+>9WbvU_I`Sj;$4{S>O3?#inLHCrtLy~!s#WXV=oVP
zeE93*Nc`PBi4q@%Ao$x4lw9vLHM!6mn3-b_cebF|n-2vt-zYVF_&sDE--J-P;2WHo
z+@n2areE0o$LjvjlV2X7ZU@j+`{*8zq`JR3gKF#EW|#+{nMyo-a>nFFTg&vhyT=b}
zDa8+v0(Dgx0yRL@ZXOYIlV<CBxH&myw-{Mgw|3Z>SZ0|MFizy0VPW8;AfA5|pe!#j
zX}Py^8fl5SyS4g1WSKKtnyP+_PoOwMMw<ZOz9;D51vwfT?(7P{J9uC+df4xbrr2zV
zB?xsPrQn{*Mv;KOgSS&5@+LuXkea1)Zq>u`(i@Z)diJp~U54*-miOchy7Z35eL>^M
z4p<-aIxH4VUZgS783@H%M7P9hX>t{|<kmOhY+3PLOlAj^>RU7$n4T(brCG#h9e9p!
z+o`i;EGGq3&pF;~5V~eBD}lC)>if$w%Vf}AFxGqO88|ApfHf&Bvu+xdG)@vuF}Yvk
z)o;~k-%+0K0g+L`Wal<EEr#pT9&cB!85zTR)doGHYQA;Y4iG{j=~O<Gh^(9@<v8&o
z)RDtDdQW7hx3ZX>a!$=ZV|z$e%>f0%XoLib%)!R^RoS+{!#X?h-<kGFCJU+*P>6uu
zF&&KxORdZU&EwQFITIRLo(7TA3W}y6X{?Y%y2j0It!ekU#<)$qghZtpcS>L3uh`Uj
z7GY;6f$9qKynP#oS3$$a{p^{D+0oJQ71`1?OAn_m8)UGZmj3l*ZI)`V-a>MKGGFG<
z&^jg#Ok%(hhm>hSrZ5;Qga4u(?^i>GiW_j9%_7M>j(^|Om$#{k+^*UL<NK}BVp#gd
zBD%KcbS)b{|CXjYx_P3_eGPFMMz1_Nk540?Us_HPBMy;cERLy0SxfOpX5xzV&cl#)
zt&&6pjk~F}sDOyA9s+XeyTf%9%I=S3nCcDp2Af;QpT#R>nEgzW_1gCICtAD^WpC`A
z{9&DXkG#01Xo)U$OC(L5Y$DQ|Q4C6CjUKk1UkPj$nXH##J{c8e#K|&{mA*;b$r0E4
zUNo0jthwA(c&N1l=PEe8Rw_8cEl|-eya9z&H3#n`B$t#+aJ03RFMzrV@gowbe8v(c
zIFM60^0&lCFO10NU4w@|61xiZ4CVXeaKjd;d?sv<gv6;u6elr3xr~p;CLrK1=+ZBw
z$Y<W|q#kt(G{*D&Qkw-trQSG}>52XM*lS8XiVjgWpRB;&U_C0g+`6B5V&w|O6B*_q
zsATxL!M}+$He)1eOWECce#eS@2n^xhlB4<_Nn?yCVEQWDs(r`|@2GqLe<#(|&P0U?
z$7V5IgpWf09uIf_RazRwC?qEqRaHyL?iiS05UiGesJy%^>-C{{ypTBI&B0-iUYhk>
zIk<5xpsuV@g|z(AZD+C-;A!fTG=df1=<%nxy(a(IS+U{ME4ZbDEBtcD_3V=icT6*_
z)>|J<Y(kIYsG`z6gyX&}qOiN*?9OL086m5X{hq6Ig4#W@iNENBRsw2^%kU7#L2|pp
zfdZ~o%7N|AM$3+2Q82nCBw2dK>?>&6%nvHhZERBtjK+s4xnut*@>G<VN!iYC0JC0F
z7C4lQZ~9^GNSctWumcZ#1X037wz24bZkA{*jo3r`%UXDS^K_+8hV8{gw8t#vVSJ;%
z9c?Ez0!rvl#uwpap|4<*d9UuKSjWn24OFIi&rE}!=sVnu59lQ~@4HIl&h;AmLia?2
zUL-?{m1bz0QDnL&$JD)qE4*iTL#%6dSw8A41Gt4doXWr-`$rleCkVKi0#mfRM!)mY
zIRoubkmw30NIQhtCJYLo7$LOA<y9EL8N*i`%g_PIUD8SHni$er{_LTTN*Wqfx<buS
zb$(_9;wX-c*`xTSvy6C@PDNyy2YoKV;`dWQKiLwa26g^kXh`=t6PlPNT~Q9So~U*X
z3nx6eVXazyi3=nJQw6@jVna;b;RTqKnF0f_=;<}6<0^@)YUgQHPBXXoP$FjaOB^=t
zP%99lNT=pyPA&53Wj%hQT>AmA5m*OTp$!^CHTr}vM4n(X1Q*;{e-Rd2BCF-u@1ZGm
z!S8hJ6L=Gl4T_SDa7Xx|-{4mxveJg=ctf`BJ*fy!yF6Dz&?w(Q_6B}WQVtNI!BVBC
zKfX<>7vd6C96}XAQmF-Jd?1Q4eTfRB3q7hCh0f!(JkdWT5<{iAE#dKy*Jxq&3a1@~
z8C||Dn2mFNyrUV|<-)C^_y7@8c2Fz+2jrae9deBDu;U}tJ{^xAdxCD248(k;dCJ%o
z`y3sADe>U%s<JWQeHbE;y_*tD>uxwwv~8A1+R$VB=Q?%U?4joI$um;aH+eCrBqpn-
z%79D_7rb;R-;-9RTrwi9dPlg8&@tfWhhZ(Vx<DUpdj8XcHW`q{6yqiIzcmGmhsxE(
z@-zez1_>&1PQ+6(huX`;M9x~LrW~~#3{j0Bh2kDU$}@!fFQej4VGkJv?M4rU^x!RU
zEwhu$!CA_iDjFjrJa`aocySDX16?~;+wgav;}Zut6Mg%C4><PTsD!{kIiIRy9=1Aw
zY;M8Eqj=FH#bx<r<6@cBrQ0UMw6e8^v|*8L6vZKlp_PoKF*td|-B1-bh(bXl;KGoy
ze|2jEKeeu5MGO@KpjsEha>}8FL?8)Kgwc(Qlj{@#2Pt0?G`$h7<q&K~4OS{ORv@L7
z>P#M+qoXtlV@d}%c&OzO+QYKK`kyXaK{U(O^2DyIXCZlNQjt0^8~8JzNGrIxhj}}M
z&~QZlbx%t;MJ(Vux;2tgNKGlAqphLq%pd}JG9uoVHUo?|hN{pLQ6Em%r*+7t^<);X
zm~6=qChlNAVXNN*Sow->*4;}T;l;D1I-5T{Bif@4_}=>l`tK;qqDdt5zvisCKhMAH
z#r}`)7VW?LZqfdmXQ%zo5bJ00{Xb9^YKrk0Nf|oIW*K@(=`o2Vndz}ZDyk{!u}PVx
zzd--+_WC*U{~DH3{?GI64IB+@On&@9X>EUAo&L+G{L^dozaI4C3G#2wr~hseW@K&g
zKWs{uHu-9Je!3;4pE>eBltKUXb^*hG8I&413)$J&{D4N%7PcloU6bn%jPxJyQL?g*
z9g+YFFEDiE`8rW^laCNzQmi7CTnPfwyg3VDHRAl>h=In6jeaVOP@!-CP60j3+#vpL
zEYmh_oP0{-gTe7Or`L6x)6w?77QVi~<VDx`fw;L`VTJ3A_($%0y%60sa5-^3a6v=d
zmmak>jD8lWN@3RHcm80iV%M1A!+Y6iHM)05iC64tb$X2lV_%Txk@0l^hZqi^%Z?#-
zE;LE0uFx)R08_S-#(wC=dS&}vj6P4>5ZWjhthP=*Hht&TdLtKDR;rXEX4*z0h74FA
zMCINqrh3Vq;s%3MC1YL`{WjIAPk<IG3Wg4CtfDQ*vr2eaPwC^O8I68!!&nV4u+r_F
z?Axith|qN&KZ99L>VL#3rj^9Pj9Ss7>7duy!9H0vYF%>1jh)EPqvlr6h%R%CxDsk|
z!BACz7E%j?bm=pH6Eaw{+suniuY7C9Ut~1cWfOX9KW9=H><&kQlinPV3h9R>3nJvK
z4L9(DRM=x;R&d#a@oFY7mB|m8h4692U5eYfcw|QKwqRsshN(q^v$4$)HgPpAJDJ`I
zkqj<O$?%~S@(l(Nhxp_sGA`$A%xJ7kG%+@Mg|5Rt6m|$*ZUs_O&tJj;*GHA8ccUky
ztdHVq5%mDvVz0+qo6-yO?&N>q(8Cd!K!+wCd=d@w%~e$=gdUgD&wj$LQ1r>-E=O@c
ze+Z$x{>6(JA-fNVr)X;*)40Eym1TtUZI1Pwwx1hUi+G1Jlk~vCYeXMNYtr)1?qwyg
zsX_e*$h?380O0<gk^3K^_@6N5|2rtFRkhT9jP1T;nZ|k<{vtt&=cCpw1{DoTNi|6*
zNR~&3M3w}sVsT4{Fd&%jPlD{py<BN_zia9hI9Ip7U3}jPq_+H)r9(in7-!PpJihq-
zw0+F|?s|8=Bj*cv!`7|95cO>0ou?0R@7-Fc59o$UvyVs4cUbujHUA>sH!}L54>`e`
zHUx#Q+Hn&Og#YVOuo*niy*GU3rH;%f``nk#NN5-xrZ34NeH$l`4@t);4(+0|Z#I>Y
z)~Kzs#exIAaf--65L0U<!Yaeav3)qH(`Lb>HT_SvV8O2WYeD>Mq^Y6L!Xu8%vnp<f
zF-ML?EA-QLj_KdFHPDT9@{&CgOfV=>ofG@w!}R7M28?i1*T&zp3X4^OMCY6(Dg<-!
zXmcGQrRgHXGYre7GfTJ)rhl|rs%abKT_Nt24_Q``XH{88NVPW+`x4ZdrMuO0iZ0g`
z%p}y};~T5gbb9SeL8BSc`SO#ixC$@QhXxZ=B}L`tP}&k?1oSPS=4%{UOHe0<_XWln
zwbl5cn(j-qK`)vGHY5B5C|QZd5)W7c@{bNVXqJ!<MH|d3P$j9&VqkPfJu|wcu=-=7
zLO~2}`PnP-On3_NOs4{hJ&4M7x*M^%g^_s4zus|ykus<=xkIi<3#(~DK73KtIuaza
zq3Ok{j4$g8v7e>!n$^ufc?N9C-BF2QK1(kv++h!>$QbAjq)_b$$PcJdV+F7hz0Hu@
zqj+}m0qn{t^tD3DfBb~0B36|Q`bs*xs|$i^G4uNUEBl4g;op-;Wl~iThgga?+dL7s
zUP(8lMO?g{GcYpDS{NM!UA8Hco?#}eNEioRBHy4`mq!Pd-9@-97|k$hpEX>xoX+dY
zDr$wfm^P&}Wu{!%?)U_(%Mn79$(ywvu*kJ9r4u|MyYLI_67U7%6Gd_vb##Nerf@>&
z8W11z$$~xEZt$dPG}+*IZky+os5Ju2eRi;1=rUEeIn>t-AzC_IGM-<V%6qUamagiO
zojXi&7L8*IWlJ}@8}`JxyIBN$u|oRzm@w!)u;`JJ^U^Jm{ZDgyeTstN3t7NGnWGqQ
z+EIa8gC_h)Zo3JN!q<7m%o03s)htD&NG?lXEop9Gx?#VaLBB3SgcFmh3F=U430hIe
z6>IXWK3^6QNU+2pe=MBn4I*R@A%-iLD<B3fBYdI`-^xCP#WQb@1Qb6Wc1|7$rrI$0
zS^}=1l87G|X!N)P#&2-ZEFApzIK5kn5Gfb61a_<ma9;3$9f;mz8FokB;4w(Y@!^ya
zu;TjxPaQyO7n{FcIlqEr<3P?P4i)3rOGc5I4};xC;qQa&0^@jzv_HUBAQxZ^AEFm5
zBTlS@?>COHTE-O^wo$sL_h{dcPl=^muAQb`_BRm};=cy{qSkui;`WSsj9%c^+bIDQ
z0`_?KX0<-=o!t{u(Ln)v><gH1Bl$-3LHEdi^mWSpxtdQaHDvah(~iGD|9egFyL#F~
z?1$GP^^-sUm)!YZ{+<LJ&0K7L{O*24r%mkt6Kzb6pOOS-L<yFAV=ioJ*0iBM>%VGL
z0pC=GB7*AQ?N7N{ut*a%MH-tdtNmNC+Yf$|KS)BW(g<Pdkm%)j?It(+K5~xj$IK!{
zp+6C1gYJS33v`?Xo$~~77W&2eyGU19<m6FlNMw~43kikpZyKWVZ@N_*wAeJPS%sqH
z40>QJ*z$d{+{j?(e&hgTy^2|AR9vx1Xre2fagGv0YXWqtNkg*v%40v?BJBt|f9wX5
z{QTlCM}b-0{mV?IG>TW_BdviUKhtosrBqdfq&Frdz>cF~yK{P@(w{Vr7z2qKFwLhc
zQuogKO@~YwyS9%+d-zD7mJG~@?EFJLSn!a&mhE5$_4xBl&6QHMzL?CdzEnC~C3$X@
zvY!{_GR06ep5;<#cKCSJ%srxX=+pn?ywDwtJ2{TV;0DKBO2t++B(tIO4)Wh`rD13P
z4fE$#%zkd=UzOB<aY_AGMe|>7<sX$yw1SM(554I3!t(H9Z3X!Z5et+k4_h&Thynx!
ziTh5;c|GaYsH@W32G0xdhr&KvFp@Nsj=Jm7RQGd8dhSlG9l$e(+pLSdq<w-kZ7W-l
z4}DfW=)F=&(6?c$=|K`W>4gi=-*CuID&Z3zI^-`4<B}t|X$V`AU_-K%EPFL_9twl_
zImK{&j6sLpH7WP2;B>U^S?dHxK8fP*;fE|a(KYMgMUo`T<LMf=xb>HIS1f!*6dOI2
zFjC3<eBkTsS?s~jirYF@Nrja2AilB1Grgok(Q{J!-yMdCbS>O=-AL`<v=HnY;qaP=
z%r%pyN;W|G{!13k5HgL{&1MZLLmT$ii@iPy4pvmSTAaNGh_&&=IOx}7|G?LkOre&5
ze$)ZX{wLMvAK$Ose+l($RV_DU6^yOKl5CcA-LOWBeC<Ob3qkCr#=PoHO}Zi7*=4i{
zhy`P5@RD?fbbG{Ww-4~nmy*sK0y=*cjfuaB@ZGQIADDEhtuw}4>6=9pp;`CYPTdVX
z8(*?V&%QoipuH0>WKlL8<OkjufLT@|_$itWfOgQQ1sL1my9jh5H&Aqu0?B`H1h6nM
zGosuE%;~qrp)5qiJt5TkvuY!P5&J7R#=`qhgmgE%$-tyU1wv>A*zTKckD!paN@~hh
zmXzm~qZhMGV<pYA7aBO1=rl8ewvtPYii`7)>dQGd=AG8&20HW0RGV8X{$9LldFZYm
zE?}`Q3i?xJRz43S?VFMmqRyvWaS#(~Lempg9nTM$EFDP(Gzx#$r)W&lpFKqc<y(GB
zQ)iBAS`N=DS_sKSXI55P(lHkwH)@Q89oTUc{-vd@Ue^>AoJh-AxEw$-bjW>`_+gEi
z2w`99#UbFZGiQjS8kj~@PGqpsPX`T{YO<m!OQBo2H4HfqK(14BF?XJ?$*LTUG|=XE
z%mXZ(W0fCU1)eodty#XEHD<Wgj8lxa(h;^9fW-UTt2UUeP&QC*>j`CaEqTFag;$jY
z8_{Wzz>HXx&G*Dx<5skhpETxIdhKH?DtY@b9l8$l?UkM#J-Snmts7bd7xayKTFJ(u
zyAT&@6cAYcs{PBfpqZa%sxhJ5nSZBPji?Zlf&}#L?t)vC4X5VLp%~fz2Sx<*oN<7`
z?ge=k<=X7r<~F7Tvp9#HB{!mA!QWBOf%EiSJ6KIF8QZNjg&x~-%e*tflL(ji_S^sO
ztmib1rp09uon}RcsFi#k)oLs@$?vs(i>5k3YN%$T(5Or(TZ5JW9mA6mIMD08=749$
z!d+l*iu{Il7^Yu}H;lgw=En1sJpCKPSqTCHy4(f&NPelr31^*l%KHq^QE>z>Ks_bH
zjbD?({~8Din7IvZeJ>8Ey=e;I?thpzD=zE5UHeO|neioJwG;IyLk?xOz(yO&0DTU~
z^#)xcs|s>Flgmp;SmYJ4g(|HMu3v7#<SE)AQQ4Qptn#EiMmi^p+Ei-7DkP+HtEnTf
z=ihEK*Qz1Ztg&vmf~iSIJvy;UM*1)`@_lXyy`^7!LwVESw8;X~PDDY`x##uAQg`Ks
z=exD#_B;<%p^vKWmo{iCKEmMLveyeSGEVy6BOKJM`)TA4((F&!tIH99l|b!Je=-ox
zTnV0#B=>;c*Aa8iF#UZo7CvDq4>8#qLJ|YdZ!AsH%^_7N1IQjCro<P>K7UpUK$>l@
zw`1S}(D?mUXu_C{wupRS-jiX~w=Uqqhf|Vb3Cm9L=<ElBA0am3@5~_j4>T+w91Cu^
z*&Ty%sN?x*h~mJc4g~k{xD4ZmF%FXZNC;oVDwLZ_WvrnzY|{v8hc1nmx4^<U$9gKZ
zXD;2On@pVz(zumYHw-cDICc(jj*0Lh@n}B_NB#b3S3Y`joU*Hg4D^tTlnR`X)wKiE
z<1(Wo3^4=KAZ|hOq%hItOAv2Yuv6FZW5>}Z;yriXsAf+Lp+OFLbR!&Ox?x<j>ABwl
zu8w&|5pCxmu#$?Cv2_-Vghl2LZ6m7}VLEfR5o2Ou$x02uA-%QB2$c(c1rH3R9hesc
zfpn#oqpbKuVsdfV#cv@5pV4^f_!WS+F>SV6N0JQ9E!T90EX((_{bSSFv9ld%I0&}9
zH&Jd4MEX1e0iqDtq~h?DBrxQX1iI0lIs<|kB$Yrh&cpeK0-^K%=FBsCBT46@h<vzQ
zR*F0_2h6Z11Y9Zzju}{_b;!1TT<IfdPup&BfzJrm?bo16`)8Tm1upGj;JDjh>#<Oy
z8t{(&!50W|bIsS-1avP5+*F7gID<ta6$a=EvLN_Gn#i|E*V;GvV|e9HOQ9zf{;k4c
z6E1|)o(`{GoSr=usmv~(8ivQq9h)k@J&LDI;_p0b!?ZsmFktoKIgcIE+yq!s*%$%)
z#ay6sWeCZZ>yi!AyDq1V(#V}^;{{V<B^hU0(%wtueKZDT`@J-_3mSLDBmaq~d|XD<
z7XM`Dzdz4EMpggFBJu`~KkDTFp@A11CGaoLc9C!MrY0!YzwKdGfx7t=5D@|7VVCk+
z^B@Uld)n>*@T4WJ&U-NTq43w=|K>z8%pr_nC>%C(Wa_l78Ufib$r8Od)IIN=u>417
z`Hl{9A$mI5A(;+-Q&$F&h-@;NR>Z<2U;Y21>>Z;s@0V@SbkMQQj%_;~+qTuQ?c|AV
zcWm3XZQHhP&R%QWarS%mJ!9R^&!_)*s(v+VR@I#QrAT}`17Y+l<`b-nvmDNW`De%y
zrwTZ9EJrj1AFA>B`1jYDow}~*dfPs}IZMO3=a{Fy#IOILc8F0;JS4x(k-NSpbN@qM
z`@aE_e}5{!$v3+qVs7u?sOV(y@1Os*Fgu`fCW9=G@F_#VQ%xf$hj0~wnnP0$hFI+@
zkQj~v#V>xn)u??YutKsX><O0LtJ*CTAYt%cbdwrkH)-u>pxKCl^p!C-o?+9;!Nug^
z{rP!|+KsP5%uF;ZCa5F;O^9TGac=<e#17!Ngq|n{jbexb$Js+_Pf5VvAs8ZxqWBs8
zB0&S13$r1<F)VBk$e5stmC}hMzOSHBA0t(kR!@rbH|QU6*IH0=h|_OeT6wpDmdv23
z34f|!XhwEd(Kkc%FUYK@H{gz+bBn7dd5Fl+jfYh}4;N_qJ7#NlT2O+)R4Pxe>M|=V
z_H(PfkV1rz4jl?gJ(ArXMyWT4y(86d3`$iI4^l9`vLdZkzpznSd5Ikfrs8qcSy&>z
zTIZgWZGXw0n9ibQxYWE@gI0(3#KA-dAdPcsL_|hg2@~C!VZDM}5;v_Nykfq!*@*Zf
zE_wVgx82GMDryKO{U{D>vSzSc%B~|cjDQrt5BN=Ugpsf8H8f1lR4SGo#hCuXPL;QQ
z#~<Fi-k$qL|7RVJ1Mm1Ag<$gB0(}Grl3S(KpYI)5f?u3vC7#V;7XHhGB(-?~L6q0H
z3n(8fq$!!dl{5-8`7Oj@C;&S?r8OD`wLCHxqX2ux!ard>b?C4MoepT3X`qdW2dNn&
z<Us;@%o6IHN`m_LE5@ycr7sZ5iN)2(z?4TB28{O!&JjxPf+H!xrjo)qWAjZifsc&}
zRUE<g)K;7gXjcN#UR5gfzb2nN2z_KhXjz-ToEUcM{^o{zAJg2QnR<7bsn(YUNIKF@
z%f>o8)K}%Lpu>0tQei+{<z4bzXqy$)s2?v=pRVAvTjJpopW)-K>>*VGErz|qjbK#9
zvtd8rcHplw%YyQCKR{kyo6fgg!)6tHUYT(L>B7er5)41iG`j$qe*kSh$fY!PehLcD
zWeKZHn<492B34*JUQh=CY1R~jT9Jt=k=jCU2=SL&&y5QI2uAG2?L8qd2U(^AW#{(x
zThSy=C#>k+QMo^7caQcpU?Qn}j-`s?1vXuzG#j8(A+RUAY})F@=r&F(8nI&HspAy4
z4>(M>hI9c7?DCW8rw6|23?qQMSq?*Vx?v30U%luBo)B-k2mkL)Ljk5xUha3pK>EEj
z@(;tH|M@xkuN?gsz;*bygizwYR!6=(Xgcg^>WlGtRYCozY<<HB2ZgQ)h*70uL|pZ2
z>rFX2E>kaZo)O<^J7a`MX8Pf`gBd4vrtD|qKn&B)C&wp0O-x*@-|m*0egT=-t@%dD
zgP2D+#WPptnc;_ugD6%zN}Z+X4=c61XNLb7L1gWd8;NHrBXwJ7s0ce#lWnnFUMTR&
z1_R9Fin4!d17d4jpKcfh?MKRxxQk$@)*hradH2$3)nyXep<KqaI3?gC#eK%i>5Z;B
z?yX+-Bd=TqO2!11?MDtG0n(*T^!CIiF@ZQymqq1wPM_X$Iu9-P=^}v7npvvPBu!d$
z7K?@CsA8H38+zjA@{;{kG)#AHME>Ix<711_iQ@WWMObXyVO)a&^qE1GqpP47Q|_AG
zP`(AD&r!V^MXQ^e+*n5~Lp9!B+#y3#f8J^5!iC@3Y@P`;FoUH{G*pj*q7MVV)29+j
z>BC`a|1@U_v%%o9VH_HsSnM`jZ-&CDvbiqDg)tQEnV>b%Ptm)T|1?TrpIl)Y$LnG_
zzKi5j2Fx^K^PG1=*?GhK;$(UCF-tM~^=Z*+Wp{FSuy7iHt9#4n(sUuH<I4zjuT^*g
zTOj)T0YzMUS}8lWAh=#M-<;W0su6sG+MC}XSqZj-`H_&HD?2Z|qE8QdIM(X#NffsX
zuwQ<4gm=;OFM%U76&;OT!|+5x?B#F>K??@v+6*|10Csdnyg9hAsC5_OrSL;jVkLlf
zHXIPukLqbhs~-*oa^gqgvtpgTk_7Gy<tuXYyQ=x)UZpuI%V@LXq4|I6hehHuRD*Qk
z+@3~62V_$Rc&ax*Fe`-_r5e~NV8Dh7^Qkc47_lIobqdmfrCP}{2YGKf1@Bh!U!R!l
zVh{a+_vh}x%?yACZT@@6M}$iwzkin{Cukrb?*D6_{O__PU~FPz|81A`Ki4JuH6?`4
zsEYmO+F8y*acunUpGD%Hp$-=5%jS}CI-%I;>pwH><53riYYL*M=Q@F-yEPLqQ&1Sc
zZB%w}T~RO|#jFjMWcKMZccxm-SL)s_ig?OC?y_~gLFj{n8D$J_Kw%{r0oB8?@dWzn
zB528d-wUBQzrrS<hDznyNCgL-qOb|p_nS%fSb5_Ze4V1&Bgk3Vp>SL<Uo&kY+&uyM
z)HW-LQUfYYW$q$nSg9;!DvA>q?fR!K%59Zv9J4yCQhhDGwhptpA5O5U?Hjqt>8nOD
zi{)0CI|&Gu%zunGI*XFZh(ix)q${jT8wnnzbBMPYVJc4HX*9d^mz|21$=R$J$(y7V
zo0dxdbX3N#=F$zjstTf*t8vL)2*{XH!+<2IJ1VVFa67|{?LP<VPiuN~9#0c!YR12a
zo;Lj{EKN(4nyG=Ui#tn@dJ(*;md5Ze)kd6mTF$dEZFv=nYr2Lr(==N%ad&FBajgz2
z6_^{6U)eu_=!Ix#bP)AA_tuIc?3=_Mq548F*;t3>&P41h$2i2;?N~RA30LV`BsUcj
zfO9#Pg1$t}7zpv#&)8`mis3~o+P(DxOMgz-V*(?wWaxi?R=NhtW}<#^Z?(BhSwyar
zG|A#Q7wh4OfK<|DAcl9THc-W4*>J4nTevsD%dkj`U~wSUCh15?_N@uMdF^Kw+{agk
zJ`im^wDqj`Ev)W3k3stasP`88-M0ZBs7;B6{-tSm3>I@_e-QfT?7|n0D~0RRqDb^G
zyHb=is;IwuQ&ITzL4KsP@Z`b$d%B0W<OzS}HhQ;U8}SDa=0^@=1zh?Ebwi(h3u<mG
zLDfZ57#t!A)c-~Y!O{*PdxzDz>uhioo1CWttW8yhsER1ZUZzA{F*K=wmi-sb#Ju+j
z-l@In^IKnb{bQG}Ps>+Vu_W#grNKNGto+yjA)<V{IjYd_d3*YH={C)J_}OyeI6i_H
zu|j9WcfZFU?Sgyac7umALud#l=-0F(fRl)O28qU<%HpKBh#im&8;uWW953cFj~Iy&
zlr`ZkkzG;HtUSI4CRZRj#8C;N*{hU?ezHzcM|gV*@iklj`$SU@402&viLPBGiWFL2
zQ6uyKx<>?>0?~X`4I3T@5G1)RqGUZuP^NJCq&^HykuYtMDD8qq+l8RcZNJsvN(10{
zQ1$XcGt}QH-U^WU!-wRR1d--{B$%vY{JLWIV%P<Wi12|;({s?oRC80?)@wj6UxV~n
zk2qyqDc_<v8oEujhH&m^Jq#Lrp)Z$Oj#rxlpUicHLpt@6dZF=U?wW~T_o?IN;ie!I
zV-bY1+kei0nc9KpW>4-KQuxxDeJaF#{eu&&r!3Qu{w}0f--8^H|KwE>)ORrcR+2Qf
zb})DRcH>k0zWK8@{RX}NYvTF;E~phK{+F;MkIP$)T$93Ba2R2TvKc>`D??#mv9wg$
zd~|-`Qx5LwwsZ2hb*Rt4S9dsF%Cny5<1fscy~)d;0m2r$f=83<->c~!GNyb!U)PA;
zq^!`@@)UaG)Ew(9V?5ZBq#c%dCWZrplmuM`o~TyHjAIMh0*#1{B>K4po-dx$Tk-Cq
z=WZDkP5x2W&Os`N8KiYHRH#UY*n|nvd<ovggige3u`1qGi1+Y8X!3s{W#*m=tX&CV
zNWQ(*z*>(U>yO=MFI-2BEp?x@=N<~CbLJBf6P)}vLS?xJXYJ2^<3KJUdrwKnJnTp{
zjIi|R=L7rn9b*D#Xxr4*R<3T5AuOS+#U8hNlfo&^9JO{VbH!v9^JbK=TCGR-5EWR@
zN8T-_I|&@<P6ysJp1u%bVccl?q?sU4Onn?IFII0`6;jp*_+1Vcjf$mX{%JA^!$Gkf
z>A}(hKeL4_*eb!1G8p~&_Im8|wc>Cdir+gg90n1dw?QaXcx6Op_W1r=axR<Rt0$d-
z&gdORS`9;Z%6j=d$PU%VL0xT-jF-dHo&#w}>w>4;rM*UOpT#Eb9xU1IiWo@h?|5uP
zka>-XW0Ikp@dIe;MN8B01a7+5V@h3WN{J=HJ*pe0uwQ3S&MyWFni47X32Q7SyCTNQ
z+sR!_9IZa5!&GTf&V$`q!%H8ci!a|RMx5}5MA_kr+bhtQy{-^)(hCVa@I!^TV4RBi
zAFa!Nsi3y37I5EK;0cqu|9MR<Vh>j<^r&h1lF}u0KpKQD^5Y+LvFEwM<n%Y4Ns0&r
z#Pgp7tfaM#i}k;d-@gi@qNBc}@xL(OgxbkB%Zc*U!8(yY_d_z4QrJ%DIL^_}pG(C;
zxV&Dt0*#6mW+VnKpUKH&)*t(_EhJ1#-d4~Kom-)N+kGAW3vl$z=E{EB!4#iw1#JGZ
zpZv7B?(+0N;`4s@&;+D$6BOaTPLlV-MY35`gn~5zS!mCgh|W$2sr@*jRa}74{|6)>
zLU@@v4_Na#Axy6tn3P%sD^5P#<7F;sd$f4a7L<t5V42bo`*JV+&3HWm9OI@30?%Oh
z5o+B(*v(C-H_!6}Lzhp-kE~j|H(u&BA@KX~k?60QV5NR)N2OJYIOG(f(FG`kmvdU7
zwM#zp&<w6$6785wBe4}t?5yT4MP5N47S8;*P_q6hn|Wj2S~%IPE(O9P2?RAKY>BMk
zGU^RZHBcxSA%kCx*eH&wgA?Qwazm8>9SCSz_!;MqY-QX<1@p$*T8lc?@`ikEqJ>#w
zcG``^CoFMAhdEXT9qt47g0IZkaU)4R7wkGs^Ax}usqJ5HfDYAV$!=6?>J6+Ha1I<5
z|6=9<InPN?WUE(MBZF{EDJ@lH!7KCH_7xC@JvfZL@&nc!7K2v>soU4>E))tW$<#>F
ziZ$6>K<f#VmS*<VCLk5Snrr-`d{Bp+A{=r<v#~0tw_zC-WYWg-s*<dPsHVYZm|7R#
z>Jf0bPfbx_)7-}tMINlc=}|H+$uX)mhC6-Hz+XZxsKd^b?RFB6et}O#+>Wmw9Ec9)
z{q}XFWp{3@qmyK*Jvzpyqv57LIR;hPXKsrh{G?&dRjF%Zt5<eso5q2Qq7|ChevXoo
zUUuocw%BMFdbc16MLS>&m20Ll?Oy<ul%w2Zua&zkQjQpssgWs#RKAL}6i`eHUzx7p
zSoNx{P@%ayUjixVqBLi(th$z4mR4dC*OaQENb9y_y<R>fUYC3WRn{cgQ?^V~UAv+5
z&_m#&nIwffgX1*Z2#5^Kl4DbE#NrD&Hi4|7SPqZ}(>_+JMz=s|k77aEL}<=0Zfb)a
z%F(*L3zCA<=xO)2U3B|pcTqDbBoFp>QyAEU(jMu8(jLA61-H!ucI804+B!$E^cQQa
z)_ERrW3g!B9iLb3nn3dlkvD7KsY?sRvls3QC0qPi>o<)GHx%4Xb$5a3GBTJ(k@`e@
z$RUa^%S15^1oLEmA=sayrP5;9qtf!Z1*?e$ORVPsXpL{j<cf(aoOxSwSjBR1mea0e
z^c3Q=wn8)%*koW31D%}j9dO0gR7Y@b_00J@vMv~iauOuPVkQGKG^LpkGueXj*z<(H
zv6PsTl$Bd^g^4yr-LK|PFHGhgs2bdrT$8)TR7nc(mnkPw6vrbaA_vA0JdMNVsoX7X
zzI)+QrS1A&rLp4hm_yCc#tP)G{!hBfUV@*{)uh;EQVyvIbs@16JE!9Psq3UaXMGQ^
zJ^Y2zHl3Zl1idw?=a56!^Y{sB%B-k(>L<6E)0sj&swP3}NPmR%FM?O>SQgN5XfHE<
zo(4#Cv11(%Nnw_{_Ro}r6=gKd{k?NebJ~<~Kv0r(r0qe4n3LFx$5%x(BKvrz$m?LG
zjLIc;hbj0FMdb9aH9Lpsof#yG$(0sG2%RL;d(n>;#jb!R_+dad+K;Ccw!|RY?uS(a
zj~?=&M!4C(5LnlH6k%aYvz@7?xRa^2gml%vn&eKl$R_lJ+e|xsNfXzr#xuh(>`}9g
zLHSyiFwK^-p!;p$yt7$F|3*IfO3Mlu9e>Dpx8O`37?fA`cj`C0B-m9uRhJjs^mRp#
zWB;Aj6|G^1V6`jg7#7V9UFvnB4((nIwG?k%c7h`?0tS8J3Bn0t#pb#SA}N-|45$-j
z$R>%7cc2ebAClXc(&0UtHX<>pd)akR3Kx_cK+n<}FhzmTx!8e9^u2e4%x{>T6pQ`6
zO182bh$-W5A3^wos0SV_TgPmF4WUP-+D25KjbC{y_6W_9I2_vNKwU(^qSdn&>^=*t
z&uvp*@c8#2*paD!ZMCi3;K<prH4a&u_&GPH7Wp*~+CzWZpzTbL&)4k=HC|E1=uT#O
zPuYcj5Jbb}c9+j=4foN!N-fSV`(c<30`iukl213!u4qI^cD3Ytg!~P3N1S(`-3^yF
zlFzoPUGJT0f@f{`ZDabbR@mNt3*M;G<P@?W{8}}Lg`Hvy7&Jmu=04fd1erYTPEwb=
z15%g^dl!<DS<}y*RUs*g6?{>{Na;I4Q35zw$YrW5U@Kk~)&rw;G?d7Q&c9|x<<Maw
zgBLn8*CoGPVj_^rpPn~ENvB}-`?vqbUaC%e+l73U+D^-Cb&Pj13W#6g!+x^$_tX)*
z@cs9Wk+{Dzx8NJw-G7(M|EOb>Hg|CNMsxovmfth*|E*GHezPTWa^Hd^F4!B3sF;)?
z(NaPyAhocu1jUe(!5Cy|dh|W2=!@fNmuNOzxi^tE_jAtzNJ0JR-avc_H|ve#KO}#S
z#a(8secu|^Tx553d4r@3#6^MHbH)vmiBpn0X^29xEv!Vuh1n(Sr5I0V&`jA2;WS|Y
zbf0e}X|)wA-Pf5gBZ>r4YX3Mav1kKY(ulAJ0Q*jB)YhviHK)w!TJsi3^dMa$L@^{`
z_De`fF4;M87vM3Ph9SzCoCi$#Fsd38u!^0#*sPful^p5oI(xGU?yeYjn;Hq1!wzFk
zG&2w}W3`AX4bxoVm03y>ts{KaDf!}b&7$(P4KAMP=vK5?1In^-YYNtx1f#}+2QK@h
zeSeAI@E6<gTYtZud5<DeMJ>Z8a?)>sZ`fbq9_snl6LCu6g>o)rO;ijp3|$vig+4t}
zylEo7$SEW<_U+qgVcaVhk+4k+C9THI5V10qV*dOV<w;)q>6pPtAI$)QN{!JRBKh-D
zk2^{j@bZ}yqW?<#VVuI_27*cI-V~sJiqQv&m07+10XF+#ZnIJdr8t`9s_EE;T2V;B
z4UnQUH9EdX%zwh-5&wflY#ve!IWt0UE-My3?L#^Bh%kcgP1q{<CvcqqQ2utk7s%sV
zoGl}#Zl)W@RNSAAf;w-DBO+*e0HO2%x-G=Z;*Pl$zHy^xW)%na$gbyTIw>&26eXLn
zTkjJ*w+(|_>Pq0v8{%nX$QZbf)tbJaLY$03;MO=Ic-uqYUmUCuXD>J>o6BCRF=xa%
z3R4SK9#t1!K4I_d>tZgE>&+kZ?Q}1qo4&h%U$GfY058s%*=!kac{0Z+4Hwm!)pFLR
zJ+5*OpgWUrm0FPI2ib4NPJ+Sk07j(`diti^i#kh&f}i>P4~|d?RFb#!JN)~<!~Z`*
zivNy#7kzu<{{tyjD6d)1{g;-B-EK2+0;|?2Nj`=2hUDsRiVj-}RAJN{d@x~38|)#_
zx&F#UxFFdbXxE(|#84p;-%dtBDbgEpl>D@)beox}bw?4VCf^y*`2{4`-@%SFTry2h
z>9VBc9#JxEs1+0i2^LR@B1J`B9Ac=#FW=(?2;5;#U$0E0UNag_!jY$&2diQk_n)bT
zl5Me_SUvqUjwCqmVcyb`igygB_4YUB*m$h5oeKv3uIF0s<pxU7XrV;DR{UhyjHRs5
zb+8Qf7A65FfQ?d1ZT2w}F_l*Eb)?ah<8c%Yy;Eal4{xBsX^nN@Pe5CxcymxUwL?eh
zv9_Z0XXBqZl6EhcKDo~Ou&%?PpG{{$wPe(7oy?yZ1mnWmr0b~pN$igR!(Rx*QN$iy
z=-Re}qI2g(ku?t~HgBj3V=|H$hiN2{j!P%zCB+1x34pnjx#?&{ENcU`o_2tynp}0U
zKI9mTgI{WS`?XY!3FH!0Q>k}~es!{D>4r%PC*F~FN3owq5e0|Y<Du-bB4EU)q{6=q
z#<0gBE8S|!ZrmQeH3JgM^AxLU0k8cAwCY-9?0w8gxwWKqzGP>eUTSG#Vq%&Gk7uwW
z0lDo#_wvflqHeRm*}l?}o;EILszBt|EW*zNPmq#?4A<L0#2f-Fpgzo6i9m?Cv{^Fe
z9>+&i0xx^?9obLyY4xx=Y9&^G;xYXYPxG)DOpPg!i_Ccl#3L}6xAAZzNhPK1XaC_~
z!A|mlo?Be*8Nn=a+FhgpOj@G7yYs(Qk(8&|h@_>w8Y^r<TLc~8#)=w@0;xlrL@mM3
zg*K(X)@-O)lt;P?5e(;WTL%O;a;rQNAE5;DqERSyAXc1biP%NUWXy?=-B^)wQ=+I4
zU%qA-ghSXXn27E3w8NMG!5XHJY>&5nCqe0V60rRz?b5%J;GYeBqSAjo|K692GxD4`
zRZyM2FdI+-jK2}WAZTZ()w_)V{n5tEb@>+JYluDozCb$fA4H)$bzg(Ux{*hXurjO^
zwAxc+UXu=&JV*E59}h3kzQPG4M)X8E*<G*nR0cBsK+3(q5`<{N)Z$_eT#;miD(s%h
z{fdYdgo~K&tWs<DY?yIi#?k!bT;M<ZDoV|<xhf7jcRFXDXl`LtGFz=LPAW$(hAEz}
zq@oGhJoeM0w4KvLTg%>}#_&}w*KEg<F5P|-B$Y<3$zfM|>tX)cU{vm9b$atHa;s>|
z+L6&cn8xUL*OSjx4YGjf6{Eq+Q3{!ZyhrL&^6Vz@jGbI%cAM9GkmFlamTbcQGvOlL
zmJ?(FI)c86=JEs|*;?h~o)88>12nXlpMR4@yh%qdwFNpct;vMlc=;{FSo*apJ;p}!
zAX~t;3tb~VuP|ZW;<Q&V!o{~5>z$=IHf->F@Ml)&-&Bnb{iQyE#;GZ@C$PzE<QuI)
zv_|vm%I_n1Dpq6lr--l%%tq7K!<v~?55k`WhA?y(q<f`<c6%@dUw9u~aA?j^`pueW
zW?_3n{u)d4@AQyf;UHiIRxp16RoWg&F+uwIJYB{!Spu!Z6TFEXau<!8UfawC4vbZv
zJTpZLC-RhzHO9xSd6HqYzkfjT+8e>f6~q}4D>9jic@mTO5x76ulDz@+XAcm35!VSu
zT*Gs>;f0b2TNpjU_BjHZ&S6Sqk6V1370+!eppV2H+FY!q*n=GHQ!9Rn6MjY!Jc77A
zG7Y!lFp8?TIHN!LXO?gCnsYM-gQxsm=Ek**V<u>mZu7vnuufD7K~GIxfxbsQ@qv2T
zPa`tvHB$fFCyZl>3oYg?_wW)C>^_iDOc^B7klnTOoytQH18WkOk)L2BSD0r%xgRSW
zQS9elF^?O=_@|58zKLK;(f77l-Zzu}4{fXed2saq!5k#UZAoDBqYQS{sn@j@Vtp|$
zG%gnZ$U|9@u#w1@11Sjl8ze^Co=)7yS(}=;68a3~g;NDe_X^}yJj;~s8xq9ahQ5_r
zxAlTMnep*)w1e(TG%tWsjo3RR;yVGPEO4V{Zp?=a_0R#=V^ioQu4YL=BO4r0$$XTX
zZfnw#_$V}sDAIDrezGQ+h?q24St0QNug_?{s-pI(^jg`#JRxM1YBV;a@@JQvH8*>>
zIJvku74E0NlXkYe_624>znU0J@L<-c=G#F3k4A_)*;ky!C(^uZfj%WB3-*{*B$?9+
zDm$WFp=0(xnt6`vDQV3Jl5f&R(Mp};;q8d3I%Kn>Kx=^;uSVCw0L=gw53%Bp==8Sw
zxtx=cs!^-_+i{2OK`Q;913+AXc_&Z5$@z3<)So0CU3;JAv=H?@Zpi~riQ{z-zLtVL
z!oF<}@IgJp)Iyz1zVJ42!SPHSkjYNS4%ulVVIXdRuiZ@5Mx8LJS}J#qD^Zi_xQ<pa
zK0_C<`%bp5M~CVCk7hV^j*M;Wzcj7kCsCfgg5CJ~2`y3|66=yp|GC7FJNP7A_Wc+(
zejiW#M^nRp{rUgmIRC{MB`ST%d>@>DKDr-_e#>5h3dtje*NcwH_h;i{Sx7}dkdpuW
z(yUCjckQsagv*QGMSi9u1`Z|V^}Wjf7B@q%j2DQXyd0nOyqg%m{CK_lAoKlJ7#8M}
z%IvR?Vh$6aDWK2W!=i?*<77q&B8O&3?zP(Cs@kapc)&p7En?J;t-TX9abGT#H?TW?
ztO5(lPKRuC7fs}zwcUKbRh=7E8wzTsa#Z{a`WR}?UZ%!HohN}d&xJ=JQhpO1PI#>X
zHkb>pW04pU%Bj_mf~U}1F1=wxdBZu1790>3Dm44bQ#F=T4V3&HlOLsGH)+AK$cHk6
zia$=$kog?)07HCL*PI6}DRhpM^*%I*kHM<#1Se+AQ!!xyhcy6j7`iDX7Z-2i73_n#
zas*?7LkxS<P{ZTB#tR|&N^U;Moy2#JwwW4RFPddYtD_bw0R1|Eo=5;j>-XSqv;YBa
zW_n*32D(HTYQ0$feV_Fru1ZxW0g&iwqixPX3=9t4o)o|kOo79V$?$uh?#8<F`E$a_
zutly1{7L1J@Y@6Vp*~KB!yXMF2QHqby@+ZG8+ND)X+s9is!(NOe)h&%h+bxjPFhwq
z$60~SJQ<aykcGl3;BUCZ>Q8e>4e)V6;_(x&ViUVxma+i25qea;d-oK7ouuDsB^ab{
zu1qjQ%`n56VtxBE#0qAzb7lph`Eb-}TYpXB!H-}3Ykqyp`otprp<BVE0iW(NxIg}T
zHQz%!Kt^1I=QqGS(r32Y=&DuF_0#yaLgW`I>7{VEuW*^IR2n$Fb99*nAtqT&oOFIf
z@w*6>YvOGw@Ja?Pp1=whZqydzx@9X4n^2!n83C5{C?G@|E?&$?p*g68)kNvUTJ)I6
z1Q|(#UuP6p<iHaJ>j78GUxq11m-GSszc+)X{C2eo-?8ud9sB=3(D47v?`JAa{V(IF
zPZQ_0AY*9M97>Jf<<kmA6!?J&2x7=_q{>o%#O_%Wq}8>YM=q0|tGY+hlXcpE=Z4Od
z`NT7Hu2hnvRoqOw@g1f=bv`+nba{GwA$Ak0INlqI1k<9!x_!sL()h?hEWoWrdU3w`
zZ%%)VR+Bc@_v!C#koM1p-3v_^L6)_Ktj4HE>aUh%2XZE@JFMOn)J~c`_7VWNb9c-N
z2b|SZMR4Z@E7j&q&9(6H3yjEu6HV7{2!1t0lgizD;mZ9$r(r7W5G$ky@w(T_dFnOD
z*p#+z$@pKE+>o@%eT(2-p_C}wbQ5s(%Sn_{$HDN@MB+Ev?t@3dPy`%TZ!z}AThZSu
zN<1i$s<Qod+!u+1TpqzHMAR;(P|C33h|NdU1+@toT{?QhAJAzzUDj;ch>iJhXFdjV
zP*y|V<`V8t=h#XTRUR~5`c`Z9^-`*BZf?WAehGdg)E2Je)hqFa!k{V(u+(hTf^Yq&
zoruUh2(^3pe)2{bvt4&4Y9CY3js<F>)PUHtd4rVG57}uFJL)D(JfSIo^{P=7liFXG
zq5yqgof0V8paQcP!gy+;^pp-DA5pj=gbMN0eW=-eY+N8~y+G>t+x}oa!5r>tW$xhI
zPQSv=pi;~653Gvf6~*JcQ%t1xOrH2l3Zy@8AoJ+wz@daW@m7?%LXkr!bw9GY@ns3e
zSfuWF_gkWnesv?s3I`@}NgE2xwgs&rj?k<VL?gG5MC{Nmj1vZX?3e8O$&f`#KcfCT
zD|dGfAH<9vQYUE_U}e#K2epdwK03De5{_327SI@sw~J+|<wi@;rZX!9Y2MH7_L7?E
z^an@e4GxaY9F(p#Ot=#L(YG%x=Gq!vNbxtM=IXzPyPmYSGU#`>H-FEy82=O8`+szN
ziHch`vvS`zNfap14!&#i9H@wF7}yIPm=UB%(o(}F{wsZ(wA0nJ2aD^@B41>>o-_U6
zUqD~vdo48S8~FTb^+%#zcbQiiYoDKYcj&$#^;Smmb+Ljp(L=1Kt_J!;0s%1|JK}Wi
z;={~oL!foo5n8=}rs6MmUW~R&;SIJO3TL4Ky?kh+b2rT9B1Jl4<n8E`j>>#Uh-Bec
z`Hsp<==#UEW6pGPhNk8H!!DUQR~#F9jEMI6T*OWfN^Ze&X(4nV$wa8QUJ>oTkruH#
zm~O<`J7Wxseo@FqaZMl#Y(mrFW9AHM9Kb|XBMqaZ2a)DvJgYipkDD_VUF_PKd~dT7
z#02}bBfPn9a!X!O#83=lbJSK#E}K&yx-HI#T6ua)6o0{|={*HFusCkHzs|Fn&|C3H
zBck1cmfcWVUN&i>X$YU^Sn6k2H;r3zuXbJFz)r5~3$d$tUj(l1?o={MM){kjgqXRO
zc5R*#{;V7AQh|G|)jLM@wGAK&rm2~@{Pewv#06pHbKn#wL0P6F1!^qw9g&cW3Z=9}
zj)POhOlwsh@eF=>z?#sIs*C-Nl(yU!#DaiaxhEs#iJqQ8w%(?+6lU02MYSeDkr!B-
zPjMv+on6OLXgGnAtl(ao>|X2Y8*Hb}GRW5}-IzXnoo-d0!m4Vy$GS!XOLy>3_+UGs
z2D|YcQx@M#M|}TDOetGi{9lGo9m-=0-^+nKE^*?$^uHkxZh}I{#UTQd;X!L+W@jm(
zDg@N4+lUqI92o_rNk{3P>1gxAL=&O;x)ZT=q1mk0kLlE$WeWuY<p^lk9k<q+rcOGG
zY)G$sy1c;kpqg0vV-}_XHLMzubt5&Y+X3Q{3Wa&iSOY9S8qUS1LUsYa--u3<U|kYH
zfL}q@Sl3A;lg32U^*mSX!dr5wpp#<9G)=5WC=&Cv)mW|a!muj?rXU0JHBrQ<`Qqt}
zCgYqLnoe5^we#~H-+!p+9n;rMDHiXMi5YFyOWW{wi{Tnu%1mqAov`>_$0`0jY-Kkt
zP*|m3AF}Ubd=`<>(Xg0har*_@x2YH}bn0Wk*OZz3*e5;Zc;2<o%?~}~ck-Gk+}B1o
z7H(amz-SpgFI})ialV|e<4!f0)HG}_n?GAIDeiC%vdRTJZ<WeGYT+p)vyz_FBWtO$
zz2K9gb9XN8(>uBdnl8?&XjupbkOeNZsNh6pvsq_ydmJI+*z**{<vBKXP!vVq<!p_e
z&dE_6Lim}RGRF|DDL<W_^>I{0K)-;p1~k8cpJXL$^t!-`E}=*4G^-E8>H!LjTPxSx
zcF<xkt%17w#Nu8CTO5_pkM9Mxb}Wf0;14cP1{yv5#7uIFNY2eGhq{_*hEBSbjQA$U
znAZkjT-yd=I_R8ZJ?H07yh5z>+cS`ommfKMhNSbas^<U&=a>@YbTpH1*RFrBuATUR
zt{oFWSk^$xU&kbFQ;MCX22RAN5F6eq9UfR$ut`Jw--p2YX)A*J69m^!oYfj2y7<PW
z+v>NYcH6&r+0~_sH^c^nzeN1AU4Ga7=FlR{S|Mm~MpzY0$Z+p2W(a={b-pR9EO1Rs
zB%KY|@wLcAA@)KXi!d2_Bxr<pdgvLNQgZmCJz&c*%>khDn`DT1=Dec}V!okd{$+wK
z4E{n8R*xKyci1(CnNdhf$Dp2(Jpof0-0%-38X=Dd9PQgT+w%Lshx9+loPS~MOm%ZT
zt%2B2iL_KU_ita<m0BJ>%N>xjB!<T714UjSOi9+HnppU8HTO6Xys3~>#71_3=3c}o
zgeW~^U_ZTJQ2!PqXulQd=3b=XOQhwATK$y(9$#1jOQ4}4?~l#&nek)H(04f(Sr=s|
zWv7Lu1=%WGk4FSw^;;!8&YPM)pQ<cfOEqZOX3qg|bi6Y+6raTouS~FCMLV)D&#=5p
zIeXB!ZqEvR`rjuFN1ix6GoI%|3=23*fhels&m^Kl7$Xb)d4}z-AG?ZOEDO)5!dX$;
zxo;%86BwnKIVK{n#wbp)z+DlG`Eo;!p1qmI5u}Dr3ERkBC^gA;rI=Ohq{XCvK{N9A
zv&z##C1k^|5<I!-;+MCJN#mX7#cD{4PE)89nNv$gm~ronTcb2Mq~#~4^M!^C-1#M(
z=1UT>DCY9DhU`hMty1@sq1=Tj7bFsOOBZOFlpR`W>-J$-(kezWJj;`?x-v>ev{*8V
z8p|KXJPV$HyQr1A(9LVrM47<GkZ=yfPF+Dq%S|w=MijcJ6`g68YLr!qXVkqpG~JGH
zv`>u-XpcrIyO`yWvx1pVYc&?154aneRpLqg<bQ>x)EMvRaa#|9?Wwqs2+W8n5~79G
z(}iCiLk;?enn}ew`HzhG+tu+Ru@T+K5juvZN)wY;x6HjvqD!&!)$$;1VAh~7fg0K|
zEha#aN=Yv|3^~YFH}cc38ovVb%L|g@9W6fo(JtT6$fa?zf@Ct88e}m?i)b*Jgc{fl
zExfdvw-BYDmH6>(4QMt#p0;FUIQqkhD}aH?a7)_%JtA~soqj{ppP_82yi9kaxuK>~
ze_)Zt>1?q=ZH*kF{1iq9sr*tVuy=u>Zev}!gEZx@O6-fjyu9X00gpIl-fS_pzjpqJ
z1yqBmf9NF!jaF<+YxgH6oXBdK)sH(>VZ)1siyA$P<#KDt;8NT*l_0{xit~5j1P)FN
zI8hhYK<Ctos5a2iSB6E;K{q?&ab{FphO?zxIqqf}%h}LQupBY+8nGBng;9rl>hQ)i
z37^aP<qpGiOwA$)u%NPBnx5-dJ?eaKuJ|k=(U7<z&!E=Ex{~rH$w*MsBugwy8}mlc
z@Cp`bIFkJ)nD=}RT7wXAD(8ljk?9q}6wNph<*7S`FnAlTK859*a=phG5gBZhql8^1
zqKXJBJSm<4{Im$>13B~u65?sg+_@2Kr^iWHN=U;EDSZ@2W2!5ALhGNWXnFBY%7W?1
z=HI9JzQ-pLKZDYTv<0-lt|6c-RwhxZ)mU2Os{bsX_i^@*fKUj8*aDO5pks=qn3Dv6
zwggpKLuyRCTVPwmw1r}B#AS}?X7b837UlXwp~E2|PJw2SGVueL7){Y&z!jL!XN=0i
zU^Eig`S2`{+<wl==2w=5|F9xM-02N5`F*r(R=yBH;xL${JnA$s&UywB#{_0RS^+hX
z*x9l+ftTA)(qqfU_K+4UB=w=A|0J9m!!ePjqBV0F7$&R=n`+yR@2tk2NwHhL{|&cH
z$fXb3^<M+ME~B<-d*&_qT19T_sRLwwvu;hmu)Vt+WHH@$?DQ6m2&aLvtcbK<MgTBu
zp(SQPX~F@0CgG?+)(jWG3Oc!+oEMIunjIDK0w|m6;WzWCMpr#dq3I<GeN7c@71Fqq
zf&ADs4Jz}_R;T_`?S9~(wB(9e>gU$68aRdWx?BZ{sU_f=8sn~>s~M?GU~`fH5kCc;
z8ICp+INM3(3{#k32RZdv6b9MQYdZXNuk<tNnM*Zj3!<Rmy`SQx3idGTVVW@OzSywq
z<sw@>7ed8;G?S2nT+NZBG=Tar^KFl2SvhW$bGW#kdWL-I)s_IqVnCDDM9fm8g;P;8
z7t4yZn3^*NQfx7SwmkzP$=fwdC}bafQSEF@pd&P8@H#`swGy_rz;Z?Ty5mkS%>m#%
zp_!m9e<()sfKiY(nF<1zBz&&`ZlJf6QLvLhl`_``%RW&{+O>Xhp;lwSsyRqGf=RWd
zpftiR`={2(siiPAS|p}@q=NhVc0ELprt%=fMXO3B)4ryC2LT(o=sLM7hJC!}T1@)E
zA3^J$3&1*M6Xq>03FX`R&w*NkrZE?FwU+Muut;>qNhj@bX17ZJxnOlPSZ=Zeiz~T_
zOu#yc3t6ONHB;?|r4w+pI)~KGN;HOGC)txxiUN8#m<I|Y6ES5NY<qwYy-|}EoiBmM
zzK&og-IJMpwjbL8IAA?{APd++5KnIBFcmKwZzO~h`v>exj+W(cz%9a4sx|IRG=}ia
zuEBuba3AHsV2feqw-3MvuL`I+2|`Ud4~7ZkN=JZ;L20|Oxna5vx1qbIh#k2O4$RQF
zo`tL()zxaqibg^GbB+BS5#U{@K;WWQj~GcB1zb}zJkPwH|5hZ9iH2308!>_;%msji
zJHSL~s)YHBR=Koa1mLEOHos*`gp=<n)R&-+66xg`pqUXP#0Zm{sf^MKJnR>s8KA-C
zu0aE+W!#iJ*0xqKm3A`fUGy#O+X+5W36myS>Uh2!R*s$aCU^`K&KKLCCDkejX2p=5
z%o7-fl03x`gaSNyr?3_JLv?2RLS3F*8ub>Jd@^Cc17)v8vYEK4aqo?OS@W9mt%ITJ
z9=S2%R8M){CugT@k~~0x`}Vl!svYqX=E)c_oU6o}#Hb^%G1l3BudxA{F*tbjG<UG_
z<Vo0tRnGPd)K5|7jd0}T-wQdZZG$TREj=4*<4bR;^};Y7orbT=`ITOz*@g<Y=x;1T
zesE`pc}OxPz?01Be<R|c+WT7B<35I5;~ham;Y6hY?F{PCSdwf@9qXBZn%JBWfW6@H
zxqGR{o+TW^4T&@lZ$GVT-8s0{R3MmXsMGGDxgNTZ8|S`+<Qxcf#Ei_^nsS}Yt*PB*
z5#h+HDAU+!GmbOJ#d(J8|6Rt?+U5gR1>;W_>=xV73pKY53v%>I)@D36I_@&p$h|Aw
zonQS`07z_F#@T-%@-Tb|)7;;anoD_WH>9ewFy(ZcEOM$#Y)8>qi7rCnsH9GO-_7zF
zu*C87{Df1P4TEOsnzZ@H%&lvV(3V@;Q!%+OYRp`g05PjY^gL$^$-t<lOaPfbWQ29U
ziQLZAEqJX}(BXW*YUz0#v0~iKqbJ35-`aw1m+YA~k)TQVyq!wOKDE%}gHJ%Woa=@J
z_G3Z9VfhbWs>0Y>H*CDDs?FZly*oZ&dxvsxaUWF!{em4{A>n@vpXg$dwvt@_rgmHF
z-MER`ABa8R-t_H*kv>}CzOpz;!>p^^9ztHMsHL|SRnS<-y5Z*r(_}c4=fXF`l<Xkw
zEy6TsVDdDoV_TNG(<WvkK8TcvFZ^8LA2wsz%Sl;DbLS}my*l(?>^-i}>e7v!qs_jv
zqvWhX^F=2sDNWA9c@P0?lUlr6ecrTKM%pNQ^?*Lq?p-0~?_j50xV%^(+H>sMul#Tw
zeciF*1=?a7cI(}352%>LO96pD+?9!fNyl^9v3^v&Y4L)mNGK0FN43&Xf8jUlxW1Bw
zyiu2;qW-aGNhs=zbuoxnxiwZ3{PFZM#Kw)9H@(hgX23h(`Wm~m4&TvoZoYp{plb^>
z_#?vXcxd>r7K+1HKJvhed>gtK`TAbJUazUWQY6T~t2af%#<+Veyr%7-#<!`u7=6z<
z@9(d@bkwuJK`;AXMIasQ=4>*A#@&*;@g58{i|E%6yC_InGXCOd{L0;$)z#?n7M`re
zh!kO{6=>7I?*}czyF7_frt#)s1CFJ_XE&VrDA?Dp3XbvF{qsEJgb&OLSNz_5g?HpK
z9)8rsr4JN!Af3G9!#Qn(6zaUDqLN(g2g8*M)Djap?WMK9NKlkC)E2|-g|#-rp%!Gz
zAHd%`iq|81efi93m3yTBw3g0j#;Yb2X{mhRAI?&KDmbGqou(2xiRNb^sV}%%Wu0?<
z?($L>(#BO*)^)<q1<y(%o-D>rSgyNRni$i`R4v;GhlCZ8$@e^ROX(p=2_v6Y!%^As
zu022)fHdv_-~Yu_H6WVPLpHQx!W%^6j)cBhS`O3QBW#x(eX54d&I22op(N59b*&$v
zFiSRY6r<ogkg`lB4Kzk}79-1@4WexUMjw+78PVxKkpJ}l$#uYSKRni~wAs(LFQ`S~
z)6H^=SsBl_=gtAZcCb%LqI~={V0L=Y2XD)_)<%!{eY=Ch6CuBtAu<8uk=VNIh~<**
z4}OE-7;4DiEfmH5r(r_l!C{NrR4GiA;)KB7^Z5GJe5Gi4jN{kozWVvza9J%ysu#}1
zV05Y-@;(u_e>Oc^(dgSV1<S;M;fP~XvG?H2UWU4_zajZjOwrCC*tclCC;-DiooNK0
zKT=dwZdL_ATeNsapPF;r>>a7-5C;(5S5MvKcM2Jm-LD9TGqDpP097%52V+0>Xqq!!
zq4e3vj53SE6i8J`XcQB|MZPP8j;PAOnpGnllH6#Ku~vS42xP*Nz@~y%db7Xi8s09P
z1)e%8ys6&M8D=Dt6&t`iKG_4X=!kgRQoh%Z`dc&mlOUqXk-k`jKv9@(a^2-Upw>?<
zt5*^DV~6Zedbec4NVl(<J^HBK@2E4@wD3MtdV(xxnq%Vbc}WuHo}5QftrZiThWddC
z`SqHbMkXtVXKo79381*cF)4R%emsAj0XSNL+PDEL#FejmgEu5hh1&sJBWank<?5||
zFa70^J_fIcXV;YE%H5TE-LegY#;Q{2+X5ec9PUkBc$GBoPD$f3kMxcc{GCuyqMnT@
zoFpa`18N8o#cD|?wHX2`2uRGD5<2)(v9&spIF?JOuSy<x-3h*1uo<tmXh9c!`+f(x
zJ!3;pKi`8)fAw>$2T{&b)zA@b#dUyd>`2JC0=xa_fIm8<d`*pjaOx{pV5Pe7q{O1W
zAT5c_%T(D{?pE<afMwAc3^p*w8J|rwP_Jm84rW1XYj_z!4LWkm8d^QnYM3TAJ00M}
zVy+@IpL$^~2r+Bvv<<LIU<+tp*%PbBjNTe&Z{H48g3%crRya-=a)qRylf)#ctNvw?
zlFff;8hFwWk7>{5u<t#%K`^*tgQ@!!adiD7YiKBgV_c5|kR2YSiy*0+0NhSocT-E<
zLsbh?CBfxQ!F7|i4p=%MX(y=Km3GT?bty_*CZh1*xF8v|8xjqfoni!cQE1EFLyy>m
zr-!ApXZhC8@=vC2WyxO|!@0Km)h8ep*`^he92$@YwP>VcdoS5OC^s38e#7RPsg4j+
zbVGG}WRSET&ZfrcR(x~k8n1rTP%CnfUNKUonD$P?FtNFF#cn!wEIab-;jU=B1dHK@
z(;(yAQJ`O$sMn>h;pf^8{JISW%d+@v6@CnXh9n5TXGC}?FI9i-D0OMaIg&mAg=0Kn
zNJ7oz5*ReJukD55fU<geid?Ih<u$lD<ZaBChRKMH%^g0f>sMuaP+H4tDN&V9zfqF@
zr=#ecUk9wu{0;!+gl;3Bw=Vn^)z$ahVhhw)io!na&9}LmWurLb0zubxK=UEnU*{5P
z+SP}&*(iBKSO4{alBHaY^)5Q=mZ+2OwIooJ7*Q5XJ+2|q`9#f?6myq!&oz?klihLq
z4C)$XP!BNS0G_Z1&TM>?Jk{S~{F3n83ioli=IO6f%wkvCl(RFFw~j0tb{Gv<R+ryT
zRX%I@y>XTx>*sB0McY0s&SNvj4+^h`9nJ_wM>F!Uc>X}9PifQekn0sKI2SAJP!a4h
z5cyGTuCj3ZBM^&{dRelIlT^9zcfaAuL5Y~bl!ppSf`wZbK$z#6U~rdclk``e+!qhe
z6Qspo*%<)eu6?C;Bp<^VuW6JI|Ncvyn+LlSl;Mp22Bl7ARQ0Xc24%29(<s4_N=u??
z$t{gru-VXd6Kxbxv5Bt?Z$yc1w+D)_3NsUN5!&<3)ieB&Nu@~@oJ<S*(@c3&Pewi0
z%9!QRq|vABdor|x%wu0~=1Z5hOOpQi`0!iyJhFLc128BQmq$$|x}WJU?iN}7Kl;!3
z&fH)Q*P121qPNW?v_ooy8Sm4CFA(z37NUr<`Xy>ZrdsIPw&-=yHQ7_Vle|5h>AST0
zUGX2Zk34vp?U~IHT|;$U86T+UUHl_NE4m|}>E~6q``7hccCaT^#y+?wD##Q%HwPd8
zV3x4L4|qqu`B<QDcIkg$mi~*?H~RNY_Wx4D`}Y)A)!YT!6xCZ~ZTPl7{FKBwUP;<e
zGwA>$4(LXqDJngNy-{&@aFBvVsywt@X^}iH7P%>bR?ciC$I^U-4Foa<B~H&?t+Qu`
zYwLM1`)@`@%8{2TvE*VigXq!t`qQLW_S5DOEz|2z2j3@V_m6j4f9Dj+Tf}F*MS{?9
z%5n(*$y|Ke#xMg=rzn#aJmhAj==f%8S}Mvc)f)j&X?h~FezLP0LfVpNB_5sLG5z-3
z-35pyFHV0nPmuu&M8{s3y}I4c7J41@C$_+Wqk&1f`MvOF;v`)pAYGN4MEXuFe)LY2
z=&C(w{X1B@m%`@ul3h(Gtch>`YKI^qDyGK7k%E%c_P=yzAi`YnxGA%DeNd++j3*h^
z=rn>oBd0|~lZ<6YvmkKY*ZJlJ;Im0tqgWu&E92eqt;+NYdxx`eS(4Hw_Jb5|yVvBg
z*tbdY^!AN;luEyN4VRhS@-_DC{({ziH{&Z}iGElSV~qvT>L<HMHzj7J-Kpa!p!c29
zr8L!oM5$VXc*@gwRrKuo*h${B7BEkfln@c=6oRf(*M;GdM=)m;H&XnLvs%-2b*+hr
z7OLt;wv$AG{1d5A_3iuPkRtq^Z3(lga~pLocX)JXz8xtPuFH$tb1E#7^Np@GfB#l+
z&L?_OtoPwP5<R)d%5@nIE}~}{W{Q)eZmHbhg7xXREu-ZQD=)K37HlG2J>-8G%+yEL
zX#MFOhj{InyKG=mvW-<1B@c-}x$vA(nU?>S>0*eN#!SLzQ)Ex7fvQ)S4D<8|I#N$3
zT5Ei`Z?cxBODHX8(Xp73v`IsAYC@9b;t}z0wxVuQSY1J^GRwDPN@qbM-ZF48T$GZ<
z8WU+;Pqo?{ghI-KZ-i*ydXu`Ep0Xw^McH_KE9J0S7G;x8Fe`DVG?j3Pv=0YzJ}yZR
z%2=oqHiUjvuk0~Ca>Kol4CFi0_xQT~;_F?=u+!kIDl-9g`#ZNZ9HCy17Ga1v^Jv9#
z{T4Kb1-AzUxq*MutfOWWZgD*HnFfyYg0&e9f(5tZ>krPF6{VikNeHoc{linPPt#Si
z&*g>(c54V8rT_AX!J&bNm-!umPvOR}vDai#`CX___J#=zeB*{4<&2WpaDncZsOkp*
zsg<%@@rbrMkR_ux9?LsQxzoBa1s%$BBn6<mos5IdgH=dhp$~J!{0^n6tTjg;U5Diz
zxn*0gDh2*1rZJV&&gt&fW9znj@{DqJLV4eBUx+J-d4%cpr)wD1@Vx?X)?B*tGi&A=
zu5vL>vk#{&&zUwcfzeCBJUwFYSF$08qDsB;gWQN*g!p8pxjofWbqNSZOEKOaTx@+*
zwdt5*Q47@EOZ~EZL9s?1o?A%9TJT=Ob_13yyugvPg*e&ZU(r6^k4=2+D-@n=Hv5vu
zSXG|hM(>h9^zn=eQ=$6`JO&70&2|%V5Lsx>)(%#;pcOfu>*nk_3HB_BNaH$`jM<^S
zcSftDU<V-inKL>1?nL;jy)+sfonQN}(}gUW?d_ikr*3=^{G)=tjBtEPe>TO|0ddVB
zTklrSHiW+!#26frPXQQ(YN8DG$PZo?(<o9TOVxrss2Tawf48g5GCBiDSDE3#UTp|k
z0pKVwU|jx{+EQY()yCur&pg1mIZ@qM8XqZ|+FwhjuUu?KF@3ju{(}}|suk~@HhJ@t
zQ#Y9~FZ`d#%R!jufJh!!RQMg1{%cNgteq&qcX<80AJGOV1RA*AhOQap$U?Y%h(E_|
z8KTI5C~yZDX7BVzv96*H*$a>po(QUCCf_OJC`pw*uey00%gmH!`WJkrKXj2!<hVw7
zGRWHF)G~HffGg-$(O!{Pk3Jwp$~FflQ(t@$&I&M44Aeq=eh)c85(&68I*-`m6uoK%
zdl?LKwDh&L2fOllfoB@M_mp%O$0z3aZY$k-iHXr}^=#M>#6?`T25mTu9OJp2L8z3!
z=arrL$ZqxuE{%yV)14Kd>k}j7pxZ6#$Dz8$@WV5p8kTqN<-7W)Q7Gt2{KoOPK_tZ|
zf2WG~O5@{qPI+W<4f_;reuFVdO^5`ADC1!JQE|N`s3cq@(0WB!n0uh@*c{=LAd;~}
zyGK@hbF-Oo+!nN)@i*O(`@FA#u?o=~e{`4O#5}z&=UkU*50fOrzi11D^&FOqe>wii
z?*k+2|EcUs;Gx{!@KBT~>PAwLrIDT7Th=Utu?~?np@t^gFs?zgX=D${RwOY^WGh-+
z+#4$066I<rp}NtcF5!lhZvD^1^l^-7{>Sh8eYW#FXWp~S`<*%O^ZuItL1Tyqt8#tZ
zY120E;^VG`!lZn&3sPd$RkdHpU#|w+bYV)pJC|SH9g%|5IkxVTQcBA4CL0}$&}ef@
zW^Vtj%M;;_1xxP9x#ex17&4N*{ksO*_4O}xYu(p*JkL#yr}@7b)t5X?%CY<+s5_MJ
zuiqt+N_;A(_)%lumoyRFixWa-M7qK_9s6<1X?JDa9fP!+_6u~~M$5L=ipB=7(j#f<
zZ<k8E`Rwou;l+s_T|L^56YmSu4%^`QFTOq==C$v}z48r)V)i|8HET~j$njp}R%Tx6
z6z1$DcJ|+-rK;8O+M4D2OI&h9Z=KBOt2kjcN}lMJQdr(sa_4xJa;t^b;+K^1J9dTU
z7E3oumY>34J%=bs549%~_mA(|={uZNs_0?o7;-LBP(ZRnkd{-^|2|=4vUTmtByHL8
zEph`(LSEzQj68a+`d$V<45J7cyv^#|^|%fD#si1Nx!4NW*`l*{->HEWNh6-|g>-=r
zXmQ|-i}Ku$ndUeHQ^&ieT!Lf}vf6GaqW9$DJ2NWrqwPY%%4nip$@vK$nRp*_C-v<|
zuKz~ZyN&<%!NS26&x?jhy+@awJipMQ-8(X4#Ae5??U<1QMt1l9R=w9fAnEF}NYu$2
z>6<Remz*?TALWUbsLm^V8JpOI8Qrv^I{Uf`UiQ}=bwlT?m7XzzYl?d75*MLY>}Vkc
zIb*A?G*z8^IvibmBKn_u^5&T_1oey0gZS2~obf(#xk=erZGTEdQnt3DMGM+0oPwss
zj5zXD;(oWhB_T@~Ig#9@v)AKtXu3>Inmgf@A|-lD-1U>cNyl3h?ADD9)GG4}zUGPk
zZzaXe!~Kf?<~@$G?Uql3t8jy9{2!doq4=J}j9ktTxss{p6!9Ud<L0{JvuM8zT}59}
zy?q<PQ`(o7I;|2vYuA@mlwTC2VWpG0`?f+&<cL+^MokC9tZR;`s@;M`H_iw&h5ar5
zCVhp4@v?m@Lrzz?2Z>jyDERlA*xZ!=Q)KDs5O)phz>Vq3BNGoM(H|=1*Q4$^2fTZw
z(%nq1P|5Rt81}SYJpEEzMPl5VJsV5&4e)ZWKDyo<HF@#^TWD2$%#hJqrPwhn_I`~h
zrl9TX+98FE%rR>Z>1EwpkHx-AQVQc8%JMz;{H~p{=FXV>jIxvm4X*qv52e?Y-f%DJ
zxEA165GikEASQ^fH6K#d!Tpu2HP{sFs%E=e$gYd$aj$+xue6N+Wc(rAz~wUsk2`(b
z8Kvmyz%bKQxpP}~baG-rwYcYCv<u@FIev&XxIE!%Ztbxbt-Y%ng;He-ByR+Lk{8!A
z+z_?%ukrIgzuh|^o09LLIM$E(jD2g{U~B*3O~s^?p1O(7xg6;i3B@l87#4eO$$oiS
znHnuA9vGDnw~@NpSw1YYZjoo+HWTzC%$-J_E}3wK-(9hvP0Ct}242U82Wv+8>kHOi
zlkR<=>ZBTU*8RF_d#Bl@zZsRIhx<%~Z@Z<pYsoy05ZE@HkOT*OyM_H}bO0QHT>=ik
z>adw3!DK(8R|q$vy{FTxw%#xliD~6qXmY^7_9kthVPTF~Xy1CfBqbU~?1QmxmU=+k
z(ggxvEuA;0e&+ci-zQR{-f7aO{O(Pz_OsEjLh_K>MbvoZ4nxtk5u{g@nPv)cgW_R}
z9}EA4K4@z0?7ue}Z(o~R(X&FjejUI2g~08PH1E4w>9o{)S(?1>Z0XMvTb|;&EuyOE
z<Uf?|k(eh=|5A0k%+DyPOynyZIUEys#os=5y;pE3-eqjU+vfn|kx^`PvS0aFgK5k7
z_Ws8b<tF{v{r8Xc%Ra(S^vGriKjcqYF_IFKBCZ&>GvWNpYX)Nv<8|a^;<iw`qCF!m
zJCer*R!Lka?@SE&ch5f_idG-0J0#H0t`-WqI6fU-<8)0T>1>bh#&znEcl-r!T#pn=
z4$?Yudha6F%4b>*8@=BdtXXY4N+`U4Dmx$}>HeVJk-QdTG@t!tVT#0(LeV0gvqyyw
z2sEp^9eY0N`u10Tm4n8No&A=)IeEC|gnmEXoNSzu!1<4R<%-9kY_8~5Ej?zReg<ow
z_S`;&)5*<v9D*b@Pp375jJ+bUt(~O-SF0ZSb>Mn78wuMs#;i&eUA0Zk_RXQ3b&TT}
z;SCI<r<Tr9OC%fOR>=7-FUB@*&;8|n>(_g^HGf3@QODE3LpmX~ELnymQm{Sx9xrKS
zK29p~?v@R$0=v6Dr5aW>-!{+h@?Q58|Kz8{{W`%J+lDAdb&M5VHrX_m<yIVSusgG)
z`wE|gu8*tGFN+9LZP$c*%%oTIyT7Y#Uu$+WtVYvbCQ+Do^ID=OntZUS+AP2A;pDLp
zQ|q{WRobah5mlF}4!P`?SD*ypcWDw;D;8QgDGN?GxP&*+636N@_L%TF<fCxBbhCW%
zM%&xYBGf(}%YX|$hc1=>DY;1-JLnf)ezmPau$)1;=`-FU=-r-83tX=C`S#}GZufju
zQ>sXNT0Ny=k@nc%cFnvA_i4SC)?_ORXHq8B4D%el1uPX`c~uG#S1M7C+*MMqLw78E
zhY2dI8@+N^qrM<pEkuRyo-$3h*azHIX|CuJeg6U1uTYr#f1L)jm7pty!Nt_EpVqGj
zXVf`s*T3XSv0ipIVh~5QdLCYp1wi6^@nTR33ob<X9tfmOm*uFzZ-27JIbc;RY)sW`
zt!7sXsMOap1pmxDHwCgCeJ^w>I1+;TUd<Y7V_}N3cTlx9g%Oxl!1(@Fz@fhB!X!Qg
zzGte&a;gDDb7s`2pRO@2w9ly`vje6Ilj<$_c7{OhbUJ}K%$c%`!Wu9kSA#`Xja47q
z=~M~uTQ&ejWO4`vDE9|h6MO?f*d<f1pCzFOx6YSGxl|H(I2h0=%!FyILZ_|=bX<Z@
zIXy76kKLDpG=TR2T<cl5;7+F|gWhKYZ346vNMyq9`PzDJfsBPN=&eq$MKd*arcmkB
zATW410h$o^5JE{IECTR`<Q+_;Q1BqjI5Ciw%m8kGTdmE2GB82|h<4gEtNm!8e#?mt
z;D<+GLqmLk>a(vGqGSRyU{Fnm`aqrr7bz4<dKN=G5TxM1z~0~yR6MN!z}y8FQ<9oh
z=+qK`&H;`)dq`^^SX>2c5xsOO-~oZpkzorD1g}Y<6rk&3>PsSGy}W?MtqFky@A(X#
zIuNZK0cK?^=;PUAu>j0#HtjbHCV*6?jzA&OoE$*Jlga*}LF`SF?WLhv1O|zqC<>*>
zYB;#lsYKx0&kH@BFpW8n*yDcc6?;_zaJs<-jPSkCsSX-!aV=P5kUgF@Nu<{a%#K*F
z134Q{9|YX7X(v$62_cY3^G%t~rD>Q0z@)1|zs)vjJ6Jq9;7#Ki`w+eS**En?7;n&7
zu==V3T&eFboN3ZiMx3D8qYc;VjFUk_H-WWCau(VFXSQf~viH0L$<p$oTrv;PAp(HR
zxGYwoQ{`RRg4u%PEm#VL7~lyq<6`h+F9OK~tPR1wo}q|fJ3M>gwD$<h61N|-us1_A
zl+PEzjP+uR3A<ynvirg<hO=WL=vh8&(Wl2haO~h*nJ_e*eGq{@?#~H&b}m8~6;8Z^
zpbmt5cVjrE;ei2}0ANsjD@<R3Rp?ZF=p4ZLJsD20%)C2Oh4~(z31DYTGiF#ITp7;7
z!>UfFHqNcgN`x}M+YQ6RnN<+@t>JUp#)9YOkqst-Ga?{FsDpEeX0(5v{0J~SEbWiL
zXC2}M4?UH@u&|;%0y`eb33ldo4~z-x8zY!oVmV=c+f$m?RfDC35mdQ2E>Pze7KWP-
z>!Bh<&57I+O_^s}9Tg^k)h7{xx@0a0IA~GAOt2yy!X%Q$1rt~LbTB6@Du!_0%HV>N
zlf)QI1&gvERKwso23mJ!Ou6ZS#zCS5W`gxE5T>C#E|{i<1D35C222I33?Njaz`On7
zi<+VWFP6D{e-{yiN#M|Jgk<44u1TiMI78S5W`Sdb5f+{zu34s{CfWN7a3Cf^@L%!&
zN$?|!!9j2c)j$~+R6n#891w-z8(!oBpL2K=+%a$r2|~8-(vQj5_XT`<0Ksf;oP+tz
z9CObS!0m)Tgg`K#xBM8B(|Z)Wb&DYL{WTYv`;A=q6~Nnx2+!lTIXtj8J7dZE!P_{z
z#f8w6F}^!?^KE#+ZDv+xd5O&3EmomZzsv?>E-~ygGum45fk!SBN&|eo1rKw^?aZJ4
E2O(~oYXATM

literal 0
HcmV?d00001

diff --git a/examples/llama.android/gradle/wrapper/gradle-wrapper.properties b/examples/llama.android/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 000000000..a3958c140
--- /dev/null
+++ b/examples/llama.android/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,6 @@
+#Thu Dec 21 14:31:09 AEDT 2023
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
diff --git a/examples/llama.android/gradlew b/examples/llama.android/gradlew
new file mode 100755
index 000000000..4f906e0c8
--- /dev/null
+++ b/examples/llama.android/gradlew
@@ -0,0 +1,185 @@
+#!/usr/bin/env sh
+
+#
+# Copyright 2015 the original author or authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+##############################################################################
+##
+##  Gradle start up script for UN*X
+##
+##############################################################################
+
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
+
+APP_NAME="Gradle"
+APP_BASE_NAME=`basename "$0"`
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD="maximum"
+
+warn () {
+    echo "$*"
+}
+
+die () {
+    echo
+    echo "$*"
+    echo
+    exit 1
+}
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+nonstop=false
+case "`uname`" in
+  CYGWIN* )
+    cygwin=true
+    ;;
+  Darwin* )
+    darwin=true
+    ;;
+  MINGW* )
+    msys=true
+    ;;
+  NONSTOP* )
+    nonstop=true
+    ;;
+esac
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+        # IBM's JDK on AIX uses strange locations for the executables
+        JAVACMD="$JAVA_HOME/jre/sh/java"
+    else
+        JAVACMD="$JAVA_HOME/bin/java"
+    fi
+    if [ ! -x "$JAVACMD" ] ; then
+        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+    fi
+else
+    JAVACMD="java"
+    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
+    MAX_FD_LIMIT=`ulimit -H -n`
+    if [ $? -eq 0 ] ; then
+        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
+            MAX_FD="$MAX_FD_LIMIT"
+        fi
+        ulimit -n $MAX_FD
+        if [ $? -ne 0 ] ; then
+            warn "Could not set maximum file descriptor limit: $MAX_FD"
+        fi
+    else
+        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
+    fi
+fi
+
+# For Darwin, add options to specify how the application appears in the dock
+if $darwin; then
+    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
+fi
+
+# For Cygwin or MSYS, switch paths to Windows format before running java
+if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
+    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
+    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+
+    JAVACMD=`cygpath --unix "$JAVACMD"`
+
+    # We build the pattern for arguments to be converted via cygpath
+    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
+    SEP=""
+    for dir in $ROOTDIRSRAW ; do
+        ROOTDIRS="$ROOTDIRS$SEP$dir"
+        SEP="|"
+    done
+    OURCYGPATTERN="(^($ROOTDIRS))"
+    # Add a user-defined pattern to the cygpath arguments
+    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
+        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
+    fi
+    # Now convert the arguments - kludge to limit ourselves to /bin/sh
+    i=0
+    for arg in "$@" ; do
+        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
+        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
+
+        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
+            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
+        else
+            eval `echo args$i`="\"$arg\""
+        fi
+        i=`expr $i + 1`
+    done
+    case $i in
+        0) set -- ;;
+        1) set -- "$args0" ;;
+        2) set -- "$args0" "$args1" ;;
+        3) set -- "$args0" "$args1" "$args2" ;;
+        4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+    esac
+fi
+
+# Escape application args
+save () {
+    for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
+    echo " "
+}
+APP_ARGS=`save "$@"`
+
+# Collect all arguments for the java command, following the shell quoting and substitution rules
+eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
+
+exec "$JAVACMD" "$@"
diff --git a/examples/llama.android/settings.gradle.kts b/examples/llama.android/settings.gradle.kts
new file mode 100644
index 000000000..2ba32c4fa
--- /dev/null
+++ b/examples/llama.android/settings.gradle.kts
@@ -0,0 +1,17 @@
+pluginManagement {
+    repositories {
+        google()
+        mavenCentral()
+        gradlePluginPortal()
+    }
+}
+dependencyResolutionManagement {
+    repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
+    repositories {
+        google()
+        mavenCentral()
+    }
+}
+
+rootProject.name = "LlamaAndroid"
+include(":app")

From 158f8c9e21302114bac3c646f80ea85b52ffa0bd Mon Sep 17 00:00:00 2001
From: Paul Tsochantaris <ptsochantaris@icloud.com>
Date: Tue, 16 Jan 2024 17:05:19 +0000
Subject: [PATCH 044/138] metal : localized logic in `ggml_metal_graph_compute`
 (#4924)

* Metal: Localized logic in `ggml_metal_graph_compute`, minor performance improvement

* Whitespace

* Collecting command buffer completions on single thread

* Whitespace

* Reduce diff noise
---
 ggml-metal.h |  1 -
 ggml-metal.m | 37 +++++++++++++++++--------------------
 2 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/ggml-metal.h b/ggml-metal.h
index 8b0bfc5f1..df83a1807 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -27,7 +27,6 @@
 
 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 64
-#define GGML_METAL_MAX_COMMAND_BUFFERS 32
 
 struct ggml_tensor;
 struct ggml_cgraph;
diff --git a/ggml-metal.m b/ggml-metal.m
index c21dc465a..a549e6713 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -170,9 +170,6 @@ struct ggml_metal_context {
     id<MTLCommandQueue> queue;
     id<MTLLibrary>      library;
 
-    id<MTLCommandBuffer>         command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS];
-    id<MTLComputeCommandEncoder> command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS];
-
     dispatch_queue_t d_queue;
 
     int n_buffers;
@@ -719,25 +716,25 @@ static bool ggml_metal_graph_compute(
     @autoreleasepool {
 
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
-
-    const int n_nodes  = gf->n_nodes;
     edesc.dispatchType = MTLDispatchTypeSerial;
 
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel
 
+    const int n_nodes  = gf->n_nodes;
     const int n_cb = ctx->n_cb;
+    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
 
-    for (int i = 0; i < n_cb; ++i) {
-        ctx->command_buffers[i] = [ctx->queue commandBuffer];
+    id<MTLCommandBuffer> command_buffer_builder[n_cb];
+    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+        id<MTLCommandBuffer> command_buffer  = [ctx->queue commandBufferWithUnretainedReferences];
+        command_buffer_builder[cb_idx] = command_buffer;
 
         // enqueue the command buffers in order to specify their execution order
-        [ctx->command_buffers[i] enqueue];
-
-        ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
+        [command_buffer enqueue];
     }
+    const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
 
-    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
     dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
         const int cb_idx = iter;
 
@@ -745,15 +742,13 @@ static bool ggml_metal_graph_compute(
         size_t offs_src1 = 0;
         size_t offs_dst  = 0;
 
-        id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
-        id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
+        id<MTLCommandBuffer> command_buffer  = command_buffers[cb_idx];
+        id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
 
         const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
         const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
 
-        for (int ind = node_start; ind < node_end; ++ind) {
-            const int i = ind;
-
+        for (int i = node_start; i < node_end; ++i) {
             if (i == -1) {
                 [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
                 continue;
@@ -2249,12 +2244,14 @@ static bool ggml_metal_graph_compute(
         [command_buffer commit];
     });
 
-    // check status of command buffers
+    // Wait for completion and check status of each command buffer
     // needed to detect if the device ran out-of-memory for example (#1881)
-    for (int i = 0; i < n_cb; i++) {
-        [ctx->command_buffers[i] waitUntilCompleted];
 
-        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
+    for (int i = 0; i < n_cb; ++i) {
+        id<MTLCommandBuffer> command_buffer = command_buffers[i];
+        [command_buffer waitUntilCompleted];
+
+        MTLCommandBufferStatus status = [command_buffer status];
         if (status != MTLCommandBufferStatusCompleted) {
             GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
             return false;

From c37b3474e61d609d43cccc3bde5d559e80e4f5d1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 16 Jan 2024 19:13:54 +0200
Subject: [PATCH 045/138] flake.lock: update flake-parts,
 flake-parts/nixpkgs-lib, and nixpkgs (#4920)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'flake-parts':
    'github:hercules-ci/flake-parts/34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5' (2023-12-01)
  → 'github:hercules-ci/flake-parts/07f6395285469419cf9d078f59b5b49993198c00' (2024-01-11)
• Updated input 'flake-parts/nixpkgs-lib':
    'github:NixOS/nixpkgs/e92039b55bcd58469325ded85d4f58dd5a4eaf58?dir=lib' (2023-11-29)
  → 'github:NixOS/nixpkgs/b0d36bd0a420ecee3bc916c91886caca87c894e9?dir=lib' (2023-12-30)
• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/cfc3698c31b1fb9cdcf10f36c9643460264d0ca8' (2023-12-27)
  → 'github:NixOS/nixpkgs/317484b1ead87b9c1b8ac5261a8d2dd748a0492d' (2024-01-08)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 flake.lock | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/flake.lock b/flake.lock
index 15a0a1a8e..cd532ef4f 100644
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
         "nixpkgs-lib": "nixpkgs-lib"
       },
       "locked": {
-        "lastModified": 1701473968,
-        "narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=",
+        "lastModified": 1704982712,
+        "narHash": "sha256-2Ptt+9h8dczgle2Oo6z5ni5rt/uLMG47UFTR1ry/wgg=",
         "owner": "hercules-ci",
         "repo": "flake-parts",
-        "rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5",
+        "rev": "07f6395285469419cf9d078f59b5b49993198c00",
         "type": "github"
       },
       "original": {
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1703637592,
-        "narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=",
+        "lastModified": 1705133751,
+        "narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8",
+        "rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
         "type": "github"
       },
       "original": {
@@ -37,11 +37,11 @@
     "nixpkgs-lib": {
       "locked": {
         "dir": "lib",
-        "lastModified": 1701253981,
-        "narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=",
+        "lastModified": 1703961334,
+        "narHash": "sha256-M1mV/Cq+pgjk0rt6VxoyyD+O8cOUiai8t9Q6Yyq4noY=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58",
+        "rev": "b0d36bd0a420ecee3bc916c91886caca87c894e9",
         "type": "github"
       },
       "original": {

From 959ef0c0df725c013c7f712eaa7790b8e38a8e20 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 16 Jan 2024 19:34:54 +0200
Subject: [PATCH 046/138] perplexity : fix kv cache handling for hellaswag
 (#4981)

ggml-ci
---
 examples/perplexity/perplexity.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 9a77beca6..b4fedf803 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -428,6 +428,7 @@ static std::vector<float> hellaswag_evaluate_tokens(
     for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
         size_t n_tokens = tokens.size() - i_chunk * n_batch;
         n_tokens = std::min(n_tokens, size_t(n_batch));
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
         if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return {};

From 4feb4b33eeb1756e46084a4db9230b279af1a480 Mon Sep 17 00:00:00 2001
From: Maximilian Winter <maximilian.winter.91@gmail.com>
Date: Tue, 16 Jan 2024 18:41:42 +0100
Subject: [PATCH 047/138] examples : add complete parallel function calling
 example (#4974)

---
 .../pydantic-models-to-grammar-examples.py    | 123 +++++++++++++++++-
 1 file changed, 121 insertions(+), 2 deletions(-)

diff --git a/examples/pydantic-models-to-grammar-examples.py b/examples/pydantic-models-to-grammar-examples.py
index a8a4919cf..cbf376652 100644
--- a/examples/pydantic-models-to-grammar-examples.py
+++ b/examples/pydantic-models-to-grammar-examples.py
@@ -1,5 +1,5 @@
 # Function calling example using pydantic models.
-
+import datetime
 import json
 from enum import Enum
 from typing import Union, Optional
@@ -8,7 +8,8 @@ import requests
 from pydantic import BaseModel, Field
 
 import importlib
-from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation
+from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
+
 
 # Function to get completion on the llama.cpp server with grammar.
 def create_completion(prompt, grammar):
@@ -134,3 +135,121 @@ text = create_completion(prompt=prompt, grammar=gbnf_grammar)
 json_data = json.loads(text)
 
 print(Book(**json_data))
+# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
+
+def get_current_datetime(output_format: Optional[str] = None):
+    """
+    Get the current date and time in the given format.
+    Args:
+         output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
+    """
+    if output_format is None:
+        output_format = '%Y-%m-%d %H:%M:%S'
+    return datetime.datetime.now().strftime(output_format)
+
+
+# Enum for the calculator tool.
+class MathOperation(Enum):
+    ADD = "add"
+    SUBTRACT = "subtract"
+    MULTIPLY = "multiply"
+    DIVIDE = "divide"
+
+
+
+# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
+class Calculator(BaseModel):
+    """
+    Perform a math operation on two numbers.
+    """
+    number_one: Union[int, float] = Field(..., description="First number.")
+    operation: MathOperation = Field(..., description="Math operation to perform.")
+    number_two: Union[int, float] = Field(..., description="Second number.")
+
+    def run(self):
+        if self.operation == MathOperation.ADD:
+            return self.number_one + self.number_two
+        elif self.operation == MathOperation.SUBTRACT:
+            return self.number_one - self.number_two
+        elif self.operation == MathOperation.MULTIPLY:
+            return self.number_one * self.number_two
+        elif self.operation == MathOperation.DIVIDE:
+            return self.number_one / self.number_two
+        else:
+            raise ValueError("Unknown operation.")
+
+
+# Example function to get the weather
+def get_current_weather(location, unit):
+    """Get the current weather in a given location"""
+    if "London" in location:
+        return json.dumps({"location": "London", "temperature": "42", "unit": unit.value})
+    elif "New York" in location:
+        return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
+    elif "North Pole" in location:
+        return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
+    else:
+        return json.dumps({"location": location, "temperature": "unknown"})
+
+
+# Here is a function definition in OpenAI style
+current_weather_tool = {
+    "type": "function",
+    "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {
+                    "type": "string",
+                    "description": "The city and state, e.g. San Francisco, CA",
+                },
+                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+            },
+            "required": ["location"],
+        },
+    },
+}
+
+# Convert OpenAI function definition into pydantic model
+current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
+# Add the actual function to a pydantic model
+current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
+
+# Convert normal Python function to a pydantic model
+current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
+
+tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
+
+
+gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+    pydantic_model_list=tool_list, outer_object_name="function",
+    outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
+
+system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
+
+
+text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
+prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
+
+text = create_completion(prompt=prompt, grammar=gbnf_grammar)
+
+json_data = json.loads(text)
+
+print(json_data)
+# Should output something like this:
+# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
+
+
+for call in json_data:
+    if call["function"] == "Calculator":
+        print(Calculator(**call["params"]).run())
+    elif call["function"] == "get_current_datetime":
+        print(current_datetime_model(**call["params"]).run())
+    elif call["function"] == "get_current_weather":
+        print(current_weather_tool_model(**call["params"]).run())
+# Should output something like this:
+# 2024-01-14 13:36:06
+# {"location": "London", "temperature": "42", "unit": "celsius"}
+# 1764

From 334a835a1ccc8106a5fa355683a965efb1bfa24b Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Tue, 16 Jan 2024 19:51:26 +0200
Subject: [PATCH 048/138] ggml : importance matrix support for legacy quants
 (#4969)

* imatrix: adding support for legacy quants

* imatrix: guard Q4_0/Q5_0 against ffn_down craziness

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 ggml-quants.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++
 ggml-quants.h |   4 ++
 ggml.c        |  28 +++++---
 llama.cpp     |  10 +++
 4 files changed, 226 insertions(+), 8 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 0750fe1bb..31b053e33 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -515,6 +515,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     quantize_row_q4_0_reference(x, y, k);
 }
 
+
 void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
     const int qk = QK4_1;
 
@@ -3039,6 +3040,197 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int
     return nrow * row_size;
 }
 
+static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
+    static_assert(QK4_0 == 32, "QK4_0 must be 32");
+
+    if (!quant_weights) {
+        quantize_row_q4_0_reference(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK4_0];
+    int8_t L[QK4_0];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row;
+
+    const int nb = n_per_row/QK4_0;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK4_0 * ib;
+        const float * qw = quant_weights + QK4_0 * ib;
+        for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+        for (int j = 0; j < 16; ++j) {
+            y[ib].qs[j] = L[j] | (L[j+16] << 4);
+        }
+    }
+}
+
+size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    if (!quant_weights) {
+        return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
+    }
+    int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
+static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
+    static_assert(QK4_1 == 32, "QK4_1 must be 32");
+
+    if (!quant_weights) {
+        quantize_row_q4_1_reference(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK4_1];
+    uint8_t L[QK4_1], Laux[QK4_1];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row;
+
+    const int nb = n_per_row/QK4_1;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK4_1 * ib;
+        const float * qw = quant_weights + QK4_1 * ib;
+        for (int j = 0; j < QK4_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float min;
+        float d = make_qkx3_quants(QK4_1, 15, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+        y[ib].m = GGML_FP32_TO_FP16(-min);
+        for (int j = 0; j < 16; ++j) {
+            y[ib].qs[j] = L[j] | (L[j+16] << 4);
+        }
+    }
+}
+
+size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    if (!quant_weights) {
+        return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
+    }
+    int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
+static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
+    static_assert(QK5_0 == 32, "QK5_0 must be 32");
+
+    if (!quant_weights) {
+        quantize_row_q5_0_reference(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK5_0];
+    int8_t L[QK5_0];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row;
+
+    const int nb = n_per_row/QK5_0;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK5_0 * ib;
+        const float * qw = quant_weights + QK5_0 * ib;
+        for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+
+        uint32_t qh = 0;
+
+        for (int j = 0; j < 16; ++j) {
+            const uint8_t xi0 = L[j];
+            const uint8_t xi1 = L[j+16];
+            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+
+            // get the 5-th bit and store it in qh at the right position
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
+        }
+
+        memcpy(&y[ib].qh, &qh, sizeof(qh));
+    }
+}
+
+size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    if (!quant_weights) {
+        return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
+    }
+    int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
+static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
+    static_assert(QK5_1 == 32, "QK5_1 must be 32");
+
+    if (!quant_weights) {
+        quantize_row_q5_1_reference(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK5_1];
+    uint8_t L[QK5_1], Laux[QK5_1];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row;
+
+    const int nb = n_per_row/QK5_1;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK5_1 * ib;
+        const float * qw = quant_weights + QK5_1 * ib;
+        for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float min;
+        float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+        y[ib].m = GGML_FP32_TO_FP16(-min);
+
+        uint32_t qh = 0;
+        for (int j = 0; j < 16; ++j) {
+            const uint8_t xi0 = L[j];
+            const uint8_t xi1 = L[j+16];
+            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+            // get the 5-th bit and store it in qh at the right position
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
+        }
+        memcpy(&y[ib].qh, &qh, sizeof(qh));
+    }
+}
+
+size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    if (!quant_weights) {
+        return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
+    }
+    int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
 // ====================== "True" 2-bit (de)-quantization
 
 static const  uint64_t iq2xxs_grid[256] = {
diff --git a/ggml-quants.h b/ggml-quants.h
index 99467936a..d7fefdb54 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -253,3 +253,7 @@ size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row,
 size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q6_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
diff --git a/ggml.c b/ggml.c
index 5779f32d2..d7e01b81f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -18674,26 +18674,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
         case GGML_TYPE_Q4_0:
             {
                 GGML_ASSERT(start % QK4_0 == 0);
-                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
-                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q4_1:
             {
                 GGML_ASSERT(start % QK4_1 == 0);
-                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
-                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_0:
             {
                 GGML_ASSERT(start % QK5_0 == 0);
-                block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
-                result = ggml_quantize_q5_0(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_1:
             {
                 GGML_ASSERT(start % QK5_1 == 0);
-                block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
-                result = ggml_quantize_q5_1(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q8_0:
             {
diff --git a/llama.cpp b/llama.cpp
index 46c4d11c8..765d20ddb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8374,6 +8374,8 @@ struct quantize_state_internal {
     int n_k_quantized     = 0;
     int n_fallback        = 0;
 
+    bool has_imatrix      = false;
+
     quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
@@ -8546,6 +8548,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+                && qs.has_imatrix && i_layer < n_layer/8) {
+            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+        }
         ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
@@ -8669,6 +8678,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+            qs.has_imatrix = true;
         }
     }
 

From cec8a4847062fbd76253e3b085683f39d91e80d3 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 16 Jan 2024 18:54:24 +0100
Subject: [PATCH 049/138] finetune : add training data file to log message
 (#4979)

This commit adds the name of the training data file to the log message
printed when the training data is tokenized.

The motivation for this change is that it can be useful to show which
file is being tokenized when running the finetune example.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 examples/finetune/finetune.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index a6620fd73..11fcbf443 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1799,7 +1799,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> train_tokens;
     std::vector<size_t> train_samples_begin;
     std::vector<size_t> train_samples_size;
-    printf("%s: tokenize training data\n", __func__);
+    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
     tokenize_file(lctx,
             params.common.fn_train_data,
             params.common.sample_start,

From bee938da74c33f42242c3a1058ac0a0a6eeef531 Mon Sep 17 00:00:00 2001
From: Philip Taron <philip.taron@gmail.com>
Date: Tue, 16 Jan 2024 09:56:21 -0800
Subject: [PATCH 050/138] nix: remove nixConfig from flake.nix (#4984)

---
 flake.nix | 57 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/flake.nix b/flake.nix
index 488ed6c59..ec62c773a 100644
--- a/flake.nix
+++ b/flake.nix
@@ -6,28 +6,41 @@
     flake-parts.url = "github:hercules-ci/flake-parts";
   };
 
-  # Optional binary cache
-  nixConfig = {
-    extra-substituters = [
-      # Populated by the CI in ggerganov/llama.cpp
-      "https://llama-cpp.cachix.org"
-
-      # A development cache for nixpkgs imported with `config.cudaSupport = true`.
-      # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
-      # This lets one skip building e.g. the CUDA-enabled openmpi.
-      # TODO: Replace once nix-community obtains an official one.
-      "https://cuda-maintainers.cachix.org"
-    ];
-
-    # Verify these are the same keys as published on
-    # - https://app.cachix.org/cache/llama-cpp
-    # - https://app.cachix.org/cache/cuda-maintainers
-    extra-trusted-public-keys = [
-      "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
-      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
-    ];
-  };
-
+  # There's an optional binary cache available. The details are below, but they're commented out.
+  #
+  # Why? The terrible experience of being prompted to accept them on every single Nix command run.
+  # Plus, there are warnings shown about not being a trusted user on a default Nix install
+  # if you *do* say yes to the prompts.
+  #
+  # This experience makes having `nixConfig` in a flake a persistent UX problem.
+  #
+  # To make use of the binary cache, please add the relevant settings to your `nix.conf`.
+  # It's located at `/etc/nix/nix.conf` on non-NixOS systems. On NixOS, adjust the `nix.settings`
+  # option in your NixOS configuration to add `extra-substituters` and `extra-trusted-public-keys`,
+  # as shown below.
+  #
+  # ```
+  # nixConfig = {
+  #   extra-substituters = [
+  #     # Populated by the CI in ggerganov/llama.cpp
+  #     "https://llama-cpp.cachix.org"
+  #
+  #     # A development cache for nixpkgs imported with `config.cudaSupport = true`.
+  #     # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
+  #     # This lets one skip building e.g. the CUDA-enabled openmpi.
+  #     # TODO: Replace once nix-community obtains an official one.
+  #     "https://cuda-maintainers.cachix.org"
+  #   ];
+  #
+  #   # Verify these are the same keys as published on
+  #   # - https://app.cachix.org/cache/llama-cpp
+  #   # - https://app.cachix.org/cache/cuda-maintainers
+  #   extra-trusted-public-keys = [
+  #     "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
+  #     "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
+  #   ];
+  # };
+  # ```
 
   # For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
   #

From 5c999609013a30c06e6fd28be8db5c2074bcc196 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 16 Jan 2024 20:59:31 +0200
Subject: [PATCH 051/138] py : remove unnecessary hasattr (#4903)

---
 convert-hf-to-gguf.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index b133f3b49..1178d63a2 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -266,11 +266,10 @@ class Model:
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
-                if hasattr(tokenizer, "added_tokens_decoder"):
-                    if tokenizer.added_tokens_decoder[i].special:
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
             else:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)

From f46c0c1b0ea0bc67e24e4bf026a7e898c1af22a9 Mon Sep 17 00:00:00 2001
From: David Renshaw <dwrenshaw@gmail.com>
Date: Wed, 17 Jan 2024 02:17:50 -0500
Subject: [PATCH 052/138] llama : fix copy/paste error in llama_sampling_params
 comment (#4994)

---
 common/sampling.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/sampling.h b/common/sampling.h
index f16ef97e3..2ee180376 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -17,7 +17,7 @@ typedef struct llama_sampling_params {
     float       min_p                 = 0.05f;    // 0.0 = disabled
     float       tfs_z                 = 1.00f;    // 1.0 = disabled
     float       typical_p             = 1.00f;    // 1.0 = disabled
-    float       temp                  = 0.80f;    // 1.0 = disabled
+    float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
     int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
     float       penalty_repeat        = 1.10f;    // 1.0 = disabled
     float       penalty_freq          = 0.00f;    // 0.0 = disabled

From 75632936659772d5b2ce54b0b65319fecbaac2e6 Mon Sep 17 00:00:00 2001
From: Paul Tsochantaris <ptsochantaris@icloud.com>
Date: Wed, 17 Jan 2024 08:07:24 +0000
Subject: [PATCH 053/138] metal : remove unnecessary nil check (#4986)

---
 ggml-metal.m | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index a549e6713..8bb4edd64 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -2236,10 +2236,7 @@ static bool ggml_metal_graph_compute(
 #endif
         }
 
-        if (encoder != nil) {
-            [encoder endEncoding];
-            encoder = nil;
-        }
+        [encoder endEncoding];
 
         [command_buffer commit];
     });

From 2b3a665d3917edf393761a24c4835447894df74a Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Wed, 17 Jan 2024 12:36:37 +0200
Subject: [PATCH 054/138] llama : use Q4_K for attn_v for Q2_K_S when n_gqa >=
 4 (#4996)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 llama.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 765d20ddb..2c5983c67 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8477,7 +8477,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }

From 4f4bf35f46600441dec2f941e667291eeb9a18d8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 15:45:03 +0200
Subject: [PATCH 055/138] py : fix missing added_tokens_dict for SPM and BPE
 vocabs (#4971)

* py : fix missing added_tokens_dict for SPM vocab

* py : pad with unknown tokens when data is missing

ggml-ci

* py : fix BPE vocab conversion

ggml-ci

* py : fix padded dummy tokens (I hope)
---
 convert.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/convert.py b/convert.py
index 3b613eefc..b47bb6185 100755
--- a/convert.py
+++ b/convert.py
@@ -387,6 +387,7 @@ class BpeVocab:  # GPT
         self.bpe_tokenizer = json.loads(
             open(str(fname_tokenizer), encoding="utf-8").read()
         )
+        self.vocab = self.bpe_tokenizer["model"]["vocab"]
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
@@ -405,7 +406,7 @@ class BpeVocab:  # GPT
                     if item["content"] not in self.bpe_tokenizer
                 )
 
-        vocab_size: int = len(self.bpe_tokenizer)
+        vocab_size: int = len(self.vocab)
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
@@ -415,6 +416,7 @@ class BpeVocab:  # GPT
             )
 
         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_dict = added_tokens
         self.added_tokens_list = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size
         self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
@@ -422,10 +424,9 @@ class BpeVocab:  # GPT
         self.fname_added_tokens = fname_added_tokens
 
     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.bpe_tokenizer
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
 
-        for i, _ in enumerate(tokenizer):
+        for i, _ in enumerate(self.vocab):
             yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
 
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -466,6 +467,7 @@ class SentencePieceVocab:  # LlaMa
             )
 
         # Token pieces that were added to the base vocabulary.
+        self.added_tokens_dict = added_tokens
         self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
         self.vocab_size_base = vocab_size
         self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
@@ -1006,6 +1008,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
         )
         for i in range(1, pad_count + 1):
             vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
+            vocab.added_tokens_list.append(f"<dummy{i:05}>")
         vocab.vocab_size = params.n_vocab
         return
 
@@ -1097,6 +1100,8 @@ class OutputFile:
             scores.append(score)
             toktypes.append(toktype)
 
+        assert(len(tokens) == vocab.vocab_size)
+
         return tokens, scores, toktypes
 
     def add_meta_vocab(self, vocab: Vocab) -> None:
@@ -1373,15 +1378,14 @@ class VocabFactory:
                 self.files[file] = file_path
             elif parent_file_path.exists():
                 self.files[file] = parent_file_path
+        print(f"Found vocab files: {self.files}")
 
     def _select_file(self, vocabtype: Optional[str]) -> Path:
         if vocabtype in ["spm", "bpe"]:
-            # For SentencePiece and BPE, return specific files as before
-            file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json"
-            if self.files[file_key]:
-                return self.files[file_key]
-            else:
-                raise FileNotFoundError(f"{vocabtype} {file_key} not found.")
+            for file_key in self.files.keys():
+                if self.files[file_key]:
+                    return self.files[file_key]
+            raise FileNotFoundError(f"{vocabtype} vocab not found.")
         elif vocabtype == "hfft":
             # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
             return self.path

From 0f83e727af0a7cadf90b7ecc1f8e35de1d0880bc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 18:37:36 +0200
Subject: [PATCH 056/138] py : fix whitespace

---
 convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert.py b/convert.py
index b47bb6185..e38ee5315 100755
--- a/convert.py
+++ b/convert.py
@@ -1100,7 +1100,7 @@ class OutputFile:
             scores.append(score)
             toktypes.append(toktype)
 
-        assert(len(tokens) == vocab.vocab_size)
+        assert len(tokens) == vocab.vocab_size
 
         return tokens, scores, toktypes
 

From c918fe8dca8fa1c4602427e0a4b88e20046f6c34 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 18:38:39 +0200
Subject: [PATCH 057/138] metal : create autorelease pool during library build
 (#4970)

* metal : create autorelease pool during library build

ggml-ci

* test : simplify

ggml-ci
---
 .gitignore                 |  1 +
 Makefile                   |  5 ++++-
 ci/run.sh                  |  2 ++
 ggml-metal.m               | 19 +++++++++----------
 tests/CMakeLists.txt       |  1 +
 tests/test-autorelease.cpp | 28 ++++++++++++++++++++++++++++
 6 files changed, 45 insertions(+), 11 deletions(-)
 create mode 100644 tests/test-autorelease.cpp

diff --git a/.gitignore b/.gitignore
index fba207045..5ab81445d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -105,3 +105,4 @@ poetry.toml
 /tests/test-tokenizer-1-bpe
 /tests/test-rope
 /tests/test-backend-ops
+/tests/test-autorelease
diff --git a/Makefile b/Makefile
index 995b89f7a..a8658a596 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
-	tests/test-backend-ops
+	tests/test-backend-ops tests/test-autorelease
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -747,3 +747,6 @@ tests/test-c.o: tests/test-c.c llama.h
 
 tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
diff --git a/ci/run.sh b/ci/run.sh
index 47a254f4c..86293f0db 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -179,6 +179,8 @@ function gg_run_open_llama_3b_v2 {
 
     wiki_test_60="${path_wiki}/wiki.test-60.raw"
 
+    ./bin/test-autorelease ${model_f16}
+
     ./bin/quantize ${model_f16} ${model_q8_0} q8_0
     ./bin/quantize ${model_f16} ${model_q4_0} q4_0
     ./bin/quantize ${model_f16} ${model_q4_1} q4_1
diff --git a/ggml-metal.m b/ggml-metal.m
index 8bb4edd64..66d4d675e 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -303,22 +303,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                 return NULL;
             }
 
-            // dictionary of preprocessor macros
-            NSMutableDictionary * prep = [NSMutableDictionary dictionary];
+            @autoreleasepool {
+                // dictionary of preprocessor macros
+                NSMutableDictionary * prep = [NSMutableDictionary dictionary];
 
 #ifdef GGML_QKK_64
-            prep[@"QK_K"] = @(64);
+                prep[@"QK_K"] = @(64);
 #endif
 
-            MTLCompileOptions* options = [MTLCompileOptions new];
-            options.preprocessorMacros = prep;
+                MTLCompileOptions* options = [MTLCompileOptions new];
+                options.preprocessorMacros = prep;
 
-            //[options setFastMathEnabled:false];
+                //[options setFastMathEnabled:false];
 
-            ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
-
-            [options release];
-            [prep release];
+                ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+            }
         }
 
         if (error) {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7c932240d..d7aaab843 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -49,6 +49,7 @@ llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp)
 # llama_build_and_test_executable(test-opt.cpp) # SLOW
 llama_build_and_test_executable(test-backend-ops.cpp)
+llama_build_and_test_executable(test-autorelease.cpp)
 
 llama_build_and_test_executable(test-rope.cpp)
 
diff --git a/tests/test-autorelease.cpp b/tests/test-autorelease.cpp
new file mode 100644
index 000000000..289c6ba6c
--- /dev/null
+++ b/tests/test-autorelease.cpp
@@ -0,0 +1,28 @@
+// ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763
+
+#include <cstdio>
+#include <string>
+#include <thread>
+
+#include "llama.h"
+
+// This creates a new context inside a pthread and then tries to exit cleanly.
+int main(int argc, char ** argv) {
+    if (argc < 2) {
+        printf("Usage: %s model.gguf\n", argv[0]);
+        return 0; // intentionally return success
+    }
+
+    const std::string fname = argv[1];
+
+    std::thread([&fname]() {
+        llama_backend_init(false);
+        auto * model = llama_load_model_from_file(fname.c_str(), llama_model_default_params());
+        auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
+        llama_free(ctx);
+        llama_free_model(model);
+        llama_backend_free();
+    }).join();
+
+    return 0;
+}

From 44a1a4a41a4c0b03afaa7d9e06bcbc7cf95aa1e6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 18:39:41 +0200
Subject: [PATCH 058/138] backend : add eval callback (#4935)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* simple : no need for ggml_is_contiguous + fix bool parse

* llama : fix callback placement in llama_context_params

* backend : avoid double-ask callback calls

* simple : restore examples, imatrix will serve as a demo
---
 ggml-backend.c | 42 ++++++++++++++++++++++++++++++++++++++++--
 ggml-backend.h | 11 +++++++++++
 llama.cpp      |  9 +++++++++
 llama.h        |  4 ++++
 4 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index f5424fb90..4266250f9 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -802,6 +802,9 @@ struct ggml_backend_sched {
     __attribute__((aligned(GGML_MEM_ALIGN)))
     #endif
     char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+
+    ggml_backend_sched_eval_callback callback_eval;
+    void * callback_eval_user_data;
 };
 
 #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
@@ -1324,9 +1327,38 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         ggml_graph_dump_dot(split->graph, NULL, split_filename);
 #endif
 
+
         uint64_t compute_start_us = ggml_time_us();
-        ggml_backend_graph_compute(split_backend, &split->graph);
-        //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+        if (!sched->callback_eval) {
+            ggml_backend_graph_compute(split_backend, &split->graph);
+          //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+        } else {
+            // similar to ggml_backend_compare_graph_backend
+            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
+                struct ggml_tensor * t = split->graph.nodes[j0];
+
+                // check if the user needs data from this node
+                bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+
+                int j1 = j0;
+
+                // determine the range [j0, j1] of nodes that can be computed together
+                while (!need && j1 < split->graph.n_nodes - 1) {
+                    t = split->graph.nodes[++j1];
+                    need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+                }
+
+                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
+
+                ggml_backend_graph_compute(split_backend, &gv);
+
+                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+                    break;
+                }
+
+                j0 = j1;
+            }
+        }
         uint64_t compute_end_us = ggml_time_us();
         compute_us[split_backend_id] += compute_end_us - compute_start_us;
     }
@@ -1431,6 +1463,12 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched_reset(sched);
 }
 
+
+void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
+    sched->callback_eval = callback;
+    sched->callback_eval_user_data = user_data;
+}
+
 int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }
diff --git a/ggml-backend.h b/ggml-backend.h
index 12b4b4ab7..ab4ad773f 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -148,6 +148,14 @@ extern "C" {
     struct ggml_backend_sched;
     typedef struct ggml_backend_sched * ggml_backend_sched_t;
 
+    // when ask == true, the scheduler wants to know if the user wants to observe this node
+    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
+    //
+    // when ask == false, the scheduler is passing the node tensor to the user for observation
+    // if the user returns false, the scheduler will cancel the graph compute
+    //
+    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+
     // Initialize a backend scheduler
     GGML_API ggml_backend_sched_t  ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
     GGML_API void                  ggml_backend_sched_free(ggml_backend_sched_t sched);
@@ -168,6 +176,9 @@ extern "C" {
     // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
     GGML_API void                  ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
+    // Set a callback to be called for each resulting node during graph compute
+    GGML_API void                  ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
     //
     // Utils
     //
diff --git a/llama.cpp b/llama.cpp
index 2c5983c67..81829b13e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1393,6 +1393,9 @@ struct llama_cparams {
 
     bool mul_mat_q;
     bool offload_kqv;
+
+    ggml_backend_sched_eval_callback cb_eval;
+    void * cb_eval_user_data;
 };
 
 struct llama_layer {
@@ -6254,6 +6257,7 @@ static int llama_decode_internal(
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
     ggml_backend_sched_reset(lctx.sched);
+    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
     ggml_cgraph * gf = llama_build_graph(lctx, batch);
 
@@ -9276,6 +9280,8 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_beta_fast              =*/ 32.0f,
         /*.yarn_beta_slow              =*/ 1.0f,
         /*.yarn_orig_ctx               =*/ 0,
+        /*.cb_eval                     =*/ nullptr,
+        /*.cb_eval_user_data           =*/ nullptr,
         /*.type_k                      =*/ GGML_TYPE_F16,
         /*.type_v                      =*/ GGML_TYPE_F16,
         /*.mul_mat_q                   =*/ true,
@@ -9416,6 +9422,9 @@ struct llama_context * llama_new_context_with_model(
                                hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
                                                               hparams.n_ctx_train;
 
+    cparams.cb_eval           = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
+
     auto rope_scaling_type = params.rope_scaling_type;
     if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
         rope_scaling_type = hparams.rope_scaling_type_train;
diff --git a/llama.h b/llama.h
index a570b0d69..e268d7a1d 100644
--- a/llama.h
+++ b/llama.h
@@ -2,6 +2,7 @@
 #define LLAMA_H
 
 #include "ggml.h"
+#include "ggml-backend.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
@@ -231,6 +232,9 @@ extern "C" {
         float    yarn_beta_slow;   // YaRN high correction dim
         uint32_t yarn_orig_ctx;    // YaRN original context size
 
+        ggml_backend_sched_eval_callback cb_eval;
+        void * cb_eval_user_data;
+
         enum ggml_type type_k; // data type for K cache
         enum ggml_type type_v; // data type for V cache
 

From ba69bbc84ced580fe4fdb0713ca2d95634325b7a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 18:46:30 +0200
Subject: [PATCH 059/138] imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
---
 ci/run.sh                    |  11 ++-
 examples/imatrix/imatrix.cpp | 152 +++++++++++++++++++++++++++--------
 ggml.c                       |  14 ----
 ggml.h                       |   6 --
 4 files changed, 129 insertions(+), 54 deletions(-)

diff --git a/ci/run.sh b/ci/run.sh
index 86293f0db..f3a8ff774 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -216,6 +216,8 @@ function gg_run_open_llama_3b_v2 {
     (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
     (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
     (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
     function check_ppl {
@@ -243,6 +245,8 @@ function gg_run_open_llama_3b_v2 {
     check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
     check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
 
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
     # lora
     function compare_ppl {
         qnt="$1"
@@ -284,7 +288,6 @@ function gg_run_open_llama_3b_v2 {
     (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
     compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
 
-
     set +e
 }
 
@@ -294,6 +297,7 @@ function gg_sum_open_llama_3b_v2 {
     gg_printf 'OpenLLaMA 3B-v2:\n'
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
     gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
     gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
@@ -393,6 +397,8 @@ function gg_run_open_llama_7b_v2 {
     (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
     (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
     (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
     function check_ppl {
@@ -420,6 +426,8 @@ function gg_run_open_llama_7b_v2 {
     check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
     check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
 
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
     # lora
     function compare_ppl {
         qnt="$1"
@@ -471,6 +479,7 @@ function gg_sum_open_llama_7b_v2 {
     gg_printf 'OpenLLaMA 7B-v2:\n'
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
     gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
     gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 1461bc963..af78711c5 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -33,43 +33,120 @@ class IMatrixCollector {
 public:
     IMatrixCollector() = default;
     void set_parameters(StatParams&& params) { m_params = std::move(params); }
-    void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
+    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix() const;
 private:
     std::unordered_map<std::string, Stats> m_stats;
     StatParams                             m_params;
     std::mutex                             m_mutex;
     int                                    m_last_call = 0;
+    std::vector<float>                     m_src1_data;
+    std::vector<int>                       m_ids; // the expert ids from ggml_mul_mat_id
 };
 
-void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
-    if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return;
-    if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return;
+bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
+    GGML_UNUSED(user_data);
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
+    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
+    if (ask) {
+        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
+        if (t->op != GGML_OP_MUL_MAT) return false;
+        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
+        if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
+        return true;
+    }
+
     std::lock_guard<std::mutex> lock(m_mutex);
-    auto& e = m_stats[src0->name];
-    if (e.values.empty()) {
-        e.values.resize(src1->ne[0], 0);
+
+    // copy the data from the GPU memory if needed
+    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
+
+    if (!is_host) {
+        m_src1_data.resize(ggml_nelements(src1));
+        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
     }
-    else if (e.values.size() != (size_t)src1->ne[0]) {
-        fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
-        exit(1); //GGML_ASSERT(false);
-    }
-    ++e.ncall;
-    if (m_params.verbosity > 1) {
-        printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
-    }
-    for (int row = 0; row < (int)src1->ne[1]; ++row) {
-        const float * x = (const float *)src1->data + row * src1->ne[0];
-        for (int j = 0; j < (int)src1->ne[0]; ++j) {
-            e.values[j] += x[j]*x[j];
-        }
-    }
-    if (e.ncall > m_last_call) {
-        m_last_call = e.ncall;
-        if (m_last_call % m_params.n_output_frequency == 0) {
-            save_imatrix();
+
+    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
+
+    if (t->op == GGML_OP_MUL_MAT_ID) {
+        const int idx  = ((int32_t *) t->op_params)[0];
+        const int n_as = ((int32_t *) t->op_params)[1];
+
+        // the top-k selected expert ids are stored in the src0 tensor
+        // for simplicity, always copy src0 to host, because it is small
+        // take into account that src0 is not contiguous!
+        GGML_ASSERT(src0->ne[1] == src1->ne[1]);
+        GGML_ASSERT(n_as*ggml_nrows(src0));
+        m_ids.resize(ggml_nbytes(src0)/sizeof(int));
+        ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
+
+        // loop over all possible experts, regardless if they are used or not in the batch
+        // this is necessary to guarantee equal number of "ncall" for each tensor
+        for (int ex = 0; ex < n_as; ++ex) {
+            src0 = t->src[2 + ex];
+            auto& e = m_stats[src0->name];
+            if (e.values.empty()) {
+                e.values.resize(src1->ne[0], 0);
+            }
+            else if (e.values.size() != (size_t)src1->ne[0]) {
+                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+                exit(1); //GGML_ASSERT(false);
+            }
+            // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
+            //       using the following line, we can correct for that if needed
+            //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
+            ++e.ncall;
+            if (m_params.verbosity > 1) {
+                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+            }
+            for (int row = 0; row < (int)src1->ne[1]; ++row) {
+                const int excur = m_ids[row*n_as + idx];
+                GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
+                if (excur != ex) continue;
+                const float * x = data + row * src1->ne[0];
+                for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                    e.values[j] += x[j]*x[j];
+                }
+            }
+            if (e.ncall > m_last_call) {
+                m_last_call = e.ncall;
+                if (m_last_call % m_params.n_output_frequency == 0) {
+                    save_imatrix();
+                }
+            }
+        }
+    } else {
+        auto& e = m_stats[src0->name];
+        if (e.values.empty()) {
+            e.values.resize(src1->ne[0], 0);
+        }
+        else if (e.values.size() != (size_t)src1->ne[0]) {
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+            exit(1); //GGML_ASSERT(false);
+        }
+        ++e.ncall;
+        if (m_params.verbosity > 1) {
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+        }
+        for (int row = 0; row < (int)src1->ne[1]; ++row) {
+            const float * x = data + row * src1->ne[0];
+            for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                e.values[j] += x[j]*x[j];
+            }
+        }
+        if (e.ncall > m_last_call) {
+            m_last_call = e.ncall;
+            if (m_last_call % m_params.n_output_frequency == 0) {
+                save_imatrix();
+            }
         }
     }
+
+    return true;
 }
 
 void IMatrixCollector::save_imatrix() const {
@@ -93,8 +170,8 @@ void IMatrixCollector::save_imatrix() const {
 
 static IMatrixCollector g_collector;
 
-static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
-    g_collector.collect_imatrix(src0, src1);
+static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
+    return g_collector.collect_imatrix(t, ask, user_data);
 }
 
 
@@ -320,8 +397,6 @@ int main(int argc, char ** argv) {
 
     g_collector.set_parameters(std::move(sparams));
 
-    ggml_set_imatrix_collection(ik_collect_imatrix);
-
     params.logits_all = true;
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
@@ -340,16 +415,27 @@ int main(int argc, char ** argv) {
 
     llama_backend_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
+    llama_model_params mparams = llama_model_params_from_gpt_params(params);
 
-    // load the model and apply lora adapter, if any
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
     if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
 
+    llama_context_params cparams = llama_context_params_from_gpt_params(params);
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    cparams.cb_eval = ik_collect_imatrix;
+    cparams.cb_eval_user_data = NULL;
+
+    llama_context * ctx = llama_new_context_with_model(model, cparams);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: error: unable to create context\n", __func__);
+        return 1;
+    }
+
     const int n_ctx_train = llama_n_ctx_train(model);
     if (params.n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
diff --git a/ggml.c b/ggml.c
index d7e01b81f..35fd29a9e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -394,12 +394,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
 static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
 
-ggml_collect_imatrix_t g_imatrix_collect = NULL;
-
-void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
-    g_imatrix_collect = imatrix_collect;
-}
-
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name                = "i8",
@@ -9790,10 +9784,6 @@ static void ggml_compute_forward_mul_mat(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    if (ith == 1 && g_imatrix_collect) {
-        g_imatrix_collect(src0, src1);
-    }
-
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
@@ -10097,10 +10087,6 @@ static void ggml_compute_forward_mul_mat_id(
 
         const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
 
-        if (ith == 1 && g_imatrix_collect) {
-            g_imatrix_collect(src0_cur, src1);
-        }
-
         const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
diff --git a/ggml.h b/ggml.h
index 837c52e68..27daf6fd1 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2085,12 +2085,6 @@ extern "C" {
     GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
     GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
 
-    //
-    // Importance matrix
-    //
-    typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
-    GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
-
     //
     // gguf
     //

From 38566680cdfe982a495562332c25b9227de9cf8d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 18:54:56 +0200
Subject: [PATCH 060/138] ggml : add IQ2 to test-backend-ops + refactoring
 (#4990)

* ggml : add IQ2 to test-backend-ops + refactoring

ggml-ci

* cuda : update supports_op for IQ2

ggml-ci

* ci : enable LLAMA_CUBLAS=1 for CUDA nodes

ggml-ci

* cuda : fix out-of-bounds-access in `mul_mat_vec_q`

ggml-ci

* tests : avoid creating RNGs for each Q tensor

ggml-ci

* tests : avoid creating RNGs for each tensor

ggml-ci
---
 ci/run.sh                  | 12 ++++---
 ggml-backend.c             |  2 ++
 ggml-cuda.cu               | 12 +++++--
 ggml-quants.c              | 74 ++++++++++++++++----------------------
 ggml-quants.h              |  3 ++
 ggml.c                     | 34 ++++++++++++++++--
 ggml.h                     | 20 ++++++++---
 llama.cpp                  | 12 +------
 tests/test-backend-ops.cpp | 46 ++++++++++++++----------
 9 files changed, 128 insertions(+), 87 deletions(-)

diff --git a/ci/run.sh b/ci/run.sh
index f3a8ff774..791b17a19 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -36,6 +36,10 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
 fi
 
+if [ ! -z ${GG_BUILD_CUDA} ]; then
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
+fi
+
 ## helpers
 
 # download a file if it does not exist or if it is outdated
@@ -160,8 +164,8 @@ function gg_run_open_llama_3b_v2 {
 
     set -e
 
-    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                              ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     python3 ../convert.py ${path_models}
 
@@ -343,8 +347,8 @@ function gg_run_open_llama_7b_v2 {
 
     set -e
 
-    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                              ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     python3 ../convert.py ${path_models}
 
diff --git a/ggml-backend.c b/ggml-backend.c
index 4266250f9..ef518dae0 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -692,6 +692,8 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
 
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
+        case GGML_OP_CPY:
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 568c411af..b2211d858 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -5131,10 +5131,10 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
     const block_q_t  * x = (const block_q_t  *) vx;
     const block_q8_1 * y = (const block_q8_1 *) vy;
 
-    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
+    for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
 
-        const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
 
         const int iqs  = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
 
@@ -10918,6 +10918,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 if (a->ne[3] != b->ne[3]) {
                     return false;
                 }
+                ggml_type a_type = a->type;
+                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS) {
+                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+                        return false;
+                    }
+                }
                 return true;
             } break;
         case GGML_OP_GET_ROWS:
diff --git a/ggml-quants.c b/ggml-quants.c
index 31b053e33..7d2f033e9 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1274,7 +1274,12 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
     }
     float sumlx = 0;
     float suml2 = 0;
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 0; i < n; ++i) {
+#else
     for (int i = 0; i < n; ++i) {
+#endif
         int l = nearest_int(iscale * x[i]);
         l = MAX(-nmax, MIN(nmax-1, l));
         L[i] = l + nmax;
@@ -1649,7 +1654,12 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
     float max = x[0];
     float sum_w = weights ? weights[0] : x[0]*x[0];
     float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
     for (int i = 1; i < n; ++i) {
+#endif
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
         float w = weights ? weights[i] : x[i]*x[i];
@@ -1660,7 +1670,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
         min = 0;
     }
     if (max <= min) {
-        for (int i = 0; i < n; ++i) L[i] = 0;
+        memset(L, 0, n);
         *the_min = -min;
         return 0.f;
     }
@@ -1862,7 +1872,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
 
 size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
     (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
     }
@@ -2181,7 +2191,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
 
 size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
     (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
     }
@@ -2448,7 +2458,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
 
 size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
     (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
     }
@@ -2771,7 +2781,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
 
 size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
     (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
     }
@@ -3025,7 +3035,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
 
 size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
     (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
     }
@@ -3072,7 +3082,7 @@ size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int
     if (!quant_weights) {
         return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
     }
-    int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
     char * qrow = (char *)dst;
     for (int row = 0; row < nrow; ++row) {
         quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
@@ -3116,7 +3126,7 @@ size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int
     if (!quant_weights) {
         return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
     }
-    int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
     char * qrow = (char *)dst;
     for (int row = 0; row < nrow; ++row) {
         quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
@@ -3169,7 +3179,7 @@ size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int
     if (!quant_weights) {
         return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
     }
-    int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
     char * qrow = (char *)dst;
     for (int row = 0; row < nrow; ++row) {
         quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
@@ -3221,7 +3231,7 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
     if (!quant_weights) {
         return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
     }
-    int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
     char * qrow = (char *)dst;
     for (int row = 0; row < nrow; ++row) {
         quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
@@ -8565,7 +8575,7 @@ static int iq2_compare_func(const void * left, const void * right) {
     return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
 }
 
-static void q2xs_init_impl(int grid_size) {
+void iq2xs_init_impl(int grid_size) {
     const int gindex = iq2_data_index(grid_size);
     if (iq2_data[gindex].grid) {
         return;
@@ -8720,19 +8730,7 @@ static void q2xs_init_impl(int grid_size) {
     free(dist2);
 }
 
-void ggml_init_iq2_quantization(enum ggml_type type) {
-    if (type == GGML_TYPE_IQ2_XXS) {
-        q2xs_init_impl(256);
-    }
-    else if (type == GGML_TYPE_IQ2_XS) {
-        q2xs_init_impl(512);
-    }
-    else {
-        fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
-    }
-}
-
-static void q2xs_deinit_impl(int grid_size) {
+void iq2xs_free_impl(int grid_size) {
     GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
     const int gindex = iq2_data_index(grid_size);
     if (iq2_data[gindex].grid) {
@@ -8742,18 +8740,6 @@ static void q2xs_deinit_impl(int grid_size) {
     }
 }
 
-void ggml_deinit_iq2_quantization(enum ggml_type type) {
-    if (type == GGML_TYPE_IQ2_XXS) {
-        q2xs_deinit_impl(256);
-    }
-    else if (type == GGML_TYPE_IQ2_XS) {
-        q2xs_deinit_impl(512);
-    }
-    else {
-        fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
-    }
-}
-
 static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
         const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
     int num_neighbors = neighbours[0];
@@ -8786,10 +8772,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
     const int      * kmap_q2xs       = iq2_data[gindex].map;
     const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
 
-    GGML_ASSERT(quant_weights);
-    GGML_ASSERT(kgrid_q2xs);
-    GGML_ASSERT(kmap_q2xs);
-    GGML_ASSERT(kneighbors_q2xs);
+    GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(n%QK_K == 0);
 
     const int kMaxQ = 3;
@@ -9005,10 +8991,10 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
     const int      * kmap_q2xs       = iq2_data[gindex].map;
     const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
 
-    GGML_ASSERT(quant_weights);
-    GGML_ASSERT(kmap_q2xs);
-    GGML_ASSERT(kgrid_q2xs);
-    GGML_ASSERT(kneighbors_q2xs);
+    GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(n%QK_K == 0);
 
     const int kMaxQ = 3;
diff --git a/ggml-quants.h b/ggml-quants.h
index d7fefdb54..7d7cf9178 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -257,3 +257,6 @@ size_t quantize_q4_0   (const float * src, void * dst, int nrows, int n_per_row,
 size_t quantize_q4_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+
+void iq2xs_init_impl(int grid_size);
+void iq2xs_free_impl(int grid_size);
diff --git a/ggml.c b/ggml.c
index 35fd29a9e..cbf2d4bdd 100644
--- a/ggml.c
+++ b/ggml.c
@@ -18524,6 +18524,28 @@ enum ggml_opt_result ggml_opt_resume_g(
 
 ////////////////////////////////////////////////////////////////////////////////
 
+void ggml_quantize_init(enum ggml_type type) {
+    ggml_critical_section_start();
+
+    switch (type) {
+        case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
+        case GGML_TYPE_IQ2_XS:  iq2xs_init_impl(512); break;
+        default: // nothing
+            break;
+    }
+
+    ggml_critical_section_end();
+}
+
+void ggml_quantize_free(void) {
+    ggml_critical_section_start();
+
+    iq2xs_free_impl(256);
+    iq2xs_free_impl(512);
+
+    ggml_critical_section_end();
+}
+
 size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK4_0 == 0);
     const int nb = k / QK4_0;
@@ -18651,9 +18673,15 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK8_0*sizeof(block_q8_0));
 }
 
+bool ggml_quantize_requires_imatrix(enum ggml_type type) {
+    return
+        type == GGML_TYPE_IQ2_XXS ||
+        type == GGML_TYPE_IQ2_XS;
+}
+
 size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
         int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
-    (void)imatrix;
+    ggml_quantize_init(type); // this is noop if already initialized
     size_t result = 0;
     int n = nrows * n_per_row;
     switch (type) {
@@ -18766,13 +18794,13 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
             } break;
         case GGML_TYPE_F16:
             {
-                int elemsize = sizeof(ggml_fp16_t);
+                size_t elemsize = sizeof(ggml_fp16_t);
                 ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
         case GGML_TYPE_F32:
             {
-                int elemsize = sizeof(float);
+                size_t elemsize = sizeof(float);
                 result = n * elemsize;
                 memcpy((uint8_t *)dst + start * elemsize, src + start, result);
             } break;
diff --git a/ggml.h b/ggml.h
index 27daf6fd1..de8162b81 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2065,6 +2065,18 @@ extern "C" {
     // quantization
     //
 
+    // - ggml_quantize_init can be called multiple times with the same type
+    //   it will only initialize the quantization tables for the first call or after ggml_quantize_free
+    //   automatically called by ggml_quantize_chunk for convenience
+    //
+    // - ggml_quantize_free will free any memory allocated by ggml_quantize_init
+    //   call this at the end of the program to avoid memory leaks
+    //
+    // note: these are thread-safe
+    //
+    GGML_API void ggml_quantize_init(enum ggml_type type);
+    GGML_API void ggml_quantize_free(void);
+
     // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
@@ -2078,13 +2090,13 @@ extern "C" {
     GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
 
+    // some quantization type cannot be used without an importance matrix
+    GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
+
+    // calls ggml_quantize_init internally (i.e. can allocate memory)
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
             int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 
-    // These are needed for IQ2_XS and IQ2_XXS quantizations
-    GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
-    GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
-
     //
     // gguf
     //
diff --git a/llama.cpp b/llama.cpp
index 81829b13e..d28382f7d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8747,8 +8747,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    std::set<ggml_type> used_iq2;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * tensor = ml.get_tensor_meta(i);
 
@@ -8801,11 +8799,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             const size_t nelements = ggml_nelements(tensor);
 
-            if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
-                ggml_init_iq2_quantization(new_type);
-                used_iq2.insert(new_type);
-            }
-
             const float * imatrix = nullptr;
             if (imatrix_data) {
                 auto it = imatrix_data->find(tensor->name);
@@ -8931,10 +8924,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     fout.close();
 
-    for (auto type : used_iq2) {
-        ggml_deinit_iq2_quantization(type);
-    }
-
     gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9342,6 +9331,7 @@ void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_free();
 #endif
+    ggml_quantize_free();
 }
 
 int64_t llama_time_us(void) {
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 22a7856d4..55ce14e0d 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -16,39 +16,37 @@
 #include <vector>
 
 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
+    // static RNG initialization (revisit if n_threads stops being constant)
+    static const size_t n_threads = std::thread::hardware_concurrency();
+    static std::vector<std::default_random_engine> generators = []() {
+        std::random_device rd;
+        std::vector<std::default_random_engine> vec;
+        vec.reserve(n_threads);
+        //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
+        for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
+        return vec;
+    }();
+
     size_t size = ggml_nelements(tensor);
     std::vector<float> data(size);
 
-#if 0
-    static std::default_random_engine generator(1234);
-    std::uniform_real_distribution<float> distribution(min, max);
-
-    for (size_t i = 0; i < size; i++) {
-        data[i] = distribution(generator);
-    }
-#else
-    auto init_thread = [&](size_t start, size_t end) {
-        std::random_device rd;
-        std::default_random_engine generator(rd());
+    auto init_thread = [&](size_t ith, size_t start, size_t end) {
         std::uniform_real_distribution<float> distribution(min, max);
-
         for (size_t i = start; i < end; i++) {
-            data[i] = distribution(generator);
+            data[i] = distribution(generators[ith]);
         }
     };
 
-    size_t n_threads = std::thread::hardware_concurrency();
     std::vector<std::thread> threads;
     threads.reserve(n_threads);
     for (size_t i = 0; i < n_threads; i++) {
         size_t start =     i*size/n_threads;
         size_t end   = (i+1)*size/n_threads;
-        threads.emplace_back(init_thread, start, end);
+        threads.emplace_back(init_thread, i, start, end);
     }
     for (auto & t : threads) {
         t.join();
     }
-#endif
 
     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
         ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
@@ -56,7 +54,16 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
         GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
         std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
         int64_t hist[16];
-        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, nullptr);
+        std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
+        const float * im = imatrix.data();
+        if (!ggml_quantize_requires_imatrix(tensor->type)) {
+            // when the imatrix is optional, we want to test both quantization with and without imatrix
+            // use one of the random numbers to decide
+            if (data[0] > 0.5f*(min + max)) {
+                im = nullptr;
+            }
+        }
+        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, im);
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.
@@ -1472,7 +1479,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         GGML_TYPE_Q8_0,
         GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
         GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
-        GGML_TYPE_Q6_K
+        GGML_TYPE_Q6_K,
+        GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
     };
 
     // unary ops
@@ -1752,6 +1760,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    ggml_quantize_free();
+
     printf("\033[1;32mOK\033[0m\n");
     return 0;
 }

From 6b6916b215251e09bd57cdbf870dc8a73345edc2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 20:54:50 +0200
Subject: [PATCH 061/138] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index be9e408fb..4d52d946b 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-b306d6e996ec0ace77118fa5098822cdc7f9c88f
+6c1ce0bd591a430c1d3f6797d905194581c878c1

From 1e605f4102c7ea8dc0dff82f5eaa6a71973d549f Mon Sep 17 00:00:00 2001
From: Paul Tsochantaris <ptsochantaris@icloud.com>
Date: Thu, 18 Jan 2024 08:47:24 +0000
Subject: [PATCH 062/138] metal : fix memory leak, dangling pointer and unused
 autorel (#5007)

* Metal memory: Small memory leak on init, dangling pointer, and unused autorelease pool in graph compute

* SPM header potential fix

* Reverting symlinks
---
 ggml-metal.m | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 66d4d675e..912ddc83f 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -238,21 +238,19 @@ static void * ggml_metal_host_malloc(size_t n) {
 static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
-    id<MTLDevice> device;
-    NSString * s;
-
-#if TARGET_OS_OSX
+#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
     // Show all the Metal device instances in the system
     NSArray * devices = MTLCopyAllDevices();
-    for (device in devices) {
-        s = [device name];
+    for (id<MTLDevice> device in devices) {
+        NSString * s = [device name];
         GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
     }
+    [devices release]; // since it was created by a *Copy* C method
 #endif
 
     // Pick and show default Metal device
-    device = MTLCreateSystemDefaultDevice();
-    s = [device name];
+    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+    NSString * s = [device name];
     GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
 
     // Configure context
@@ -712,7 +710,6 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
 static bool ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {
-    @autoreleasepool {
 
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
     edesc.dispatchType = MTLDispatchTypeSerial;
@@ -2255,7 +2252,6 @@ static bool ggml_metal_graph_compute(
     }
 
     return true;
-    }
 }
 
 ////////////////////////////////////////////////////////////////////////////////

From dcad445d0c83ad49bca1b58cf9c139cfcebee5d4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 18 Jan 2024 11:44:49 +0200
Subject: [PATCH 063/138] scritps : add helper script to get hellaswag data in
 txt format

---
 scripts/get-hellaswag.sh | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100755 scripts/get-hellaswag.sh

diff --git a/scripts/get-hellaswag.sh b/scripts/get-hellaswag.sh
new file mode 100755
index 000000000..ef8dcceb0
--- /dev/null
+++ b/scripts/get-hellaswag.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+wget https://raw.githubusercontent.com/klosax/hellaswag_text_data/main/hellaswag_val_full.txt
+
+echo "Usage:"
+echo ""
+echo "  ./perplexity --hellaswag --hellaswag-tasks N -f hellaswag_val_full.txt -m modelfile.gguf"
+echo ""
+
+exit 0

From 682986a08eb5cb04865d2e713449f17304d266d8 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Thu, 18 Jan 2024 13:46:27 +0200
Subject: [PATCH 064/138] Add Winogrande evaluation (#5015)

* winogrande: simple implementation

It doesn't look like it is working - why?
For Mistral-7B it is barely better than
random chance (score ~60% for 1267 tasks), while I see
Mistral-7B scoring 78.4% on the HF leader board.
1-sigma statistical uncertainty for 1267 tasks is ~1.4,
so no way the difference is due to statistics.

* winogrande: somewhat better

Score for Mistrali7-B is now 68.9 on the validation set of
winogrande_debiased. Still far from the reported 78.4, but
better than what I had before.

* winogrande: improving

Mistral-7B score is now 73.56.
Still not quite 78.4 but getting there.
We are also getting a lower score on HellaSwag
compared to HF leader board, so I'm not expecting
we will get up to 78.4 anyway.

It looks like it is better to skip the choice word(s)
when evaluating the average log-likelihood. This kind of
makes sense because a more common word (in Winogrande this is
often a name) will have a higher probability without knowing
about the follow up context, and this will skew the log-likelihood
towards the more common word. We can only do this if the
choice words are not last in the sentence.

It also looks like it is better to skip the punctuation at the
end of the sentence, provided the choice words are not last.

* winogrande: add dataset instructions

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 common/common.cpp                  |  10 ++
 common/common.h                    |   3 +
 examples/perplexity/perplexity.cpp | 243 ++++++++++++++++++++++++++++-
 3 files changed, 251 insertions(+), 5 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 2b0865fff..ce20360a4 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -681,6 +681,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.hellaswag_tasks = std::stoi(argv[i]);
+        } else if (arg == "--winogrande") {
+            params.winogrande = true;
+        } else if (arg == "--winogrande-tasks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.winogrande_tasks = std::stoi(argv[i]);
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
@@ -926,6 +934,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
     printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+    printf("  --winogrande          compute Winogrande score over random tasks from datafile supplied with -f\n");
+    printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
     printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
     printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
diff --git a/common/common.h b/common/common.h
index 1f43e6282..0ae9c18b3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -105,6 +105,9 @@ struct gpt_params {
     bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
 
+    bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+
     bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
     bool random_prompt     = false; // do not randomize prompt if none provided
     bool use_color         = false; // use color to distinguish generations and inputs
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b4fedf803..57eaa713e 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -9,6 +9,9 @@
 #include <thread>
 #include <mutex>
 #include <vector>
+#include <array>
+#include <fstream>
+#include <sstream>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -419,9 +422,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     return {tokens, ppl, logit_history, prob_history};
 }
 
-static std::vector<float> hellaswag_evaluate_tokens(
-    llama_context * ctx, std::vector<int> & tokens, int n_past, int n_batch, int n_vocab
-) {
+static std::vector<float> evaluate_tokens(llama_context * ctx, std::vector<int> & tokens,
+        int n_past, int n_batch, int n_vocab) {
     std::vector<float> result;
     result.reserve(tokens.size() * n_vocab);
     size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
@@ -573,7 +575,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         // clear the KV cache
         llama_kv_cache_clear(ctx);
 
-        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
+        auto logits = evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
         if (logits.empty()) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return;
@@ -622,7 +624,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             //}
 
             // Evaluate the query
-            logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab);
+            logits = evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab);
             if (logits.empty()) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return;
@@ -676,6 +678,235 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     printf("\n");
 }
 
+struct winogrande_entry {
+    std::string first;
+    std::string second;
+    std::array<std::string, 2> choices;
+    int answer;
+};
+
+static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
+    std::vector<winogrande_entry> result;
+    std::istringstream in(prompt);
+    std::string line;
+    std::array<int, 4> comma_pos;
+    while (true) {
+        std::getline(in, line);
+        if (in.fail() || in.eof()) break;
+        int ipos = 0;
+        bool quote_open = false;
+        for (int i = 0; i < int(line.size()); ++i) {
+            if (!quote_open) {
+                if (line[i] == ',') {
+                    comma_pos[ipos++] = i;
+                    if (ipos == 4) break;
+                }
+                else if (line[i] == '"') {
+                    quote_open = true;
+                }
+            }
+            else {
+                if (line[i] == '"') {
+                    quote_open = false;
+                }
+            }
+        }
+        if (ipos != 4) {
+            printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
+            continue;
+        }
+        auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
+                                                    : line.substr(comma_pos[0]+1, comma_pos[1] - comma_pos[0] - 1);
+        auto choice1 = line.substr(comma_pos[1]+1, comma_pos[2] - comma_pos[1] - 1);
+        auto choice2 = line.substr(comma_pos[2]+1, comma_pos[3] - comma_pos[2] - 1);
+        auto answer  = line.substr(comma_pos[3]+1, line.size() - comma_pos[3] - 1);
+        auto index = line.substr(0, comma_pos[0]);
+        int where = 0;
+        for ( ; where < int(sentence.size()); ++where) {
+            if (sentence[where] == '_') break;
+        }
+        if (where == int(sentence.size())) {
+            printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
+            continue;
+        }
+        std::istringstream stream(answer.c_str());
+        int i_answer; stream >> i_answer;
+        if (stream.fail() || i_answer < 1 || i_answer > 2) {
+            printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
+            continue;
+        }
+        result.emplace_back();
+        auto& wg = result.back();
+        wg.first = sentence.substr(0, where);
+        wg.second = sentence.substr(where + 1, sentence.size() - where - 1);
+        wg.choices[0] = std::move(choice1);
+        wg.choices[1] = std::move(choice2);
+        wg.answer = i_answer;
+    }
+    return result;
+}
+
+/*
+ * Evaluates the Winogrande score.
+ * Uses a CSV containing task index, dentence, choice 1, choice 2, answer (1 or 2)
+ * You can get one such dataset from e.g. https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp
+ * As an example, the 1st row in the above dataset is
+ *
+ *    0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2
+ *
+ */
+static void winogrande_score(llama_context * ctx, const gpt_params & params) {
+
+    constexpr int k_min_trailing_ctx = 3;
+
+    auto data = load_winogrande_from_csv(params.prompt);
+    if (data.empty()) {
+        fprintf(stderr, "%s: no tasks\n", __func__);
+        return;
+    }
+
+    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
+
+    if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
+        fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
+        std::mt19937 rng(1);
+        std::vector<int> aux(data.size());
+        for (int i = 0; i < int(data.size()); ++i) {
+            aux[i] = i;
+        }
+        float scale = 1/(1.f + (float)rng.max());
+        std::vector<winogrande_entry> selected;
+        selected.reserve(params.winogrande_tasks);
+        for (int i = 0; i < int(params.winogrande_tasks); ++i) {
+            int j = int(scale*rng()*aux.size());
+            selected[i] = std::move(data[aux[j]]);
+            aux[j] = aux.back();
+            aux.pop_back();
+        }
+        data = std::move(selected);
+    }
+
+    // This is needed as usual for LLaMA models
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+    fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
+
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_ctx = llama_n_ctx(ctx);
+
+    std::vector<float> tok_logits(n_vocab);
+
+    int n_correct = 0;
+    int n_done    = 0;
+
+    for (size_t task_idx = 0; task_idx < data.size(); task_idx++) {
+        const auto& task = data[task_idx];
+
+        auto base_context = ::llama_tokenize(ctx, task.first, add_bos);
+        auto base_ctx_1st = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos);
+        auto base_ctx_2nd = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos);
+
+        auto sentence_1st = task.first + task.choices[0] + task.second;
+        auto sentence_2nd = task.first + task.choices[1] + task.second;
+        auto query_1st = ::llama_tokenize(ctx, sentence_1st, add_bos);
+        auto query_2nd = ::llama_tokenize(ctx, sentence_2nd, add_bos);
+
+        if (query_1st.size() > (size_t)n_ctx || query_2nd.size() > (size_t)n_ctx) {
+            fprintf(stderr, "%s : number of tokens in queries %zu, %zu > n_ctxl\n", __func__, query_1st.size(), query_2nd.size());
+            return;
+        }
+
+        auto query_1st_size = query_1st.size();
+        auto query_2nd_size = query_2nd.size();
+
+        // Speedup small evaluations by evaluating atleast 32 tokens
+        // For Winogrande this seems to slow it down rather than speed it up.
+        //if (query_1st.size() < 32) query_1st.resize(32);
+        //if (query_2nd.size() < 32) query_2nd.resize(32);
+
+        llama_kv_cache_clear(ctx);
+        auto logits_1st = evaluate_tokens(ctx, query_1st, 0, params.n_batch, n_vocab);
+
+        llama_kv_cache_clear(ctx);
+        auto logits_2nd = evaluate_tokens(ctx, query_2nd, 0, params.n_batch, n_vocab);
+
+        if (logits_1st.empty() || logits_2nd.empty()) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return;
+        }
+
+        bool skip_choice = query_1st_size - base_ctx_1st.size() > k_min_trailing_ctx &&
+                           query_2nd_size - base_ctx_2nd.size() > k_min_trailing_ctx;
+
+        float score_1st = 0;
+        bool is_nan_1st = false;
+        const auto& base_1 = skip_choice ? base_ctx_1st : base_context;
+        const int last_1st = query_1st_size - base_1.size() > 1 ? 1 : 0;
+        for (size_t j = base_1.size()-1; j < query_1st_size-1-last_1st; ++j) {
+            std::memcpy(tok_logits.data(), logits_1st.data() + j*n_vocab, n_vocab*sizeof(float));
+            const float prob = softmax(tok_logits)[query_1st[j+1]];
+            if (std::isnan(prob) || !prob) {
+                fprintf(stderr, "%s: %g probability for token %zu when evaluating <%s>. Base context has %zu tokens\n", __func__,
+                        prob, j, sentence_1st.c_str(), base_context.size());
+                is_nan_1st = true;
+                break;
+            }
+            score_1st += std::log(prob);
+        }
+        score_1st /= (query_1st_size - base_1.size() - last_1st);
+
+        float score_2nd = 0;
+        bool is_nan_2nd = false;
+        const auto& base_2 = skip_choice ? base_ctx_2nd : base_context;
+        const int last_2nd = query_2nd_size - base_2.size() > 1 ? 1 : 0;
+        for (size_t j = base_2.size()-1; j < query_2nd_size-1-last_2nd; ++j) {
+            std::memcpy(tok_logits.data(), logits_2nd.data() + j*n_vocab, n_vocab*sizeof(float));
+            const float prob = softmax(tok_logits)[query_2nd[j+1]];
+            if (std::isnan(prob) || !prob) {
+                fprintf(stderr, "%s: %g probability for token %zu when evaluating <%s>. Base context has %zu tokens\n", __func__,
+                        prob, j, sentence_2nd.c_str(), base_context.size());
+                is_nan_2nd = true;
+                break;
+            }
+            score_2nd += std::log(prob);
+        }
+        score_2nd /= (query_2nd_size - base_2.size() - last_2nd);
+
+        if (is_nan_1st || is_nan_2nd) {
+            continue;
+        }
+
+        if (std::isnan(score_1st) || std::isnan(score_2nd)) {
+            printf("================== NaN score %g, %g) for:\n", score_1st, score_2nd);
+            printf("Q1: <%s> - %zu tokens\n", sentence_1st.c_str(), query_1st_size);
+            printf("Q2: <%s> - %zu tokens\n", sentence_2nd.c_str(), query_2nd_size);
+            printf("B : <%s> - %zu tokens\n", task.first.c_str(), base_context.size());
+            printf("base_1 has %zu tokens, base_2 has %zu tokens, skip_choice = %d\n", base_1.size(), base_2.size(), skip_choice);
+            continue;
+        }
+
+        int result = score_1st > score_2nd ? 1 : 2;
+
+        if (result == task.answer) {
+            ++n_correct;
+        }
+        ++n_done;
+
+        // Print the accumulated accuracy mean x 100
+        printf("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n",task_idx+1, 100.0 * n_correct/n_done,score_1st,score_2nd,result,task.answer);
+        fflush(stdout);
+    }
+
+    printf("\n");
+
+    if (n_done < 100) return;
+
+    const float p = 1.f*n_correct/n_done;
+    const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
+    printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
+}
+
+
 int main(int argc, char ** argv) {
     gpt_params params;
 
@@ -733,6 +964,8 @@ int main(int argc, char ** argv) {
     struct results_perplexity results;
     if (params.hellaswag) {
         hellaswag_score(ctx, params);
+    } else if (params.winogrande) {
+        winogrande_score(ctx, params);
     } else {
         results = perplexity(ctx, params);
     }

From ad19812cda4062c9f154ef16315df41fbe6a770a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 18 Jan 2024 15:33:01 +0200
Subject: [PATCH 065/138] perplexity : faster HellaSwag via batching (#5017)

* perplexity : faster HellaSwag

ggml-ci

* perplexity : clean-up

ggml-ci

* perplexity : no need for decode_helper

ggml-ci

* perplexity : add comments

* perplexity : option to specify max batched tasks via `n_parallel`

* perplexity : remove HellaSwag restruction for n_batch
---
 examples/perplexity/perplexity.cpp | 261 ++++++++++++++++-------------
 1 file changed, 149 insertions(+), 112 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 57eaa713e..ea2c8026c 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -470,7 +470,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         prompt_lines.push_back(line);
     }
 
-    if( prompt_lines.size() % 6 != 0) {
+    if (prompt_lines.size() % 6 != 0) {
         fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
         return;
     }
@@ -485,7 +485,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
     // Number of tasks to use when computing the score
-    if ( params.hellaswag_tasks < hs_task_count  ) {
+    if (params.hellaswag_tasks < hs_task_count) {
         hs_task_count = params.hellaswag_tasks;
     }
 
@@ -502,27 +502,54 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         std::string ending[4];
         size_t ending_logprob_count[4];
         double ending_logprob[4];
+
+        size_t i_batch;         // starting index in the llama_batch
+        size_t common_prefix;   // max number of initial tokens that are the same in all sentences
+        size_t required_tokens; // needed number of tokens to evaluate all 4 endings
+        std::vector<llama_token> seq_tokens[4];
     };
 
     fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );
 
     // Select and read data from prompt lines
-    hs_data_t *hs_data = new hs_data_t[hs_task_count];
-    for (size_t i=0; i < hs_task_count; i++) {
+    std::vector<hs_data_t> hs_data(hs_task_count);
+    for (size_t i = 0; i < hs_task_count; i++) {
         size_t idx = i;
 
+        auto & hs_cur = hs_data[i];
+
         // Select a random example of those left in the prompt
         if (randomize_tasks) {
             std::uniform_int_distribution<size_t> dist(0, prompt_lines.size()/6-1 ) ;
             idx = dist(rng);
         }
 
-        hs_data[i].context = prompt_lines[idx*6];
-        hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
-        for (size_t j=0; j < 4; j++) {
-            hs_data[i].ending[j] = prompt_lines[idx*6+2+j];
+        hs_cur.context = prompt_lines[idx*6];
+        hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
+        for (size_t j = 0; j < 4; j++) {
+            hs_cur.ending[j] = prompt_lines[idx*6+2+j];
+            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
         }
 
+        // determine the common prefix of the endings
+        hs_cur.common_prefix = 0;
+        hs_cur.required_tokens = 0;
+        for (size_t k = 0; k < hs_cur.seq_tokens[0].size(); k++) {
+            if (hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[1][k] ||
+                hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[2][k] ||
+                hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[3][k]) {
+                break;
+            }
+            hs_cur.common_prefix++;
+        }
+        hs_cur.required_tokens = hs_cur.common_prefix +
+            hs_cur.seq_tokens[0].size() - hs_cur.common_prefix +
+            hs_cur.seq_tokens[1].size() - hs_cur.common_prefix +
+            hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
+            hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
+
+        //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
+
         // Delete the selected random example from the prompt
         if (randomize_tasks) {
             prompt_lines.erase( std::next(prompt_lines.begin(),idx*6)  , std::next(prompt_lines.begin(),idx*6+6) );
@@ -530,150 +557,160 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     }
 
     fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
+
     printf("\ntask\tacc_norm\n");
 
     double acc = 0.0f;
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-    const int n_ctx = llama_n_ctx(ctx);
 
-    std::vector<std::vector<int>> ending_tokens(4);
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_ctx   = llama_n_ctx(ctx);
+    const int n_batch = params.n_batch;
+
+    const int max_tasks_per_batch = params.n_parallel;
+    const int max_seq = 4*max_tasks_per_batch;
+
+    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
 
     std::vector<float> tok_logits(n_vocab);
+    std::vector<float> batch_logits(n_ctx*n_vocab);
 
-    for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
-        // Tokenize the context to count tokens
-        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
-        size_t context_size = context_embd.size();
+    auto decode_helper = [&](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
+        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
 
-        for (int i = 0; i < 4; ++i) {
-            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos);
-            for (int k = 0; k < int(context_size); ++k) {
-                if (ending_tokens[i][k] != context_embd[k]) {
-                    fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
-                    break;
+            llama_batch batch_view = {
+                n_tokens,
+                batch.token    + i,
+                nullptr,
+                batch.pos      + i,
+                batch.n_seq_id + i,
+                batch.seq_id   + i,
+                batch.logits   + i,
+                0, 0, 0, // unused
+            };
+
+            const int ret = llama_decode(ctx, batch_view);
+            if (ret != 0) {
+                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+                return false;
+            }
+
+            memcpy(batch_logits.data() + i*n_vocab, llama_get_logits(ctx), n_tokens*n_vocab*sizeof(float));
+        }
+
+        return true;
+    };
+
+    for (size_t i0 = 0; i0 < hs_task_count; i0++) {
+        int n_cur = 0;
+
+        size_t i1 = i0;
+        size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
+
+        llama_batch_clear(batch);
+
+        // batch as much tasks as possible into the available context
+        // each task has 4 unique seuqnce ids - one for each ending
+        // the common prefix is shared among the 4 sequences to save tokens
+        // we extract logits only from the last common token and from all ending tokens of each sequence
+        while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) {
+            auto & hs_cur = hs_data[i1];
+
+            const int s0 = 4*(i1 - i0);
+            if (s0 + 4 > max_seq) {
+                break;
+            }
+
+            for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
+                llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
+            }
+            batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
+
+            for (int s = 0; s < 4; ++s) {
+                for (size_t i = hs_cur.common_prefix; i < hs_cur.seq_tokens[s].size(); ++i) {
+                    llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, true);
                 }
             }
+
+            hs_cur.i_batch = i_batch;
+            i_batch += hs_cur.required_tokens;
+
+            n_cur += hs_data[i1].required_tokens;
+            if (++i1 == hs_task_count) {
+                break;
+            }
         }
 
-        // Do the 1st ending
-        // In this case we include the context when evaluating
-        //auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
-        auto query_embd = ending_tokens[0];
-        auto query_size = query_embd.size();
-
-        // Stop if query wont fit the ctx window
-        if (query_size > (size_t)n_ctx) {
-            fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
+        if (i0 == i1) {
+            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
             return;
         }
 
-        // Speedup small evaluations by evaluating atleast 32 tokens
-        if (query_size < 32) {
-            query_embd.resize(32);
-        }
-
-        // clear the KV cache
         llama_kv_cache_clear(ctx);
 
-        auto logits = evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
-        if (logits.empty()) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
+        // decode all tasks [i0, i1)
+        if (!decode_helper(ctx, batch, n_batch)) {
+            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
             return;
         }
 
-        std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float));
-        const auto first_probs = softmax(tok_logits);
+        // compute the logprobs for each ending of the decoded tasks
+        for (size_t i = i0; i < i1; ++i) {
+            auto & hs_cur = hs_data[i];
 
-        hs_data[task_idx].ending_logprob_count[0] = 1;
-        hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);
+            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(hs_cur.i_batch + hs_cur.common_prefix - 1), n_vocab*sizeof(float));
 
-        // Calculate the logprobs over the ending
-        for (size_t j = context_size; j < query_size - 1; j++) {
+            const auto first_probs = softmax(tok_logits);
 
-            std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
+            size_t li = hs_cur.common_prefix; // logits index in the batch
 
-            const float prob = softmax(tok_logits)[query_embd[j + 1]];
+            for (int s = 0; s < 4; ++s) {
+                hs_cur.ending_logprob_count[s] = 1;
+                hs_cur.ending_logprob[s] = std::log(first_probs[hs_cur.seq_tokens[s][hs_cur.common_prefix]]);
 
-            hs_data[task_idx].ending_logprob[0] += std::log(prob);
-            hs_data[task_idx].ending_logprob_count[0]++;
-        }
+                // Calculate the logprobs over the ending
+                for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
+                    std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(hs_cur.i_batch + li++), n_vocab*sizeof(float));
 
-        // Calculate the mean token logprob for acc_norm
-        hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];
+                    const float prob = softmax(tok_logits)[hs_cur.seq_tokens[s][j + 1]];
 
-        // Do the remaining endings
-        // For these, we use the bare ending with n_past = context_size
-        //
-        for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
+                    hs_cur.ending_logprob[s] += std::log(prob);
+                    hs_cur.ending_logprob_count[s]++;
+                }
 
-            // Tokenize the query
-            query_embd.resize(ending_tokens[ending_idx].size() - context_size);
-            std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int));
-            query_size = query_embd.size();
+                // account that we skip the last token in the ending
+                ++li;
 
-            // Stop if query wont fit the ctx window
-            if (context_size + query_size > (size_t)n_ctx) {
-                fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
-                return;
+                // Calculate the mean token logprob for acc_norm
+                hs_cur.ending_logprob[s] /= hs_cur.ending_logprob_count[s];
             }
 
-            // Speedup small evaluations by evaluating atleast 32 tokens
-            // No, resizing to 32 is actually slightly slower (at least on CUDA)
-            //if (query_size < 32) {
-            //    query_embd.resize(32);
-            //}
-
-            // Evaluate the query
-            logits = evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab);
-            if (logits.empty()) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return;
+            // Find the ending with maximum logprob
+            size_t ending_logprob_max_idx = 0;
+            double ending_logprob_max_val = hs_cur.ending_logprob[0];
+            for (size_t s = 1; s < 4; s++) {
+                if (hs_cur.ending_logprob[s] > ending_logprob_max_val) {
+                    ending_logprob_max_idx = s;
+                    ending_logprob_max_val =  hs_cur.ending_logprob[s];
+                }
             }
 
-            hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
-            hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
+            //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
 
-            // Calculate the logprobs over the ending
-            for (size_t j = 0; j < query_size - 1; j++) {
-                std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
-
-                const float prob = softmax(tok_logits)[query_embd[j + 1]];
-
-                hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
-                hs_data[task_idx].ending_logprob_count[ending_idx]++;
+            // If the gold ending got the maximum logprobe add one accuracy point
+            if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
+                acc += 1.0;
             }
 
-            // Calculate the mean token logprob for acc_norm
-            hs_data[task_idx].ending_logprob[ending_idx] /= hs_data[task_idx].ending_logprob_count[ending_idx];
-
-
-//            printf("task %lu, ending %lu, whole_len %lu, context_len %lu, ending_logprob_count %lu, ending_logprob %.4f\n",
-//                task_idx,ending_idx,whole_size,context_size, hs_data[task_idx].ending_logprob_count[ending_idx], hs_data[task_idx].ending_logprob[ending_idx] );
+            // Print the accumulated accuracy mean x 100
+            printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
+            fflush(stdout);
         }
 
-        // Find the ending with maximum logprob
-        size_t ending_logprob_max_idx = 0;
-        double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
-        for (size_t j = 1; j < 4; j++) {
-            if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
-                ending_logprob_max_idx = j;
-                ending_logprob_max_val =  hs_data[task_idx].ending_logprob[j];
-            }
-        }
-
-//        printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_data[task_idx].gold_ending_idx);
-
-        // If the gold ending got the maximum logprobe add one accuracy point
-        if (ending_logprob_max_idx == hs_data[task_idx].gold_ending_idx) {
-            acc += 1.0;
-        }
-
-        // Print the accumulated accuracy mean x 100
-        printf("%zu\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0);
-        fflush(stdout);
+        i0 = i1 - 1;
     }
 
-    delete [] hs_data;
+    llama_batch_free(batch);
 
     printf("\n");
 }

From 3e945cc1e9c06d2001031360e4e303e9548fb02c Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Thu, 18 Jan 2024 19:18:21 +0200
Subject: [PATCH 066/138] HellaSwag: speed up by parallelizing log-prob
 evaluation (#5020)

For Mistral-7B and fp16, time on my system goes down from 536 seconds
to 423 seconds for the full evaluation dataset (10042 tasks).

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 examples/perplexity/perplexity.cpp | 80 ++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 14 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index ea2c8026c..9498dd535 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -8,6 +8,7 @@
 #include <sstream>
 #include <thread>
 #include <mutex>
+#include <atomic>
 #include <vector>
 #include <array>
 #include <fstream>
@@ -444,6 +445,48 @@ static std::vector<float> evaluate_tokens(llama_context * ctx, std::vector<int>
     return result;
 }
 
+static void hellaswag_compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
+        const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
+    constexpr int k_token_chunk = 4;
+    if (eval_results.size() != eval_pairs.size()) {
+        eval_results.resize(eval_pairs.size());
+    }
+    if (eval_pairs.empty()) return;
+
+    size_t max_threads = std::min((eval_pairs.size() + k_token_chunk - 1)/k_token_chunk, workers.size());
+
+    std::atomic<int> counter(0);
+    auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
+        float local_logprobs[k_token_chunk];
+        while (true) {
+            size_t first = counter.fetch_add(k_token_chunk, std::memory_order_relaxed);
+            if (first >= eval_results.size()) break;
+            size_t last = std::min(first + k_token_chunk, eval_results.size());
+            for (size_t i = first; i < last; ++i) {
+                auto logits = batch_logits + eval_pairs[i].first * n_vocab;
+                float max_logit = logits[0];
+                for (int j = 1; j < n_vocab; ++j) {
+                    max_logit = std::max(max_logit, logits[j]);
+                }
+                float sum_p = 0.f;
+                for (int j = 0; j < n_vocab; ++j) {
+                    sum_p += expf(logits[j] - max_logit);
+                }
+                local_logprobs[i - first] = logits[eval_pairs[i].second] - max_logit - std::log(sum_p);
+            }
+            std::memcpy(eval_results.data() + first, local_logprobs, (last - first)*sizeof(float));
+        }
+    };
+
+    for (size_t it = 0; it < max_threads; ++it) {
+        workers[it] = std::thread(compute);
+    }
+    for (size_t it = 0; it < max_threads; ++it) {
+        workers[it].join();
+    }
+
+}
+
 static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     // Calculates hellaswag score (acc_norm) from prompt
     //
@@ -574,6 +617,10 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     std::vector<float> tok_logits(n_vocab);
     std::vector<float> batch_logits(n_ctx*n_vocab);
 
+    std::vector<std::pair<size_t, llama_token>> eval_pairs;
+    std::vector<float> eval_results;
+    std::vector<std::thread> workers(std::thread::hardware_concurrency());
+
     auto decode_helper = [&](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
@@ -654,6 +701,24 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             return;
         }
 
+        // Compute log-probs in parallel
+        // First we collect all tasks
+        eval_pairs.clear();
+        for (size_t i = i0; i < i1; ++i) {
+            auto & hs_cur = hs_data[i];
+            size_t li = hs_cur.common_prefix;
+            for (int s = 0; s < 4; ++s) {
+                for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
+                    eval_pairs.push_back(std::make_pair(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]));
+                }
+                ++li;
+            }
+        }
+        // Then we do the actual calculation
+        hellaswag_compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
+
+        size_t ir = 0;
+
         // compute the logprobs for each ending of the decoded tasks
         for (size_t i = i0; i < i1; ++i) {
             auto & hs_cur = hs_data[i];
@@ -662,26 +727,13 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
             const auto first_probs = softmax(tok_logits);
 
-            size_t li = hs_cur.common_prefix; // logits index in the batch
-
             for (int s = 0; s < 4; ++s) {
                 hs_cur.ending_logprob_count[s] = 1;
                 hs_cur.ending_logprob[s] = std::log(first_probs[hs_cur.seq_tokens[s][hs_cur.common_prefix]]);
-
-                // Calculate the logprobs over the ending
                 for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
-                    std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(hs_cur.i_batch + li++), n_vocab*sizeof(float));
-
-                    const float prob = softmax(tok_logits)[hs_cur.seq_tokens[s][j + 1]];
-
-                    hs_cur.ending_logprob[s] += std::log(prob);
+                    hs_cur.ending_logprob[s] += eval_results[ir++];
                     hs_cur.ending_logprob_count[s]++;
                 }
-
-                // account that we skip the last token in the ending
-                ++li;
-
-                // Calculate the mean token logprob for acc_norm
                 hs_cur.ending_logprob[s] /= hs_cur.ending_logprob_count[s];
             }
 

From b46757735d30f5c6ed4f20ebeccc684e02d4f3bf Mon Sep 17 00:00:00 2001
From: David Sommers <12738+databyte@users.noreply.github.com>
Date: Thu, 18 Jan 2024 12:20:59 -0500
Subject: [PATCH 067/138] convert.py : fix llama/llama2 conversion due to
 vocab_size=-1 (#5019)

PR #4818 (merged last week) reintroduced a config check for vocab_size that was addressed in PR #4258 (merged 2023-11-30).

Without the fix, llama2 models can't be converted. The error is:

`ValueError: The model's vocab size is set to -1 in params.json. Please update it manually. Maybe 32000?`
---
 convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert.py b/convert.py
index e38ee5315..980e6fc72 100755
--- a/convert.py
+++ b/convert.py
@@ -348,7 +348,7 @@ class Params:
             f_rope_freq_base = 1e6
 
         return Params(
-            n_vocab=config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
+            n_vocab=model["tok_embeddings.weight"].shape[0],
             n_embd=config["dim"],
             n_layer=config["n_layers"],
             n_ctx=n_ctx,

From e9240cdfa06a50c1b5dbafa367cb8cd698e65103 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 18 Jan 2024 20:45:39 +0200
Subject: [PATCH 068/138] scripts : add get-winogrande.sh

---
 scripts/get-hellaswag.sh  |  2 +-
 scripts/get-wikitext-2.sh |  7 +++++++
 scripts/get-winogrande.sh | 10 ++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100755 scripts/get-winogrande.sh

diff --git a/scripts/get-hellaswag.sh b/scripts/get-hellaswag.sh
index ef8dcceb0..121979fe2 100755
--- a/scripts/get-hellaswag.sh
+++ b/scripts/get-hellaswag.sh
@@ -4,7 +4,7 @@ wget https://raw.githubusercontent.com/klosax/hellaswag_text_data/main/hellaswag
 
 echo "Usage:"
 echo ""
-echo "  ./perplexity --hellaswag --hellaswag-tasks N -f hellaswag_val_full.txt -m modelfile.gguf"
+echo "  ./perplexity -m model.gguf -f hellaswag_val_full.txt --hellaswag [--hellaswag-tasks N] [other params]"
 echo ""
 
 exit 0
diff --git a/scripts/get-wikitext-2.sh b/scripts/get-wikitext-2.sh
index 98aec3e3e..ff96f331e 100755
--- a/scripts/get-wikitext-2.sh
+++ b/scripts/get-wikitext-2.sh
@@ -1,3 +1,10 @@
 #!/bin/bash
 
 wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+
+echo "Usage:"
+echo ""
+echo "  ./perplexity -m model.gguf -f wiki.test.raw [other params]"
+echo ""
+
+exit 0
diff --git a/scripts/get-winogrande.sh b/scripts/get-winogrande.sh
new file mode 100755
index 000000000..5f234468e
--- /dev/null
+++ b/scripts/get-winogrande.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+wget https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp/raw/main/winogrande-debiased-eval.csv
+
+echo "Usage:"
+echo ""
+echo "  ./perplexity -m model.gguf -f winogrande-debiased-eval.csv --winogrande [--winogrande-tasks N] [other params]"
+echo ""
+
+exit 0

From d391ae9b4919e24624cc963d82162450848beaf4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 18 Jan 2024 20:49:00 +0200
Subject: [PATCH 069/138] perplexity : fix winogrande N tasks option

---
 examples/perplexity/perplexity.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 9498dd535..f72ea6d1c 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -865,7 +865,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
         }
         float scale = 1/(1.f + (float)rng.max());
         std::vector<winogrande_entry> selected;
-        selected.reserve(params.winogrande_tasks);
+        selected.resize(params.winogrande_tasks);
         for (int i = 0; i < int(params.winogrande_tasks); ++i) {
             int j = int(scale*rng()*aux.size());
             selected[i] = std::move(data[aux[j]]);

From 2d5419d08ab1131623e6a1d554607b7663435e87 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 18 Jan 2024 21:45:51 +0200
Subject: [PATCH 070/138] imatrix : fix assert for src0 non-cont check

---
 examples/imatrix/imatrix.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index af78711c5..5a3d30b88 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -80,7 +80,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         // for simplicity, always copy src0 to host, because it is small
         // take into account that src0 is not contiguous!
         GGML_ASSERT(src0->ne[1] == src1->ne[1]);
-        GGML_ASSERT(n_as*ggml_nrows(src0));
+        GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
         m_ids.resize(ggml_nbytes(src0)/sizeof(int));
         ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
 

From 96d7f56d2918ffde1995dbb32392571deb76d7fc Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Thu, 18 Jan 2024 21:12:15 +0100
Subject: [PATCH 071/138] llama : fix mlock with no-mmap with Metal (#5025)

---
 llama.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index d28382f7d..f1d00a96c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1599,7 +1599,7 @@ struct llama_model {
     std::unique_ptr<llama_mmap> mapping;
 
     // objects representing data potentially being locked in memory
-    llama_mlock mlock_buf;
+    std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
     llama_mlock mlock_mmap;
 
     // for quantize-stats only
@@ -3815,8 +3815,10 @@ static bool llm_load_tensors(
         else {
             buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
-                model.mlock_buf.init   (ggml_backend_buffer_get_base(buf));
-                model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf));
+                model.mlock_bufs.emplace_back(new llama_mlock);
+                auto & mlock_buf = model.mlock_bufs.back();
+                mlock_buf->init   (ggml_backend_buffer_get_base(buf));
+                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
             }
         }
         if (buf == nullptr) {

From 821f0a271e7c9ee737945245dd7abfa22cc9b5b0 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Thu, 18 Jan 2024 21:33:05 +0100
Subject: [PATCH 072/138] server : defer tasks when "slot unavailable" (#5018)

* server: defer task when no slot is available

* remove unnecessary log

---------

Co-authored-by: Xuan Son Nguyen <xuanson.nguyen@snowpack.eu>
---
 examples/server/server.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 93f999298..0462fbd24 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1558,6 +1558,7 @@ struct llama_server_context
     void process_tasks()
     {
         std::unique_lock<std::mutex> lock(mutex_tasks);
+        std::vector<task_server> deferred_tasks;
         while (!queue_tasks.empty())
         {
             task_server task = queue_tasks.front();
@@ -1568,9 +1569,8 @@ struct llama_server_context
                     llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
                     if (slot == nullptr)
                     {
-                        LOG_TEE("slot unavailable\n");
-                        // send error result
-                        send_error(task, "slot unavailable");
+                        // if no slot is available, we defer this task for processing later
+                        deferred_tasks.push_back(task);
                         break;
                     }
 
@@ -1616,6 +1616,12 @@ struct llama_server_context
             }
         }
 
+        // add all the deferred tasks back the the queue
+        for (task_server &task : deferred_tasks)
+        {
+            queue_tasks.push_back(task);
+        }
+
         // remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
         std::vector<task_result> agg_results;
         auto queue_iterator = queue_multitasks.begin();

From 9b6ea4263ab45e02ff905bf7a29dc143ca1facc3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 18 Jan 2024 23:36:07 +0200
Subject: [PATCH 073/138] cmake : add ggml public headers (#5011)

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7bd640966..3fc65eaf2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -846,7 +846,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
               ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
 
-set(GGML_PUBLIC_HEADERS "ggml.h"
+set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
         "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
         "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
 

From 57e2a7a52a819883f40dada8a2edc24ecf48186b Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Thu, 18 Jan 2024 23:12:15 +0100
Subject: [PATCH 074/138] llama : fix falcon arch for tied output embeddings
 (#4978)

* falcon arch fix for tied output embeddings

* Update llama.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update llama.cpp

* Update llama.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 llama.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index f1d00a96c..47b4384a8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3438,7 +3438,12 @@ static bool llm_load_tensors(
                     {
                         model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                         model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                        if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
+                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,     "weight"), {n_embd, n_vocab});
+                        } else {
+                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
+                            ml.n_created--; // artificial tensor
+                        }
                     }
 
                     for (int i = 0; i < n_layer; ++i) {

From 8b20858e5e9c44b99b4b31ae9c40b8f20d01d94f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 19 Jan 2024 10:45:06 +0200
Subject: [PATCH 075/138] perplexity : faster Winogrande via batching (#5024)

* perplexity : faster Winogrande via batching

ggml-ci

* perplexity : remove unused function

* perplexity : only tokenize selected tasks for Winogrande
---
 examples/perplexity/perplexity.cpp | 287 ++++++++++++++++-------------
 1 file changed, 160 insertions(+), 127 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index f72ea6d1c..df902fb1c 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -423,26 +423,31 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     return {tokens, ppl, logit_history, prob_history};
 }
 
-static std::vector<float> evaluate_tokens(llama_context * ctx, std::vector<int> & tokens,
-        int n_past, int n_batch, int n_vocab) {
-    std::vector<float> result;
-    result.reserve(tokens.size() * n_vocab);
-    size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
-    for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
-        size_t n_tokens = tokens.size() - i_chunk * n_batch;
-        n_tokens = std::min(n_tokens, size_t(n_batch));
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return {};
+static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
+    for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+        const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+
+        llama_batch batch_view = {
+            n_tokens,
+            batch.token    + i,
+            nullptr,
+            batch.pos      + i,
+            batch.n_seq_id + i,
+            batch.seq_id   + i,
+            batch.logits   + i,
+            0, 0, 0, // unused
+        };
+
+        const int ret = llama_decode(ctx, batch_view);
+        if (ret != 0) {
+            LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+            return false;
         }
 
-        const auto logits = llama_get_logits(ctx);
-        result.insert(result.end(), logits, logits + n_tokens * n_vocab);
-
-        n_past += n_tokens;
+        memcpy(batch_logits.data() + i*n_vocab, llama_get_logits(ctx), n_tokens*n_vocab*sizeof(float));
     }
-    return result;
+
+    return true;
 }
 
 static void hellaswag_compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
@@ -576,7 +581,6 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
         // determine the common prefix of the endings
         hs_cur.common_prefix = 0;
-        hs_cur.required_tokens = 0;
         for (size_t k = 0; k < hs_cur.seq_tokens[0].size(); k++) {
             if (hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[1][k] ||
                 hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[2][k] ||
@@ -609,45 +613,18 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     const int n_ctx   = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
-    const int max_tasks_per_batch = params.n_parallel;
+    const int max_tasks_per_batch = 32;
     const int max_seq = 4*max_tasks_per_batch;
 
     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
 
     std::vector<float> tok_logits(n_vocab);
-    std::vector<float> batch_logits(n_ctx*n_vocab);
+    std::vector<float> batch_logits(n_vocab*n_ctx);
 
     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;
     std::vector<std::thread> workers(std::thread::hardware_concurrency());
 
-    auto decode_helper = [&](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-
-            llama_batch batch_view = {
-                n_tokens,
-                batch.token    + i,
-                nullptr,
-                batch.pos      + i,
-                batch.n_seq_id + i,
-                batch.seq_id   + i,
-                batch.logits   + i,
-                0, 0, 0, // unused
-            };
-
-            const int ret = llama_decode(ctx, batch_view);
-            if (ret != 0) {
-                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
-                return false;
-            }
-
-            memcpy(batch_logits.data() + i*n_vocab, llama_get_logits(ctx), n_tokens*n_vocab*sizeof(float));
-        }
-
-        return true;
-    };
-
     for (size_t i0 = 0; i0 < hs_task_count; i0++) {
         int n_cur = 0;
 
@@ -696,7 +673,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         llama_kv_cache_clear(ctx);
 
         // decode all tasks [i0, i1)
-        if (!decode_helper(ctx, batch, n_batch)) {
+        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
             fprintf(stderr, "%s: llama_decode() failed\n", __func__);
             return;
         }
@@ -772,6 +749,13 @@ struct winogrande_entry {
     std::string second;
     std::array<std::string, 2> choices;
     int answer;
+
+    size_t i_batch;
+    size_t common_prefix;
+    size_t required_tokens;
+    size_t n_base1; // number of tokens for context + choice 1
+    size_t n_base2; // number of tokens for context + choice 2
+    std::vector<llama_token> seq_tokens[2];
 };
 
 static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
@@ -875,115 +859,164 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
         data = std::move(selected);
     }
 
+    fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
+
     // This is needed as usual for LLaMA models
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
+    for (auto & task : data) {
+        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
+        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
+
+        task.common_prefix = 0;
+        for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
+            if (task.seq_tokens[0][k] != task.seq_tokens[1][k]) {
+                break;
+            }
+            task.common_prefix++;
+        }
+
+        task.required_tokens = task.common_prefix +
+            task.seq_tokens[0].size() - task.common_prefix +
+            task.seq_tokens[1].size() - task.common_prefix;
+
+        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
+        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
+    }
+
     fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
 
     const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-    const int n_ctx = llama_n_ctx(ctx);
+    const int n_ctx   = llama_n_ctx(ctx);
+    const int n_batch = params.n_batch;
+
+    const int max_tasks_per_batch = 128;
+    const int max_seq = 2*max_tasks_per_batch;
+
+    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
 
     std::vector<float> tok_logits(n_vocab);
+    std::vector<float> batch_logits(n_vocab*n_ctx);
 
     int n_correct = 0;
     int n_done    = 0;
 
-    for (size_t task_idx = 0; task_idx < data.size(); task_idx++) {
-        const auto& task = data[task_idx];
+    for (size_t i0 = 0; i0 < data.size(); i0++) {
+        int n_cur = 0;
 
-        auto base_context = ::llama_tokenize(ctx, task.first, add_bos);
-        auto base_ctx_1st = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos);
-        auto base_ctx_2nd = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos);
+        size_t i1 = i0;
+        size_t i_batch = 0;
 
-        auto sentence_1st = task.first + task.choices[0] + task.second;
-        auto sentence_2nd = task.first + task.choices[1] + task.second;
-        auto query_1st = ::llama_tokenize(ctx, sentence_1st, add_bos);
-        auto query_2nd = ::llama_tokenize(ctx, sentence_2nd, add_bos);
+        llama_batch_clear(batch);
 
-        if (query_1st.size() > (size_t)n_ctx || query_2nd.size() > (size_t)n_ctx) {
-            fprintf(stderr, "%s : number of tokens in queries %zu, %zu > n_ctxl\n", __func__, query_1st.size(), query_2nd.size());
+        while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
+            const int s0 = 2*(i1 - i0);
+            if (s0 + 2 > max_seq) {
+                break;
+            }
+
+            for (size_t i = 0; i < data[i1].common_prefix; ++i) {
+                llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1}, false);
+            }
+            batch.logits[batch.n_tokens - 1] = true;
+
+            for (int s = 0; s < 2; ++s) {
+                for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
+                    llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
+                }
+            }
+
+            data[i1].i_batch = i_batch;
+            i_batch += data[i1].required_tokens;
+
+            n_cur += data[i1].required_tokens;
+            if (++i1 == data.size()) {
+                break;
+            }
+        }
+
+        if (i0 == i1) {
+            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
             return;
         }
 
-        auto query_1st_size = query_1st.size();
-        auto query_2nd_size = query_2nd.size();
-
-        // Speedup small evaluations by evaluating atleast 32 tokens
-        // For Winogrande this seems to slow it down rather than speed it up.
-        //if (query_1st.size() < 32) query_1st.resize(32);
-        //if (query_2nd.size() < 32) query_2nd.resize(32);
-
         llama_kv_cache_clear(ctx);
-        auto logits_1st = evaluate_tokens(ctx, query_1st, 0, params.n_batch, n_vocab);
 
-        llama_kv_cache_clear(ctx);
-        auto logits_2nd = evaluate_tokens(ctx, query_2nd, 0, params.n_batch, n_vocab);
-
-        if (logits_1st.empty() || logits_2nd.empty()) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
+        // decode all tasks [i0, i1)
+        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
+            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
             return;
         }
 
-        bool skip_choice = query_1st_size - base_ctx_1st.size() > k_min_trailing_ctx &&
-                           query_2nd_size - base_ctx_2nd.size() > k_min_trailing_ctx;
+        for (size_t i = i0; i < i1; ++i) {
+            auto & task = data[i];
 
-        float score_1st = 0;
-        bool is_nan_1st = false;
-        const auto& base_1 = skip_choice ? base_ctx_1st : base_context;
-        const int last_1st = query_1st_size - base_1.size() > 1 ? 1 : 0;
-        for (size_t j = base_1.size()-1; j < query_1st_size-1-last_1st; ++j) {
-            std::memcpy(tok_logits.data(), logits_1st.data() + j*n_vocab, n_vocab*sizeof(float));
-            const float prob = softmax(tok_logits)[query_1st[j+1]];
-            if (std::isnan(prob) || !prob) {
-                fprintf(stderr, "%s: %g probability for token %zu when evaluating <%s>. Base context has %zu tokens\n", __func__,
-                        prob, j, sentence_1st.c_str(), base_context.size());
-                is_nan_1st = true;
-                break;
+            const bool skip_choice =
+                task.seq_tokens[0].size() - task.common_prefix > k_min_trailing_ctx &&
+                task.seq_tokens[1].size() - task.common_prefix > k_min_trailing_ctx;
+
+            float score_1st = 0;
+            bool is_nan_1st = false;
+            const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
+            const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
+            size_t li = n_base1 - 1;
+            for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
+                std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(task.i_batch + li++), n_vocab*sizeof(float));
+                const float prob = softmax(tok_logits)[task.seq_tokens[0][j+1]];
+                if (std::isnan(prob) || !prob) {
+                    fprintf(stderr, "%s: %g probability for token %zu when evaluating <%s>. Base context has %zu tokens\n", __func__,
+                            prob, j, (task.first + task.choices[0] + task.second).c_str(), n_base1);
+                    is_nan_1st = true;
+                    break;
+                }
+                score_1st += std::log(prob);
             }
-            score_1st += std::log(prob);
-        }
-        score_1st /= (query_1st_size - base_1.size() - last_1st);
+            score_1st /= (task.seq_tokens[0].size() - n_base1 - last_1st);
 
-        float score_2nd = 0;
-        bool is_nan_2nd = false;
-        const auto& base_2 = skip_choice ? base_ctx_2nd : base_context;
-        const int last_2nd = query_2nd_size - base_2.size() > 1 ? 1 : 0;
-        for (size_t j = base_2.size()-1; j < query_2nd_size-1-last_2nd; ++j) {
-            std::memcpy(tok_logits.data(), logits_2nd.data() + j*n_vocab, n_vocab*sizeof(float));
-            const float prob = softmax(tok_logits)[query_2nd[j+1]];
-            if (std::isnan(prob) || !prob) {
-                fprintf(stderr, "%s: %g probability for token %zu when evaluating <%s>. Base context has %zu tokens\n", __func__,
-                        prob, j, sentence_2nd.c_str(), base_context.size());
-                is_nan_2nd = true;
-                break;
+            float score_2nd = 0;
+            bool is_nan_2nd = false;
+            const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
+            const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
+            li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
+            for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
+                std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(task.i_batch + li++), n_vocab*sizeof(float));
+                const float prob = softmax(tok_logits)[task.seq_tokens[1][j+1]];
+                if (std::isnan(prob) || !prob) {
+                    fprintf(stderr, "%s: %g probability for token %zu when evaluating <%s>. Base context has %zu tokens\n", __func__,
+                            prob, j, (task.first + task.choices[1] + task.second).c_str(), n_base2);
+                    is_nan_2nd = true;
+                    break;
+                }
+                score_2nd += std::log(prob);
             }
-            score_2nd += std::log(prob);
-        }
-        score_2nd /= (query_2nd_size - base_2.size() - last_2nd);
+            score_2nd /= (task.seq_tokens[1].size() - n_base2 - last_2nd);
 
-        if (is_nan_1st || is_nan_2nd) {
-            continue;
+            if (is_nan_1st || is_nan_2nd) {
+                continue;
+            }
+
+            if (std::isnan(score_1st) || std::isnan(score_2nd)) {
+                printf("================== NaN score %g, %g) for:\n", score_1st, score_2nd);
+                printf("Q1: <%s> - %zu tokens\n", (task.first + task.choices[0] + task.second).c_str(), task.seq_tokens[0].size());
+                printf("Q2: <%s> - %zu tokens\n", (task.first + task.choices[1] + task.second).c_str(), task.seq_tokens[1].size());
+                printf("B : <%s> - %zu tokens\n", task.first.c_str(), task.common_prefix);
+                printf("base_1 has %zu tokens, base_2 has %zu tokens, skip_choice = %d\n", n_base1, n_base2, skip_choice);
+                continue;
+            }
+
+            int result = score_1st > score_2nd ? 1 : 2;
+
+            if (result == task.answer) {
+                ++n_correct;
+            }
+            ++n_done;
+
+            // Print the accumulated accuracy mean x 100
+            printf("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
+            fflush(stdout);
         }
 
-        if (std::isnan(score_1st) || std::isnan(score_2nd)) {
-            printf("================== NaN score %g, %g) for:\n", score_1st, score_2nd);
-            printf("Q1: <%s> - %zu tokens\n", sentence_1st.c_str(), query_1st_size);
-            printf("Q2: <%s> - %zu tokens\n", sentence_2nd.c_str(), query_2nd_size);
-            printf("B : <%s> - %zu tokens\n", task.first.c_str(), base_context.size());
-            printf("base_1 has %zu tokens, base_2 has %zu tokens, skip_choice = %d\n", base_1.size(), base_2.size(), skip_choice);
-            continue;
-        }
-
-        int result = score_1st > score_2nd ? 1 : 2;
-
-        if (result == task.answer) {
-            ++n_correct;
-        }
-        ++n_done;
-
-        // Print the accumulated accuracy mean x 100
-        printf("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n",task_idx+1, 100.0 * n_correct/n_done,score_1st,score_2nd,result,task.answer);
-        fflush(stdout);
+        i0 = i1 - 1;
     }
 
     printf("\n");

From 993fba81807e55d27b570945af8e416d535eced1 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Fri, 19 Jan 2024 11:02:39 +0200
Subject: [PATCH 076/138] perplexity: avoid unnecessary alloocations and logit
 copies (#5035)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 examples/perplexity/perplexity.cpp | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index df902fb1c..292502f87 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -325,6 +325,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     double nll = 0.0;
     double nll2 = 0.0;
 
+    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
+
+    std::vector<float> logits;
+    if (num_batches > 1) {
+        logits.reserve((size_t)n_ctx * n_vocab);
+    }
+
     fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@@ -333,10 +340,6 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         const int start =     i * n_ctx;
         const int end   = start + n_ctx;
 
-        const int num_batches = (n_ctx + n_batch - 1) / n_batch;
-
-        std::vector<float> logits;
-
         const auto t_start = std::chrono::high_resolution_clock::now();
 
         // clear the KV cache
@@ -362,8 +365,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
             // restore the original token in case it was set to BOS
             tokens[batch_start] = token_org;
 
-            const auto * batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            if (num_batches > 1) {
+                const auto * batch_logits = llama_get_logits(ctx);
+                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            }
         }
 
         const auto t_end = std::chrono::high_resolution_clock::now();
@@ -392,7 +397,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         // last 256 tokens.  Then, we split the input up into context window size chunks to
         // process the entire prompt.
         const int first = n_ctx/2;
-        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                        workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
         count += n_ctx - first - 1;
 
@@ -406,6 +412,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
             printf("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
         }
         fflush(stdout);
+
+        logits.clear();
     }
     printf("\n");
 

From 2b3b999cacc7ad1207c32fbdf3479a19c06e1a34 Mon Sep 17 00:00:00 2001
From: chiranko <96988916+chiranko@users.noreply.github.com>
Date: Fri, 19 Jan 2024 17:07:27 +0800
Subject: [PATCH 077/138] llama : add CodeShell support (#5016)

* llama: add codeshell support

* llama.cpp: fix codeshell with NeoX rope

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 convert-hf-to-gguf.py          |  67 ++++++++++++
 gguf-py/gguf/constants.py      |  19 ++++
 gguf-py/gguf/tensor_mapping.py |   1 +
 llama.cpp                      | 181 +++++++++++++++++++++++++++++++++
 4 files changed, 268 insertions(+)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 1178d63a2..aae3a5e87 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -197,6 +197,8 @@ class Model:
             return Phi2Model
         if model_architecture == "PlamoForCausalLM":
             return PlamoModel
+        if model_architecture == "CodeShellForCausalLM":
+            return CodeShellModel
         return Model
 
     def _is_model_safetensors(self) -> bool:
@@ -242,6 +244,8 @@ class Model:
             return gguf.MODEL_ARCH.PHI2
         if arch == "PlamoForCausalLM":
             return gguf.MODEL_ARCH.PLAMO
+        if arch == "CodeShellForCausalLM":
+            return gguf.MODEL_ARCH.CODESHELL
 
         raise NotImplementedError(f'Architecture "{arch}" not supported!')
 
@@ -1175,6 +1179,69 @@ class PlamoModel(Model):
 
             self.gguf_writer.add_tensor(new_name, data)
 
+class CodeShellModel(Model):
+    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layer"]
+
+        self.gguf_writer.add_name("CodeShell")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_freq_base(10000.0)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+        self.gguf_writer.add_rope_scaling_factor(1.0)
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        tensors = dict(self.get_tensors())
+        has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
+        for name, data_torch in tensors.items():
+            # we don't need these
+            if name.endswith((".attn.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+            if not has_lm_head and name == "transformer.wte.weight":
+                self.gguf_writer.add_tensor("output.weight", data)
+                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
 
 ###### CONVERSION LOGIC ######
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 972b4e9a7..95c58b419 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -99,6 +99,7 @@ class MODEL_ARCH(IntEnum):
     QWEN      = auto()
     PHI2      = auto()
     PLAMO     = auto()
+    CODESHELL = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -147,6 +148,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.QWEN:           "qwen",
     MODEL_ARCH.PHI2:           "phi2",
     MODEL_ARCH.PLAMO:          "plamo",
+    MODEL_ARCH.CODESHELL:      "codeshell",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -396,6 +398,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_NORM,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.CODESHELL: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
     ]
     # TODO
 }
@@ -417,6 +432,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.CODESHELL: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }
 
 #
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index e5b146106..de177af13 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -154,6 +154,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.rotary_emb.inv_freq",        # llama-hf
             "layers.{bid}.attention.inner_attention.rope.freqs",       # llama-pth
             "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
+            "transformer.h.{bid}.attn.rotary_emb.inv_freq",            # codeshell
         ),
 
         # Feed-forward norm
diff --git a/llama.cpp b/llama.cpp
index 47b4384a8..1cee5a791 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -194,6 +194,7 @@ enum llm_arch {
     LLM_ARCH_QWEN,
     LLM_ARCH_PHI2,
     LLM_ARCH_PLAMO,
+    LLM_ARCH_CODESHELL,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -213,6 +214,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN,            "qwen"      },
     { LLM_ARCH_PHI2,            "phi2"      },
     { LLM_ARCH_PLAMO,           "plamo"     },
+    { LLM_ARCH_CODESHELL,       "codeshell" },
 };
 
 enum llm_kv {
@@ -600,6 +602,26 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_CODESHELL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
 
     {
         LLM_ARCH_UNKNOWN,
@@ -2877,6 +2899,14 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_CODESHELL:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 42: model.type = e_model::MODEL_SMALL; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
 
         default: (void)0;
     }
@@ -3784,6 +3814,42 @@ static bool llm_load_tensors(
                         layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff});
                     }
                 } break;
+            case LLM_ARCH_CODESHELL:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+
+                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+
+                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
+
+                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+
+                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+
+                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff});
+                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -5965,6 +6031,117 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_codeshell() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(tmpq, "tmpq", il);
+                cb(tmpk, "tmpk", il);
+                cb(Vcur, "Vcur", il);
+
+                struct ggml_tensor * Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head,    n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            // add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            inpL = ggml_add(ctx0, cur, ffn_inp);
+            cb(inpL, "l_out", il);
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph(
@@ -6159,6 +6336,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gpt2();
             } break;
+        case LLM_ARCH_CODESHELL:
+            {
+                result = llm.build_codeshell();
+            } break;
         default:
             GGML_ASSERT(false);
     }

From 7051aacfac0057fa5fac9ea46c55bffc3892d810 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Fri, 19 Jan 2024 11:39:11 +0200
Subject: [PATCH 078/138] winogrande: evaluate log-probs in parallel (#5036)

This is a relatively minor performance tweak resulting in
~10% speedup on my system.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 examples/perplexity/perplexity.cpp | 71 ++++++++++++++----------------
 1 file changed, 33 insertions(+), 38 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 292502f87..b07320190 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -458,7 +458,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
     return true;
 }
 
-static void hellaswag_compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
+static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
         const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
     constexpr int k_token_chunk = 4;
     if (eval_results.size() != eval_pairs.size()) {
@@ -700,7 +700,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             }
         }
         // Then we do the actual calculation
-        hellaswag_compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
+        compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
 
         size_t ir = 0;
 
@@ -906,6 +906,10 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     std::vector<float> tok_logits(n_vocab);
     std::vector<float> batch_logits(n_vocab*n_ctx);
 
+    std::vector<std::pair<size_t, llama_token>> eval_pairs;
+    std::vector<float> eval_results;
+    std::vector<std::thread> workers(std::thread::hardware_concurrency());
+
     int n_correct = 0;
     int n_done    = 0;
 
@@ -956,6 +960,30 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             return;
         }
 
+        eval_pairs.clear();
+        for (size_t i = i0; i < i1; ++i) {
+            auto & task = data[i];
+
+            const bool skip_choice =
+                task.seq_tokens[0].size() - task.common_prefix > k_min_trailing_ctx &&
+                task.seq_tokens[1].size() - task.common_prefix > k_min_trailing_ctx;
+
+            const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
+            const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
+            size_t li = n_base1 - 1;
+            for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
+                eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[0][j+1]));
+            }
+            const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
+            const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
+            li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
+            for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
+                eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[1][j+1]));
+            }
+        }
+        compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
+
+        size_t ir = 0;
         for (size_t i = i0; i < i1; ++i) {
             auto & task = data[i];
 
@@ -964,54 +992,21 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
                 task.seq_tokens[1].size() - task.common_prefix > k_min_trailing_ctx;
 
             float score_1st = 0;
-            bool is_nan_1st = false;
             const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
             const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
-            size_t li = n_base1 - 1;
             for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
-                std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(task.i_batch + li++), n_vocab*sizeof(float));
-                const float prob = softmax(tok_logits)[task.seq_tokens[0][j+1]];
-                if (std::isnan(prob) || !prob) {
-                    fprintf(stderr, "%s: %g probability for token %zu when evaluating <%s>. Base context has %zu tokens\n", __func__,
-                            prob, j, (task.first + task.choices[0] + task.second).c_str(), n_base1);
-                    is_nan_1st = true;
-                    break;
-                }
-                score_1st += std::log(prob);
+                score_1st += eval_results[ir++];
             }
             score_1st /= (task.seq_tokens[0].size() - n_base1 - last_1st);
 
             float score_2nd = 0;
-            bool is_nan_2nd = false;
             const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
             const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
-            li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
             for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
-                std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(task.i_batch + li++), n_vocab*sizeof(float));
-                const float prob = softmax(tok_logits)[task.seq_tokens[1][j+1]];
-                if (std::isnan(prob) || !prob) {
-                    fprintf(stderr, "%s: %g probability for token %zu when evaluating <%s>. Base context has %zu tokens\n", __func__,
-                            prob, j, (task.first + task.choices[1] + task.second).c_str(), n_base2);
-                    is_nan_2nd = true;
-                    break;
-                }
-                score_2nd += std::log(prob);
+                score_2nd += eval_results[ir++];
             }
             score_2nd /= (task.seq_tokens[1].size() - n_base2 - last_2nd);
 
-            if (is_nan_1st || is_nan_2nd) {
-                continue;
-            }
-
-            if (std::isnan(score_1st) || std::isnan(score_2nd)) {
-                printf("================== NaN score %g, %g) for:\n", score_1st, score_2nd);
-                printf("Q1: <%s> - %zu tokens\n", (task.first + task.choices[0] + task.second).c_str(), task.seq_tokens[0].size());
-                printf("Q2: <%s> - %zu tokens\n", (task.first + task.choices[1] + task.second).c_str(), task.seq_tokens[1].size());
-                printf("B : <%s> - %zu tokens\n", task.first.c_str(), task.common_prefix);
-                printf("base_1 has %zu tokens, base_2 has %zu tokens, skip_choice = %d\n", n_base1, n_base2, skip_choice);
-                continue;
-            }
-
             int result = score_1st > score_2nd ? 1 : 2;
 
             if (result == task.answer) {
@@ -1019,7 +1014,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             }
             ++n_done;
 
-            // Print the accumulated accuracy mean x 100
+            // print the accumulated accuracy mean x 100
             printf("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
             fflush(stdout);
         }

From de9a147df14e62f54f879d2d15e6c4793107f4fc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 19 Jan 2024 13:52:22 +0200
Subject: [PATCH 079/138] py : fix flake8 lint

---
 convert-hf-to-gguf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index aae3a5e87..d2d6948d8 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1179,6 +1179,7 @@ class PlamoModel(Model):
 
             self.gguf_writer.add_tensor(new_name, data)
 
+
 class CodeShellModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layer"]

From 9b75cb2b3ccbed3df2e14c1202168db3e5145095 Mon Sep 17 00:00:00 2001
From: Shijie <821898965@qq.com>
Date: Fri, 19 Jan 2024 19:53:13 +0800
Subject: [PATCH 080/138] llama : support upcoming Qwen2 (#5037)

---
 convert-hf-to-gguf.py     |   4 +
 gguf-py/gguf/constants.py |  16 ++++
 llama.cpp                 | 191 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 211 insertions(+)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index d2d6948d8..5cb3e63fb 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -189,6 +189,8 @@ class Model:
             return StableLMModel
         if model_architecture == "QWenLMHeadModel":
             return QwenModel
+        if model_architecture == "Qwen2ForCausalLM":
+            return Model
         if model_architecture == "MixtralForCausalLM":
             return MixtralModel
         if model_architecture == "GPT2LMHeadModel":
@@ -236,6 +238,8 @@ class Model:
             return gguf.MODEL_ARCH.STABLELM
         if arch == "QWenLMHeadModel":
             return gguf.MODEL_ARCH.QWEN
+        if arch == "Qwen2ForCausalLM":
+            return gguf.MODEL_ARCH.QWEN2
         if arch == "MixtralForCausalLM":
             return gguf.MODEL_ARCH.LLAMA
         if arch == "GPT2LMHeadModel":
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 95c58b419..2d9c33c7d 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -97,6 +97,7 @@ class MODEL_ARCH(IntEnum):
     BLOOM     = auto()
     STABLELM  = auto()
     QWEN      = auto()
+    QWEN2     = auto()
     PHI2      = auto()
     PLAMO     = auto()
     CODESHELL = auto()
@@ -146,6 +147,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.BLOOM:          "bloom",
     MODEL_ARCH.STABLELM:       "stablelm",
     MODEL_ARCH.QWEN:           "qwen",
+    MODEL_ARCH.QWEN2:          "qwen2",
     MODEL_ARCH.PHI2:           "phi2",
     MODEL_ARCH.PLAMO:          "plamo",
     MODEL_ARCH.CODESHELL:      "codeshell",
@@ -358,6 +360,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.QWEN2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.PLAMO: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
diff --git a/llama.cpp b/llama.cpp
index 1cee5a791..90579ac85 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -192,6 +192,7 @@ enum llm_arch {
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
+    LLM_ARCH_QWEN2,
     LLM_ARCH_PHI2,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
@@ -212,6 +213,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_BLOOM,           "bloom"     },
     { LLM_ARCH_STABLELM,        "stablelm"  },
     { LLM_ARCH_QWEN,            "qwen"      },
+    { LLM_ARCH_QWEN2,           "qwen2"     },
     { LLM_ARCH_PHI2,            "phi2"      },
     { LLM_ARCH_PLAMO,           "plamo"     },
     { LLM_ARCH_CODESHELL,       "codeshell" },
@@ -568,6 +570,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_QWEN2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_PHI2,
         {
@@ -2869,6 +2888,17 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    case 80: model.type = e_model::MODEL_70B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_PHI2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3704,6 +3734,41 @@ static bool llm_load_tensors(
                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff/2});
                     }
                 } break;
+            case LLM_ARCH_QWEN2:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        // optional bias tensors
+                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd});
+                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa});
+                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                    }
+                } break;
             case LLM_ARCH_PHI2:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5698,6 +5763,128 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_qwen2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                // these nodes are added to the graph together so that they are not reordered
+                // by doing so, the number of splits in the graph is reduced
+                ggml_build_forward_expand(gf, Qcur);
+                ggml_build_forward_expand(gf, Kcur);
+                ggml_build_forward_expand(gf, Vcur);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_phi2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
@@ -6324,6 +6511,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_qwen();
             } break;
+        case LLM_ARCH_QWEN2:
+            {
+                result = llm.build_qwen2();
+            } break;
         case LLM_ARCH_PHI2:
             {
                 result = llm.build_phi2();

From a5cacb22b2114fd9adf61c00cbb237384d86bced Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 19 Jan 2024 15:24:47 +0200
Subject: [PATCH 081/138] imatrix : add README.md

---
 examples/imatrix/README.md | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 examples/imatrix/README.md

diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md
new file mode 100644
index 000000000..578e8fc27
--- /dev/null
+++ b/examples/imatrix/README.md
@@ -0,0 +1,32 @@
+# llama.cpp/examples/imatrix
+
+Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
+More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
+
+## Usage
+
+```
+./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
+        [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
+```
+
+Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
+The parameters in square brackets are optional and have the following meaning:
+* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
+* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
+* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+
+For faster computation, make sure to use GPU offloading via the `-ngl` argument
+
+## Example
+
+```bash
+LLAMA_CUBLAS=1 make -j
+
+# generate importance matrix (imatrix.dat)
+./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
+
+# use the imatrix to perform a Q4_K_M quantization
+./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
+```

From 381ee195721d8e747ee31a60c0751822b3072f02 Mon Sep 17 00:00:00 2001
From: Uzo Nweke <uzoechi@gmail.com>
Date: Fri, 19 Jan 2024 13:20:50 -0500
Subject: [PATCH 082/138] finetune : fix ggml_allocr lifetimes (tmp workaround)
 (#5033)

* Fix issue with alloc causing max_compute_size to be calculated

* remove ggml_allocr_free as suggested in issue #4791
---
 examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 4a9a2340b..eee9d4de3 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -263,7 +263,6 @@ static void init_model(struct my_llama_model * model) {
     model->data.resize(size + tensor_alignment);
     alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
     alloc_model(alloc, model);
-    ggml_allocr_free(alloc);
 }
 
 static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
@@ -1102,7 +1101,6 @@ int main(int argc, char ** argv) {
     alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
     ggml_allocr_alloc(alloc, tokens_input);
     ggml_allocr_alloc(alloc, target_probs);
-    ggml_allocr_free(alloc);
 
     // context for compute tensors without their data
     const size_t estimated_compute_size_wo_data = (
@@ -1149,7 +1147,6 @@ int main(int argc, char ** argv) {
             best_compute_size = max_compute_size;
             best_order = gf->order;
         }
-        ggml_allocr_free(alloc);
         ggml_free(ctx_compute);
     }
     size_t max_compute_size = best_compute_size;
@@ -1177,7 +1174,6 @@ int main(int argc, char ** argv) {
         params.common.use_flash,
         params.common.use_checkpointing
     );
-    ggml_allocr_free(alloc);
 
     std::vector<llama_token> train_tokens;
     std::vector<size_t> train_samples_begin;

From cca894f16a5eade15afd07b015e4cb3d8658943f Mon Sep 17 00:00:00 2001
From: Kylin <56434533+KyL0N@users.noreply.github.com>
Date: Sat, 20 Jan 2024 15:01:46 +0800
Subject: [PATCH 083/138] cuda : fix compile error in jetson platform (#4975)

* cuda: fix compile error in jetson platform

* cuda: update comment in ggml-cuda.cu

* cuda: update ggml-cuda.cu comment
---
 ggml-cuda.cu | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index b2211d858..ec3837fb8 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -12,9 +12,6 @@
 #include <vector>
 #include <map>
 #include <array>
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
 
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
@@ -118,6 +115,11 @@
 
 #endif // defined(GGML_USE_HIPBLAS)
 
+// ggml-cuda need half type so keep ggml headers include at last
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
 #define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 
 #define CC_PASCAL     600

From 48e2b1337257bb77573bf76cfffcff3a5efa8704 Mon Sep 17 00:00:00 2001
From: adel boussaken <netdur@gmail.com>
Date: Sat, 20 Jan 2024 09:05:43 +0100
Subject: [PATCH 084/138] Add a dart/flutter binding to README.md (#4882)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 866aa87b4..cbfba01bc 100644
--- a/README.md
+++ b/README.md
@@ -128,6 +128,7 @@ as the main playground for developing new features for the [ggml](https://github
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
+- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 
 **UI:**
 

From 77bc1bbd05f0c31cb45773eb5eb59b9ff2b07e1b Mon Sep 17 00:00:00 2001
From: Herman Semenov <GermanAizek@yandex.ru>
Date: Sat, 20 Jan 2024 08:11:31 +0000
Subject: [PATCH 085/138] cmake : add support for ccache (#5002)

* Added support ccache for speedup recompilation

* cmake : option to disable ccache

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 CMakeLists.txt | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3fc65eaf2..6b3b1396b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,7 @@ option(BUILD_SHARED_LIBS                "build shared libraries"
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
 option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
+option(LLAMA_CCACHE                     "llama: use ccache if available"                        ON)
 
 # debug
 option(LLAMA_ALL_WARNINGS               "llama: enable all compiler warnings"                   ON)
@@ -561,6 +562,17 @@ if (LLAMA_LTO)
     endif()
 endif()
 
+if (LLAMA_CCACHE)
+    find_program(LLAMA_CCACHE_FOUND ccache)
+    if (LLAMA_CCACHE_FOUND)
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+        set(ENV{CCACHE_SLOPPINESS} time_macros)
+        message(STATUS "Using ccache")
+    else()
+        message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
+    endif ()
+endif()
+
 # this version of Apple ld64 is buggy
 execute_process(
     COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v

From 6df465a91d402370bcba6676b19fad85b06ce7e0 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Sat, 20 Jan 2024 16:05:49 +0100
Subject: [PATCH 086/138] llama : run all KQV ops on the CPU with no KV offload
 (#5049)

ggml-ci
---
 ggml-backend.c |  34 +++++++-----
 llama.cpp      | 145 +++++++++++++++++++++++++++----------------------
 2 files changed, 99 insertions(+), 80 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index ef518dae0..423512def 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1191,6 +1191,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                 ggml_tallocr_t src_allocr = node_allocr(src);
                 GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
                 if (src_allocr != node_allocr) {
+                    // create a copy of the input in the split's backend
+                    size_t id = hash_id(src);
+                    if (sched->node_copies[id][cur_backend_id] == NULL) {
+                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+                        sched->node_copies[id][cur_backend_id] = tensor_copy;
+                        node_allocr(tensor_copy) = cur_allocr;
+                        SET_CAUSE(tensor_copy, "4.cpy");
+
+                        int n_inputs = sched->splits[cur_split].n_inputs++;
+                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                        sched->splits[cur_split].inputs[n_inputs] = src;
+                    }
+                    node->src[j] = sched->node_copies[id][cur_backend_id];
+
+#if 0
                     // check if the input is already in the split
                     bool found = false;
                     for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@@ -1206,19 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                         GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                         sched->splits[cur_split].inputs[n_inputs] = src;
                     }
-
-                    // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
-                    if (sched->node_copies[id][cur_backend_id] == NULL) {
-                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
-                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                        sched->node_copies[id][cur_backend_id] = tensor_copy;
-                        node_allocr(tensor_copy) = cur_allocr;
-                        SET_CAUSE(tensor_copy, "4.cpy");
-                    }
-                    node->src[j] = sched->node_copies[id][cur_backend_id];
+#endif
                 }
             }
         }
@@ -1333,7 +1339,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
             ggml_backend_graph_compute(split_backend, &split->graph);
-          //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
             // similar to ggml_backend_compare_graph_backend
             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
diff --git a/llama.cpp b/llama.cpp
index 90579ac85..909ad4ad8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4315,6 +4315,7 @@ static struct ggml_tensor * llm_build_kqv(
           const llama_model & model,
         const llama_hparams & hparams,
        const llama_kv_cache & kv,
+         struct ggml_cgraph * graph,
          struct ggml_tensor * wo,
          struct ggml_tensor * wo_b,
          struct ggml_tensor * q_cur,
@@ -4393,6 +4394,8 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
     cb(cur, "kqv_merged_cont", il);
 
+    ggml_build_forward_expand(graph, cur);
+
     cur = ggml_mul_mat(ctx, wo, cur);
     if (wo_b) {
         cb(cur, "kqv_wo", il);
@@ -4405,6 +4408,44 @@ static struct ggml_tensor * llm_build_kqv(
     return cur;
 }
 
+static struct ggml_tensor * llm_build_kv(
+        struct ggml_context * ctx,
+          const llama_model & model,
+        const llama_hparams & hparams,
+       const llama_kv_cache & kv,
+         struct ggml_cgraph * graph,
+         struct ggml_tensor * wo,
+         struct ggml_tensor * wo_b,
+         struct ggml_tensor * k_cur,
+         struct ggml_tensor * v_cur,
+         struct ggml_tensor * q_cur,
+         struct ggml_tensor * kq_mask,
+                    int64_t   n_ctx,
+                    int32_t   n_tokens,
+                    int32_t   kv_head,
+                    int32_t   n_kv,
+                    float     max_alibi_bias,
+                    float     kq_scale,
+         const llm_build_cb & cb,
+                    int       il) {
+
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    ggml_build_forward_expand(graph, k_cur);
+    ggml_build_forward_expand(graph, v_cur);
+    ggml_build_forward_expand(graph, q_cur);
+
+    llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
+
+    struct ggml_tensor * cur;
+    cur  = llm_build_kqv(ctx, model, hparams, kv, graph,
+            wo, wo_b,
+            q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
+    cb(cur, "kqv_out", il);
+
+    return cur;
+}
+
 struct llm_build_context {
     const llama_model    & model;
     const llama_hparams  & hparams;
@@ -4562,12 +4603,6 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
-                // these nodes are added to the graph together so that they are not reordered
-                // by doing so, the number of splits in the graph is reduced
-                ggml_build_forward_expand(gf, Qcur);
-                ggml_build_forward_expand(gf, Kcur);
-                ggml_build_forward_expand(gf, Vcur);
-
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
                     hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
@@ -4582,11 +4617,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -4763,14 +4796,13 @@ struct llm_build_context {
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
                 // apply ALiBi for 13B model
                 const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
 
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -4892,11 +4924,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -4993,11 +5023,9 @@ struct llm_build_context {
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5200,12 +5228,9 @@ struct llm_build_context {
                         );
                 cb(Vcur, "Vcur", il);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                // TODO: not tested, could be broken
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5292,11 +5317,9 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                 cb(Qcur, "Qcur", il);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5390,11 +5413,9 @@ struct llm_build_context {
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5485,11 +5506,9 @@ struct llm_build_context {
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5597,11 +5616,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5714,11 +5731,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5837,11 +5852,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5966,11 +5979,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -6071,11 +6082,9 @@ struct llm_build_context {
                         ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
             struct ggml_tensor * sa_out = cur;
@@ -6172,11 +6181,9 @@ struct llm_build_context {
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -6283,11 +6290,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -6355,6 +6360,14 @@ static struct ggml_cgraph * llama_build_graph(
             ggml_set_name(cur, name);
         }
 
+
+        if (!lctx.cparams.offload_kqv) {
+            if (strcmp(name, "kqv_merged_cont") == 0) {
+                // all nodes between the KV store and the attention output are run on the CPU
+                ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
+            }
+        }
+
         //
         // allocate input tensors and set input data
         //

From 97c1549808d2742d37584a3c9df28154bdf34417 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Sat, 20 Jan 2024 10:08:08 -0500
Subject: [PATCH 087/138] perplexity : fix MSVC build after #5020 (#5043)

* perplexity : fix MSVC build after #5020

* try a differerent fix
---
 examples/perplexity/perplexity.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b07320190..f91f5795a 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -458,23 +458,24 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
     return true;
 }
 
+#define K_TOKEN_CHUNK 4
+
 static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
         const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
-    constexpr int k_token_chunk = 4;
     if (eval_results.size() != eval_pairs.size()) {
         eval_results.resize(eval_pairs.size());
     }
     if (eval_pairs.empty()) return;
 
-    size_t max_threads = std::min((eval_pairs.size() + k_token_chunk - 1)/k_token_chunk, workers.size());
+    size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
 
     std::atomic<int> counter(0);
     auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
-        float local_logprobs[k_token_chunk];
+        float local_logprobs[K_TOKEN_CHUNK];
         while (true) {
-            size_t first = counter.fetch_add(k_token_chunk, std::memory_order_relaxed);
+            size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
             if (first >= eval_results.size()) break;
-            size_t last = std::min(first + k_token_chunk, eval_results.size());
+            size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
             for (size_t i = first; i < last; ++i) {
                 auto logits = batch_logits + eval_pairs[i].first * n_vocab;
                 float max_logit = logits[0];
@@ -497,7 +498,6 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
     for (size_t it = 0; it < max_threads; ++it) {
         workers[it].join();
     }
-
 }
 
 static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

From b43ebde3b0ccbc42d9dd782b32e2fd8eb35b43b5 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Sat, 20 Jan 2024 18:14:18 -0500
Subject: [PATCH 088/138] convert : partially revert PR #4818 (#5041)

---
 convert-hf-to-gguf.py         |   9 +-
 convert-llama-ggml-to-gguf.py |  14 +-
 convert-lora-to-ggml.py       |   3 +-
 convert-persimmon-to-gguf.py  |  12 +-
 convert.py                    | 627 ++++++++++++----------------------
 mypy.ini                      |   1 +
 6 files changed, 237 insertions(+), 429 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 5cb3e63fb..4d995ef78 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -10,7 +10,7 @@ import re
 import sys
 from enum import IntEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
 
 import numpy as np
 import torch
@@ -487,7 +487,8 @@ class MPTModel(Model):
             # map tensor names
             if "scales" in name:
                 new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
-                new_name = new_name.replace("scales", "act.scales")
+                if new_name is not None:
+                    new_name = new_name.replace("scales", "act.scales")
             else:
                 new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
             if new_name is None:
@@ -904,7 +905,7 @@ class QwenModel(Model):
         return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
 
     @staticmethod
-    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
         parts = [bytes([b]) for b in token]
         while True:
             min_idx = None
@@ -1285,7 +1286,7 @@ def main() -> None:
 
     if args.awq_path:
         sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights
+        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
         tmp_model_path = args.model / "weighted_model"
         dir_model = tmp_model_path
         if tmp_model_path.is_dir():
diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py
index e359330af..b33108062 100755
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import argparse
+import os
 import struct
 import sys
 from enum import IntEnum
@@ -9,7 +10,6 @@ from pathlib import Path
 
 import numpy as np
 
-import os
 if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@@ -371,15 +371,11 @@ def handle_metadata(cfg, hp):
         params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
     else:
         raise ValueError('Unable to load metadata')
-    vocab = convert.load_vocab(
-        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
-        cfg.vocabtype)
-    # FIXME: Respect cfg.vocab_dir?
-    svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
-                               load_merges = cfg.vocabtype == 'bpe',
-                               n_vocab = vocab.vocab_size)
+    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
+    vocab_factory = convert.VocabFactory(vocab_path)
+    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
     convert.check_vocab_size(params, vocab)
-    return (params, vocab, svocab)
+    return params, vocab, special_vocab
 
 
 def handle_args():
diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py
index 35ce152f4..4904bf128 100755
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -5,17 +5,16 @@ import json
 import os
 import struct
 import sys
+from pathlib import Path
 from typing import Any, BinaryIO, Sequence
 
 import numpy as np
 import torch
 
-from pathlib import Path
 if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 
-
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
 
 
diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py
index 1ba5864dc..d2be805d1 100755
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -1,11 +1,13 @@
 #!/usr/bin/env python3
-import torch
-import os
-from pprint import pprint
-import sys
 import argparse
+import os
+import sys
 from pathlib import Path
+from pprint import pprint
+
+import torch
 from sentencepiece import SentencePieceProcessor
+
 if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@@ -69,7 +71,7 @@ def main():
     persimmon_model = torch.load(args.ckpt_path)
     hparams = persimmon_model['args']
     pprint(hparams)
-    tensors = {}
+    tensors: dict[str, torch.Tensor] = {}
     _flatten_dict(persimmon_model['model'], tensors, None)
 
     arch = gguf.MODEL_ARCH.PERSIMMON
diff --git a/convert.py b/convert.py
index 980e6fc72..06768033d 100755
--- a/convert.py
+++ b/convert.py
@@ -17,58 +17,28 @@ import signal
 import struct
 import sys
 import time
-import warnings
 import zipfile
 from abc import ABCMeta, abstractmethod
-from argparse import ArgumentParser
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import (
-    IO,
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Iterable,
-    Literal,
-    Optional,
-    Tuple,
-    TypeVar,
-)
+from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
 
 import numpy as np
 from sentencepiece import SentencePieceProcessor
 
-try:
-    from transformers import AutoTokenizer
-except ModuleNotFoundError as e:
-    warnings.warn(f"Could not import AutoTokenizer from transformers: {e}")
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf
 
-# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory
-if "NO_LOCAL_GGUF" not in os.environ:
-    # Use absolute path to the gguf-py directory
-    gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py")
-    print(gguf_py_dir)  # NOTE: Remove this once path is verified after changes are completed
-    if gguf_py_dir not in sys.path:
-        sys.path.insert(1, gguf_py_dir)
+if TYPE_CHECKING:
+    from typing import TypeAlias
 
-# Import gguf module
-try:
-    import gguf
-except ModuleNotFoundError as e:
-    print(f"Could not import gguf: {e}")
-    sys.exit(1)
-
-if TYPE_CHECKING:  # NOTE: This isn't necessary.
-    from typing import TypeAlias  # This can technically be omitted.
-
-if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"):
+if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
     faulthandler.register(signal.SIGUSR1)
 
-# NOTE: n-dimensional arrays should be directly referenced
-NDArray: TypeAlias = "np.ndarray[Any, Any]"
+NDArray: TypeAlias = 'np.ndarray[Any, Any]'
 
-# Why is this here? LLAMA and GPT are technically the only compatible ARCHs.
 ARCH = gguf.MODEL_ARCH.LLAMA
 
 DEFAULT_CONCURRENCY = 8
@@ -78,7 +48,6 @@ DEFAULT_CONCURRENCY = 8
 #
 
 
-# TODO: Clean up and refactor data types
 @dataclass(frozen=True)
 class DataType:
     name: str
@@ -183,85 +152,65 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
 
 @dataclass
 class Params:
-    n_vocab: int
-    n_embd: int
-    n_layer: int
-    n_ctx: int
-    n_ff: int
-    n_head: int
-    n_head_kv: int
-    f_norm_eps: Optional[float] = None
-    n_experts: Optional[int] = None
-    n_experts_used: Optional[int] = None
+    n_vocab:        int
+    n_embd:         int
+    n_layer:        int
+    n_ctx:          int
+    n_ff:           int
+    n_head:         int
+    n_head_kv:      int
+    n_experts:      int | None = None
+    n_experts_used: int | None = None
+    f_norm_eps:     float | None = None
 
-    rope_scaling_type: Optional[gguf.RopeScalingType] = None
-    f_rope_freq_base: Optional[float] = None
-    f_rope_scale: Optional[float] = None
-    n_orig_ctx: Optional[int] = None
-    rope_finetuned: Optional[bool] = None
+    rope_scaling_type: gguf.RopeScalingType | None = None
+    f_rope_freq_base: float | None = None
+    f_rope_scale: float | None = None
+    n_orig_ctx: int | None = None
+    rope_finetuned: bool | None = None
 
-    ftype: Optional[GGMLFileType] = None
+    ftype: GGMLFileType | None = None
 
     # path to the directory containing the model files
-    path_model: Optional[Path] = None
+    path_model: Path | None = None
 
     @staticmethod
-    def guessed(model: LazyModel) -> "Params":
+    def guessed(model: LazyModel) -> Params:
         # try transformer naming first
-        n_vocab, n_embd = (
-            model["model.embed_tokens.weight"].shape
-            if "model.embed_tokens.weight" in model
-            else model["tok_embeddings.weight"].shape
-        )
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
 
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer = next(
-                i
-                for i in itertools.count()
-                if f"model.layers.{i}.self_attn.q_proj.weight" not in model
-            )
-        elif (
-            "model.layers.0.self_attn.W_pack.weight" in model
-        ):  # next: try baichuan naming
-            n_layer = next(
-                i
-                for i in itertools.count()
-                if f"model.layers.{i}.self_attn.W_pack.weight" not in model
-            )
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
         else:
-            n_layer = next(
-                i
-                for i in itertools.count()
-                if f"layers.{i}.attention.wq.weight" not in model
-            )
+            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
 
         if n_layer < 1:
-            raise Exception(
-                "failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
-            )
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
 
-        n_head = n_embd // 128  # guessed
-        n_mult = 256  # guessed
+        n_head = n_embd // 128 # guessed
+        n_mult = 256           # guessed
 
         # TODO: verify this
         n_ff = int(2 * (4 * n_embd) / 3)
         n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
 
         return Params(
-            n_vocab=n_vocab,
-            n_embd=n_embd,
-            n_layer=n_layer,
-            n_ctx=-1,
-            n_ff=n_ff,
-            n_head=n_head,
-            n_head_kv=n_head,
-            f_norm_eps=1e-5,
+            n_vocab    = n_vocab,
+            n_embd     = n_embd,
+            n_layer    = n_layer,
+            n_ctx      = -1,
+            n_ff       = n_ff,
+            n_head     = n_head,
+            n_head_kv  = n_head,
+            f_norm_eps = 1e-5,
         )
 
     @staticmethod
-    def load_transformers_config(model: LazyModel, config_path: Path) -> "Params":
+    def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))
 
         rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
@@ -274,22 +223,20 @@ class Params:
                 rope_scaling_type = gguf.RopeScalingType.LINEAR
             elif typ == "yarn":
                 rope_scaling_type = gguf.RopeScalingType.YARN
-                n_orig_ctx = rope_scaling["original_max_position_embeddings"]
-                rope_finetuned = rope_scaling["finetuned"]
+                n_orig_ctx = rope_scaling['original_max_position_embeddings']
+                rope_finetuned = rope_scaling['finetuned']
             else:
-                raise NotImplementedError(f"Unknown rope scaling type: {typ}")
+                raise NotImplementedError(f'Unknown rope scaling type: {typ}')
 
         if "max_sequence_length" in config:
             n_ctx = config["max_sequence_length"]
         elif "max_position_embeddings" in config:
             n_ctx = config["max_position_embeddings"]
         else:
-            raise Exception(
-                "failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
-                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
-            )
+            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
 
-        n_experts = None
+        n_experts      = None
         n_experts_used = None
 
         if "num_local_experts" in config:
@@ -297,30 +244,30 @@ class Params:
             n_experts_used = config["num_experts_per_tok"]
 
         return Params(
-            n_vocab=config["vocab_size"],
-            n_embd=config["hidden_size"],
-            n_layer=config["num_hidden_layers"],
-            n_ctx=n_ctx,
-            n_ff=config["intermediate_size"],
-            n_head=(n_head := config["num_attention_heads"]),
-            n_head_kv=config.get("num_key_value_heads", n_head),
-            n_experts=n_experts,
-            n_experts_used=n_experts_used,
-            f_norm_eps=config["rms_norm_eps"],
-            f_rope_freq_base=config.get("rope_theta"),
-            rope_scaling_type=rope_scaling_type,
-            f_rope_scale=f_rope_scale,
-            n_orig_ctx=n_orig_ctx,
-            rope_finetuned=rope_finetuned,
+            n_vocab           = config["vocab_size"],
+            n_embd            = config["hidden_size"],
+            n_layer           = config["num_hidden_layers"],
+            n_ctx             = n_ctx,
+            n_ff              = config["intermediate_size"],
+            n_head            = (n_head := config["num_attention_heads"]),
+            n_head_kv         = config.get("num_key_value_heads", n_head),
+            n_experts         = n_experts,
+            n_experts_used    = n_experts_used,
+            f_norm_eps        = config["rms_norm_eps"],
+            f_rope_freq_base  = config.get("rope_theta"),
+            rope_scaling_type = rope_scaling_type,
+            f_rope_scale      = f_rope_scale,
+            n_orig_ctx        = n_orig_ctx,
+            rope_finetuned    = rope_finetuned,
         )
 
     # LLaMA v2 70B params.json
     # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
     @staticmethod
-    def load_torch_params(model: LazyModel, config_path: Path) -> "Params":
+    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))
 
-        n_experts = None
+        n_experts      = None
         n_experts_used = None
         f_rope_freq_base = None
 
@@ -343,50 +290,50 @@ class Params:
 
         if config.get("moe"):
             n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
-            n_experts = config["moe"]["num_experts"]
+            n_experts      = config["moe"]["num_experts"]
             n_experts_used = config["moe"]["num_experts_per_tok"]
             f_rope_freq_base = 1e6
 
         return Params(
-            n_vocab=model["tok_embeddings.weight"].shape[0],
-            n_embd=config["dim"],
-            n_layer=config["n_layers"],
-            n_ctx=n_ctx,
-            n_ff=n_ff,
-            n_head=(n_head := config["n_heads"]),
-            n_head_kv=config.get("n_kv_heads", n_head),
-            n_experts=n_experts,
-            n_experts_used=n_experts_used,
-            f_norm_eps=config["norm_eps"],
-            f_rope_freq_base=config.get("rope_theta", f_rope_freq_base),
+            n_vocab          = model["tok_embeddings.weight"].shape[0],
+            n_embd           = config["dim"],
+            n_layer          = config["n_layers"],
+            n_ctx            = n_ctx,
+            n_ff             = n_ff,
+            n_head           = (n_head := config["n_heads"]),
+            n_head_kv        = config.get("n_kv_heads", n_head),
+            n_experts        = n_experts,
+            n_experts_used   = n_experts_used,
+            f_norm_eps       = config["norm_eps"],
+            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
         )
 
     @staticmethod
-    def load(model_plus: ModelPlus) -> "Params":
-        hf_config_path = model_plus.paths[0].parent / "config.json"
+    def load(model_plus: ModelPlus) -> Params:
+        hf_config_path   = model_plus.paths[0].parent / "config.json"
         orig_config_path = model_plus.paths[0].parent / "params.json"
 
         if hf_config_path.exists():
-            params = Params.load_transformers_config(model_plus.model, hf_config_path)
+            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
         elif orig_config_path.exists():
-            params = Params.load_torch_params(model_plus.model, orig_config_path)
-        elif model_plus.format != "none":
+            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
+        elif model_plus.format != 'none':
             params = Params.guessed(model_plus.model)
         else:
-            raise ValueError("Cannot guess params when model format is none")
+            raise ValueError('Cannot guess params when model format is none')
 
         params.path_model = model_plus.paths[0].parent
 
         return params
 
 
-class BpeVocab:  # GPT
-    def __init__(
-        self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
-    ) -> None:
-        self.bpe_tokenizer = json.loads(
-            open(str(fname_tokenizer), encoding="utf-8").read()
-        )
+#
+# vocab
+#
+
+class BpeVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         self.vocab = self.bpe_tokenizer["model"]["vocab"]
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
@@ -394,34 +341,31 @@ class BpeVocab:  # GPT
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             # Fall back to trying to find the added tokens in tokenizer.json
-            tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json"
+            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
             if not tokenizer_json_file.is_file():
                 added_tokens = {}
             else:
                 tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
                 added_tokens = dict(
-                    (item["content"], item["id"])
-                    for item in tokenizer_json.get("added_tokens", [])
+                    (item['content'], item['id'])
+                    for item in tokenizer_json.get('added_tokens', [])
                     # Added tokens here can be duplicates of the main vocabulary.
-                    if item["content"] not in self.bpe_tokenizer
-                )
+                    if item['content'] not in self.bpe_tokenizer)
 
         vocab_size: int = len(self.vocab)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
+        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids      = sorted(added_tokens.values())
         if expected_ids != actual_ids:
             expected_end_id = vocab_size + len(actual_ids) - 1
-            raise Exception(
-                f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}"
-            )
+            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
 
         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [text for (text, idx) in items]
+        self.added_tokens_dict    = added_tokens
+        self.added_tokens_list    = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
+        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer      = fname_tokenizer
+        self.fname_added_tokens   = fname_added_tokens
 
     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@@ -442,10 +386,8 @@ class BpeVocab:  # GPT
         return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-class SentencePieceVocab:  # LlaMa
-    def __init__(
-        self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
-    ) -> None:
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
@@ -455,23 +397,19 @@ class SentencePieceVocab:  # LlaMa
 
         vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
 
-        new_tokens = {
-            id: piece for piece, id in added_tokens.items() if id >= vocab_size
-        }
+        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
         expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids = sorted(new_tokens.keys())
+        actual_new_ids   = sorted(new_tokens.keys())
 
         if expected_new_ids != actual_new_ids:
-            raise ValueError(
-                f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}"
-            )
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
 
         # Token pieces that were added to the base vocabulary.
         self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base = vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
+        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base    = vocab_size
+        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer    = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
 
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -512,11 +450,15 @@ class SentencePieceVocab:  # LlaMa
 
 
 class HfVocab:
-    def __init__(
-        self,
-        fname_tokenizer: Path,
-        fname_added_tokens: Optional[Path] = None,
-    ) -> None:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
+        try:
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "To use HfVocab, please install the `transformers` package. "
+                "You can install it with `pip install transformers`."
+            ) from e
+
         print("fname_tokenizer:", fname_tokenizer)
         # Allow the tokenizer to default to slow or fast versions.
         # Explicitly set tokenizer to use local paths.
@@ -529,7 +471,7 @@ class HfVocab:
         # Initialize lists and dictionaries for added tokens
         self.added_tokens_list = []
         self.added_tokens_dict = dict()
-        self.added_tokens_ids = set()
+        self.added_tokens_ids  = set()
 
         # Process added tokens
         for tok, tokidx in sorted(
@@ -550,12 +492,12 @@ class HfVocab:
 
         # Set vocabulary sizes
         self.vocab_size_base = self.tokenizer.vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
 
-        self.fname_tokenizer = fname_tokenizer
+        self.fname_tokenizer    = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
 
-    def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         reverse_vocab = {
             id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
         }
@@ -573,11 +515,9 @@ class HfVocab:
                 token_id, self.special_ids  # Reuse already stored special IDs
             )
 
-    def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType:
+    def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
         # Determine token type based on whether it's a special token
-        return (
-            gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
-        )
+        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
 
     def get_token_score(self, token_id: int) -> float:
         # Placeholder for actual logic to determine the token's score
@@ -589,7 +529,6 @@ class HfVocab:
             if text in self.specials:
                 toktype = self.get_token_type(self.specials[text], self.special_ids)
                 score = self.get_token_score(self.specials[text])
-
             else:
                 toktype = gguf.TokenType.USER_DEFINED
                 score = -1000.0
@@ -783,7 +722,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
     else:
         model = merge_sharded([mp.model for mp in models_plus])
 
-    return ModelPlus(model, paths, format, vocab)
+    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types
 
 
 def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@@ -871,17 +810,13 @@ class LazyUnpickler(pickle.Unpickler):
     CLASSES: dict[tuple[str, str], Any] = {
         # getattr used here as a workaround for mypy not being smart enough to determine
         # the staticmethods have a __func__ attribute.
-        ("torch._tensor", "_rebuild_from_type_v2"): getattr(
-            rebuild_from_type_v2, "__func__"
-        ),
-        ("torch._utils", "_rebuild_tensor_v2"): getattr(
-            lazy_rebuild_tensor_v2, "__func__"
-        ),
-        ("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16),
-        ("torch", "HalfStorage"): LazyStorageKind(DT_F16),
-        ("torch", "FloatStorage"): LazyStorageKind(DT_F32),
-        ("torch", "IntStorage"): LazyStorageKind(DT_I32),
-        ("torch", "Tensor"): LazyTensor,
+        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
+        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
+        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
+        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
+        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
+        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
+        ('torch', 'Tensor'): LazyTensor,
     }
 
     def find_class(self, module: str, name: str) -> Any:
@@ -968,7 +903,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
         executor_class = ProcessPoolExecutor
     else:
         executor_class = ThreadPoolExecutor
-    with executor_class(max_workers = max_workers) as executor:
+    with executor_class(max_workers=max_workers) as executor:
         futures: list[concurrent.futures.Future[Out]] = []
         done = False
         for _ in range(concurrency):
@@ -1022,12 +957,8 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
 
 
 class OutputFile:
-    def __init__(
-        self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE
-    ) -> None:
-        self.gguf = gguf.GGUFWriter(
-            fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess
-        )
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
 
     def add_meta_arch(self, params: Params) -> None:
         name = "LLaMA"
@@ -1036,21 +967,16 @@ class OutputFile:
         if params.n_ctx == 4096:
             name = "LLaMA v2"
         elif params.path_model is not None:
-            name = str(params.path_model.parent).split("/")[-1]
+            name = str(params.path_model.parent).split('/')[-1]
 
-        self.gguf.add_name(name)
-        self.gguf.add_context_length(params.n_ctx)
-        self.gguf.add_embedding_length(params.n_embd)
-        self.gguf.add_block_count(params.n_layer)
-        self.gguf.add_feed_forward_length(params.n_ff)
+        self.gguf.add_name                (name)
+        self.gguf.add_context_length      (params.n_ctx)
+        self.gguf.add_embedding_length    (params.n_embd)
+        self.gguf.add_block_count         (params.n_layer)
+        self.gguf.add_feed_forward_length (params.n_ff)
         self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
-        self.gguf.add_head_count(params.n_head)
-        self.gguf.add_head_count_kv(params.n_head_kv)
-
-        if params.f_norm_eps is None:
-            raise ValueError("f_norm_eps is None")
-
-        self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
+        self.gguf.add_head_count          (params.n_head)
+        self.gguf.add_head_count_kv       (params.n_head_kv)
 
         if params.n_experts:
             self.gguf.add_expert_count(params.n_experts)
@@ -1058,6 +984,11 @@ class OutputFile:
         if params.n_experts_used:
             self.gguf.add_expert_used_count(params.n_experts_used)
 
+        if params.f_norm_eps:
+            self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
+        else:
+            raise ValueError('f_norm_eps is None')
+
         if params.f_rope_freq_base is not None:
             self.gguf.add_rope_freq_base(params.f_rope_freq_base)
 
@@ -1089,7 +1020,7 @@ class OutputFile:
 
         return tokenizer_model
 
-    def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]:
+    def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
         tokens = []
         scores = []
         toktypes = []
@@ -1124,14 +1055,10 @@ class OutputFile:
 
     def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
         n_elements = int(np.prod(tensor.shape))
-        raw_dtype = getattr(tensor.data_type, "ggml_type", None)
-        data_type = (
-            getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype
-        )
+        raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
+        data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
         data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
-        self.gguf.add_tensor_info(
-            name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype
-        )
+        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
 
     def write_meta(self) -> None:
         self.gguf.write_header_to_file()
@@ -1145,14 +1072,10 @@ class OutputFile:
 
     @staticmethod
     def write_vocab_only(
-        fname_out: Path,
-        params: Params,
-        vocab: Vocab,
-        svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
-        pad_vocab: bool = False,
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
     ) -> None:
-        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -1180,14 +1103,8 @@ class OutputFile:
 
     @staticmethod
     def write_all(
-        fname_out: Path,
-        ftype: GGMLFileType,
-        params: Params,
-        model: LazyModel,
-        vocab: Vocab,
-        svocab: gguf.SpecialVocab,
-        concurrency: int = DEFAULT_CONCURRENCY,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
         pad_vocab: bool = False,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@@ -1207,26 +1124,19 @@ class OutputFile:
         of.write_tensor_info()
 
         # tensor data
-        ndarrays_inner = bounded_parallel_map(
-            OutputFile.do_item, model.items(), concurrency=concurrency
-        )
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
         if ftype == GGMLFileType.MostlyQ8_0:
             ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize,
-                ndarrays_inner,
-                concurrency=concurrency,
-                max_workers=concurrency,
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
                 use_processpool_executor=True,
             )
         else:
             ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
 
         start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(
-            zip(model.items(), ndarrays)
-        ):
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
             elapsed = time.time() - start
-            size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
             padi = len(str(len(model)))
             print(
                 f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
@@ -1363,7 +1273,7 @@ def load_some_model(path: Path) -> ModelPlus:
 class VocabFactory:
     def __init__(self, path: Path):
         self.path = path
-        self.files = {
+        self.files: dict[str, Path | None] = {
             "tokenizer.model": None,
             "vocab.json": None,
             "tokenizer.json": None,
@@ -1380,24 +1290,18 @@ class VocabFactory:
                 self.files[file] = parent_file_path
         print(f"Found vocab files: {self.files}")
 
-    def _select_file(self, vocabtype: Optional[str]) -> Path:
+    def _select_file(self, vocabtype: str | None) -> Path:
         if vocabtype in ["spm", "bpe"]:
             for file_key in self.files.keys():
-                if self.files[file_key]:
-                    return self.files[file_key]
+                if (file := self.files[file_key]) is not None:
+                    return file
             raise FileNotFoundError(f"{vocabtype} vocab not found.")
-        elif vocabtype == "hfft":
+        if vocabtype == "hfft":
             # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
             return self.path
-        else:
-            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
 
-    def _create_special_vocab(
-        self,
-        vocab: Vocab,
-        vocabtype: str,
-        model_parent_path: Path,
-    ) -> gguf.SpecialVocab:
+    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
         load_merges = vocabtype == "bpe"
         n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
         return gguf.SpecialVocab(
@@ -1407,13 +1311,12 @@ class VocabFactory:
             n_vocab=n_vocab,
         )
 
-    def load_vocab(
-        self, vocabtype: str, model_parent_path: Path
-    ) -> Tuple[Vocab, gguf.SpecialVocab]:
+    def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
         path = self._select_file(vocabtype)
         print(f"Loading vocab file '{path}', type '{vocabtype}'")
 
         added_tokens_path = path.parent / "added_tokens.json"
+        vocab: Vocab
         if vocabtype == "bpe":
             vocab = BpeVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
@@ -1428,6 +1331,7 @@ class VocabFactory:
             )
         else:
             raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+        # FIXME: Respect --vocab-dir?
         special_vocab = self._create_special_vocab(
             vocab,
             vocabtype,
@@ -1436,18 +1340,17 @@ class VocabFactory:
         return vocab, special_vocab
 
 
-def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path:
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
     namestr = {
-        GGMLFileType.AllF32: "f32",
+        GGMLFileType.AllF32:    "f32",
         GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ8_0: "q8_0",
+        GGMLFileType.MostlyQ8_0:"q8_0",
     }[file_type]
     ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
     if ret in model_paths:
         sys.stderr.write(
             f"Error: Default output path ({ret}) would overwrite the input. "
-            "Please explicitly specify a path using --outfile.\n"
-        )
+            "Please explicitly specify a path using --outfile.\n")
         sys.exit(1)
     return ret
 
@@ -1457,111 +1360,34 @@ def do_dump_model(model_plus: ModelPlus) -> None:
     print(f"model_plus.format = {model_plus.format!r}")
     print(f"model_plus.vocab = {model_plus.vocab!r}")
     for name, lazy_tensor in model_plus.model.items():
-        print(
-            f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}"
-        )
+        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
 
 
-def get_argument_parser() -> ArgumentParser:
+def main(args_in: list[str] | None = None) -> None:
     output_choices = ["f32", "f16"]
     if np.uint32(1) == np.uint32(1).newbyteorder("<"):
         # We currently only support Q8_0 output on little endian systems.
         output_choices.append("q8_0")
+    vocab_types = ["spm", "bpe", "hfft"]
+    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
+    parser.add_argument("--awq-path",    type=Path,              help="Path to scale awq cache file", default=None)
+    parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model")
+    parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file")
+    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
+    parser.add_argument("--outtype",     choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
+    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
+    parser.add_argument("--vocab-type",  choices=vocab_types,    help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
+    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
+    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
+    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
+    parser.add_argument("--big-endian",  action="store_true",    help="model is executed on big endian machine")
+    parser.add_argument("--pad-vocab",   action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")
 
-    parser = argparse.ArgumentParser(
-        description="Convert a LLaMa model to a GGML compatible file"
-    )
-
-    parser.add_argument(
-        "model",
-        type=Path,
-        help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)",
-    )
-
-    parser.add_argument(
-        "--awq-path",
-        type=Path,
-        help="Path to the Activation-aware Weight Quantization cache file",
-        default=None,
-    )
-
-    parser.add_argument(
-        "--dump",
-        action="store_true",
-        help="Display the model content without converting it",
-    )
-
-    parser.add_argument(
-        "--dump-single",
-        action="store_true",
-        help="Display the content of a single model file without conversion",
-    )
-
-    parser.add_argument(
-        "--vocab-only",
-        action="store_true",
-        help="Extract and output only the vocabulary",
-    )
-
-    parser.add_argument(
-        "--outtype",
-        choices=output_choices,
-        help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
-    )
-
-    parser.add_argument(
-        "--vocab-dir",
-        type=Path,
-        help="Directory containing the tokenizer.model, if separate from the model file",
-    )
-
-    parser.add_argument(
-        "--vocab-type",
-        choices=["spm", "bpe", "hfft"],  # hfft: Hugging Face Fast Tokenizer
-        default="spm",
-        help="The vocabulary format used to define the tokenizer model (default: spm)",
-    )
-
-    parser.add_argument(
-        "--pad-vocab",
-        action="store_true",
-        help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata",
-    )
-
-    parser.add_argument(
-        "--outfile",
-        type=Path,
-        help="Specify the path for the output file (default is based on input)",
-    )
-
-    parser.add_argument(
-        "--ctx", type=int, help="Model training context (default is based on input)"
-    )
-
-    parser.add_argument(
-        "--concurrency",
-        type=int,
-        help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
-        default=DEFAULT_CONCURRENCY,
-    )
-
-    parser.add_argument(
-        "--big-endian",
-        action="store_true",
-        help="Indicate that the model is executed on a big-endian machine",
-    )
-
-    return parser
-
-
-def main(argv: Optional[list[str]] = None) -> None:
-    parser = get_argument_parser()
-    args = parser.parse_args(argv)
-
+    args = parser.parse_args(args_in)
     if args.awq_path:
-        sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py"))
-        from awq.apply_awq import add_scale_weights
-
+        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
+        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
         tmp_model_path = args.model / "weighted_model"
         if tmp_model_path.is_dir():
             print(f"{tmp_model_path} exists as a weighted model.")
@@ -1580,14 +1406,11 @@ def main(argv: Optional[list[str]] = None) -> None:
     if not args.vocab_only:
         model_plus = load_some_model(args.model)
     else:
-        model_plus = ModelPlus(
-            model={}, paths=[args.model / "dummy"], format="none", vocab=None
-        )
+        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
 
     if args.dump:
         do_dump_model(model_plus)
         return
-
     endianess = gguf.GGUFEndian.LITTLE
     if args.big_endian:
         endianess = gguf.GGUFEndian.BIG
@@ -1595,12 +1418,10 @@ def main(argv: Optional[list[str]] = None) -> None:
     params = Params.load(model_plus)
     if params.n_ctx == -1:
         if args.ctx is None:
-            raise Exception(
-                "The model doesn't have a context size, and you didn't specify one with --ctx\n"
-                "Please specify one with --ctx:\n"
-                " - LLaMA v1: --ctx 2048\n"
-                " - LLaMA v2: --ctx 4096\n"
-            )
+            raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
+                            "Please specify one with --ctx:\n"
+                            " - LLaMA v1: --ctx 2048\n"
+                            " - LLaMA v2: --ctx 4096\n")
         params.n_ctx = args.ctx
 
     if args.outtype:
@@ -1621,42 +1442,30 @@ def main(argv: Optional[list[str]] = None) -> None:
         if not args.outfile:
             raise ValueError("need --outfile if using --vocab-only")
         outfile = args.outfile
-        OutputFile.write_vocab_only(
-            outfile,
-            params,
-            vocab,
-            special_vocab,
-            endianess=endianess,
-            pad_vocab=args.pad_vocab,
-        )
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess=endianess, pad_vocab=args.pad_vocab)
         print(f"Wrote {outfile}")
         return
 
     if model_plus.vocab is not None and args.vocab_dir is None:
         vocab = model_plus.vocab
 
-    model = model_plus.model
-    model = convert_model_names(model, params)
-    ftype = pick_output_type(model, args.outtype)
-    model = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_output_file(model_plus.paths, ftype)
+    print(f"Vocab info: {vocab}")
+    print(f"Special vocab info: {special_vocab}")
+
+    model   = model_plus.model
+    model   = convert_model_names(model, params)
+    ftype   = pick_output_type(model, args.outtype)
+    model   = convert_to_output_type(model, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
 
     params.ftype = ftype
     print(f"Writing {outfile}, format {ftype}")
 
-    OutputFile.write_all(
-        outfile,
-        ftype,
-        params,
-        model,
-        vocab,
-        special_vocab,
-        concurrency=args.concurrency,
-        endianess=endianess,
-        pad_vocab=args.pad_vocab,
-    )
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
     print(f"Wrote {outfile}")
 
 
-if __name__ == "__main__":
-    main(sys.argv[1:])  # Exclude the first element (script name) from sys.argv
+if __name__ == '__main__':
+    main()
diff --git a/mypy.ini b/mypy.ini
index 7215a05dd..e51910ca7 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -4,3 +4,4 @@ allow_untyped_calls = true
 allow_untyped_defs = true
 allow_incomplete_defs = true
 disable_error_code = import-untyped
+warn_return_any = false

From 942c0107a7301434c0a5e7da46bc4cf2393aa556 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 21 Jan 2024 05:17:27 +0200
Subject: [PATCH 089/138] flake.lock: Update (#5054)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/9b19f5e77dd906cb52dade0b7bd280339d2a1f3d' (2024-01-13)
  → 'github:NixOS/nixpkgs/bbe7d8f876fbbe7c959c90ba2ae2852220573261' (2024-01-19)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index cd532ef4f..1b253cb44 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1705133751,
-        "narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
+        "lastModified": 1705677747,
+        "narHash": "sha256-eyM3okYtMgYDgmYukoUzrmuoY4xl4FUujnsv/P6I/zI=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
+        "rev": "bbe7d8f876fbbe7c959c90ba2ae2852220573261",
         "type": "github"
       },
       "original": {

From 726c0fa9a2da976e9c5d5c51e185d9dd453fc9e5 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Sun, 21 Jan 2024 08:01:20 +0200
Subject: [PATCH 090/138] Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 examples/imatrix/imatrix.cpp | 74 ++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 25 deletions(-)

diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 5a3d30b88..5687476cd 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -248,7 +248,7 @@ static void process_logits(
     }
 }
 
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
 
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
@@ -269,10 +269,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
     }
 
     std::vector<float> logit_history;
-    logit_history.resize(tokens.size());
-
     std::vector<float> prob_history;
-    prob_history.resize(tokens.size());
+
+    if (compute_ppl) {
+        logit_history.resize(tokens.size());
+        prob_history.resize(tokens.size());
+    }
 
     const int n_chunk_max = tokens.size() / n_ctx;
 
@@ -288,12 +290,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
+    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
+
+    std::vector<float> logits;
+    if (compute_ppl && num_batches > 1) {
+        logits.reserve((size_t)n_ctx * n_vocab);
+    }
+
     for (int i = 0; i < n_chunk; ++i) {
         const int start =     i * n_ctx;
         const int end   = start + n_ctx;
 
-        const int num_batches = (n_ctx + n_batch - 1) / n_batch;
-
         std::vector<float> logits;
 
         const auto t_start = std::chrono::high_resolution_clock::now();
@@ -321,8 +328,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
             // restore the original token in case it was set to BOS
             tokens[batch_start] = token_org;
 
-            const auto * batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            if (compute_ppl && num_batches > 1) {
+                const auto * batch_logits = llama_get_logits(ctx);
+                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            }
         }
 
         const auto t_end = std::chrono::high_resolution_clock::now();
@@ -338,25 +347,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
             fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
         }
 
-        const int first = n_ctx/2;
-        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
-        count += n_ctx - first - 1;
+        if (compute_ppl) {
+            const int first = n_ctx/2;
+            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
+            count += n_ctx - first - 1;
 
-        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
-        fflush(stdout);
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            fflush(stdout);
+
+            logits.clear();
+        }
     }
     printf("\n");
 
-    nll2 /= count;
-    nll /= count;
-    const double ppl = exp(nll);
-    nll2 -= nll * nll;
-    if (nll2 > 0) {
-        nll2 = sqrt(nll2/(count-1));
-        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
-    } else {
-        printf("Unexpected negative standard deviation of log(prob)\n");
+    if (compute_ppl) {
+        nll2 /= count;
+        nll /= count;
+        const double ppl = exp(nll);
+        nll2 -= nll * nll;
+        if (nll2 > 0) {
+            nll2 = sqrt(nll2/(count-1));
+            printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+        } else {
+            printf("Unexpected negative standard deviation of log(prob)\n");
+        }
     }
 
     return true;
@@ -365,6 +381,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 int main(int argc, char ** argv) {
 
     StatParams sparams;
+    bool compute_ppl = true;
     std::vector<char*> args;
     args.push_back(argv[0]);
     int iarg = 1;
@@ -381,12 +398,19 @@ int main(int argc, char ** argv) {
         }
         else if (arg == "--verbosity") {
             sparams.verbosity = std::stoi(argv[++iarg]);
+        } else if (arg == "--no-ppl") {
+            compute_ppl = false;
         } else {
             args.push_back(argv[iarg]);
         }
     }
     if (iarg < argc) {
-        args.push_back(argv[iarg]);
+        std::string arg{argv[iarg]};
+        if (arg == "--no-ppl") {
+            compute_ppl = false;
+        } else {
+            args.push_back(argv[iarg]);
+        }
     }
 
     gpt_params params;
@@ -448,7 +472,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s\n", get_system_info(params).c_str());
     }
 
-    bool OK = compute_imatrix(ctx, params);
+    bool OK = compute_imatrix(ctx, params, compute_ppl);
     if (!OK) {
         return 1;
     }

From 7dcbe39d36b76389f6c5cd3b151928472b7e22ff Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Sun, 21 Jan 2024 14:42:44 +0200
Subject: [PATCH 091/138] Add ability to evauate multiple choice tasks  (#5047)

* TruthfulQA: 1st attempt, does not look like it is working

The same implementation can be used for HellaSwag as well,
so I converted a HellaSwag validation dataset to the binary
format used here and tested with that. The score is only
around 50, so something is not quite right.

* TruthfulQA: works but the result is bad

I know it works because if I convert the HellaSwag validation
data to the binary format used in the truthful_qa_score() function
I get the exact same result as from the hellaswag_score() function.
But I guess, the questions are tricky and the way I have done
the combination of question + answer is very likely not the best.
The TruthfulQA validation dataset contains 817 questions, with
random chance result around 19%. With this version I get
29.1% for Mistral-7B and 55.2% for Mistral-7B-Instruct-v0.2.
The HF leader board results for these two models are
42.2% and 68.3%, respectively.

* TruthfulQA: fix random sample

* TruthfulQA: prepare tasks in parallel for large test datasets

* Rename truthful_qa to multiple_choice

* Make MSVC happy

I had forgotten that MSVC does not make constexpr's available
inside a lambda.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 common/common.cpp                  |  31 +++
 common/common.h                    |   3 +
 examples/perplexity/perplexity.cpp | 391 ++++++++++++++++++++++++++++-
 3 files changed, 422 insertions(+), 3 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index ce20360a4..0e4b8bab2 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -203,6 +203,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.prompt_cache_all = true;
         } else if (arg == "--prompt-cache-ro") {
             params.prompt_cache_ro = true;
+        } else if (arg == "-bf" || arg == "--binary-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i], std::ios::binary);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            // store the external file name in params
+            params.prompt_file = argv[i];
+            file.seekg(0, std::ios::end);
+            size_t size = file.tellg();
+            file.seekg(0, std::ios::beg);
+            params.prompt.resize(size);
+            file.read((char *)params.prompt.data(), size);
+            fprintf(stderr, "Read %zu bytes from binary file %s\n", size, argv[i]);
         } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -689,6 +708,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.winogrande_tasks = std::stoi(argv[i]);
+        } else if (arg == "--multiple-choice") {
+            params.multiple_choice = true;
+        } else if (arg == "--multiple-choice-tasks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.multiple_choice_tasks = std::stoi(argv[i]);
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
@@ -888,6 +915,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
     printf("  -f FNAME, --file FNAME\n");
     printf("                        prompt file to start generation.\n");
+    printf("  -bf FNAME, --binary-file FNAME\n");
+    printf("                        binary file containing multiple choice tasks.\n");
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -936,6 +965,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
     printf("  --winogrande          compute Winogrande score over random tasks from datafile supplied with -f\n");
     printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
+    printf("  --multiple-choice     compute multiple choice score over random tasks from datafile supplied with -f\n");
+    printf("  --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
     printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
     printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
diff --git a/common/common.h b/common/common.h
index 0ae9c18b3..c69ad7e94 100644
--- a/common/common.h
+++ b/common/common.h
@@ -108,6 +108,9 @@ struct gpt_params {
     bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
     size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
 
+    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+
     bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
     bool random_prompt     = false; // do not randomize prompt if none provided
     bool use_color         = false; // use color to distinguish generations and inputs
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index f91f5795a..b7ef9a084 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -540,14 +540,14 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     // This is needed as usual for LLaMA models
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
+    // The tasks should be randomized so the score stabilizes quickly.
+    bool randomize_tasks = true;
+
     // Number of tasks to use when computing the score
     if (params.hellaswag_tasks < hs_task_count) {
         hs_task_count = params.hellaswag_tasks;
     }
 
-    // The tasks should be randomized so the score stabilizes quickly.
-    bool randomize_tasks = true;
-
     // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
     std::mt19937 rng(1);
 
@@ -1031,6 +1031,389 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
 }
 
+static bool deserialize_string(std::istream& in, std::string& str) {
+    uint32_t size;
+    if (!in.read((char *)&size, sizeof(size)).fail()) {
+        str.resize(size);
+        if (!in.read((char *)str.data(), size).fail()) return true;
+    }
+    return false;
+}
+
+struct multiple_choice_answers {
+    std::vector<std::string> answers;
+    std::vector<int>         labels;
+    bool deserialize(std::istream& in) {
+        uint32_t n;
+        in.read((char *)&n, sizeof(n));
+        if (in.fail() || n > 100) return false; // 100 as max. number of answers should be good enough for any practical purpose
+        answers.resize(n);
+        labels.resize(n);
+        for (auto& a : answers) {
+            if (!deserialize_string(in, a)) return false;
+        }
+        in.read((char *)labels.data(), n*sizeof(int));
+        return !in.fail();
+    }
+};
+
+struct multiple_choice_task {
+    std::string question;         // the question (or context that needs to be continued)
+    multiple_choice_answers mc1;  // possible answers (continuations) with a single correct answer
+    multiple_choice_answers mc2;  // possible answers (continuations) with multiple correct answers - not handled yet
+    bool deserialize(std::istream& in) {
+        if (!deserialize_string(in, question)) return false;
+        return mc1.deserialize(in) && mc2.deserialize(in);
+    }
+
+    // For evaluation
+    size_t i_batch;         // starting index in the llama_batch
+    size_t common_prefix;   // max number of initial tokens that are the same in all sentences
+    size_t required_tokens; // needed number of tokens to evaluate all answers
+    std::vector<std::vector<llama_token>> seq_tokens;
+    std::vector<float> log_probs;
+};
+
+static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
+    if (task.question.empty() || task.mc1.answers.empty()) {
+        if (log_error) {
+            printf("%s: found bad task with empty question and/or answers\n", __func__);
+        }
+        return false;
+    }
+    task.seq_tokens.reserve(task.mc1.answers.size());
+    for (auto& answer : task.mc1.answers) {
+        if (answer.empty()) {
+            if (log_error) {
+                printf("%s: found empty answer\n", __func__);
+            }
+            return false;
+        }
+        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
+    }
+    auto min_len = task.seq_tokens.front().size();
+    for (auto& seq : task.seq_tokens) {
+        min_len = std::min(min_len, seq.size());
+    }
+    task.common_prefix = 0;
+    for (size_t k = 0; k < min_len; ++k) {
+        auto token = task.seq_tokens[0][k];
+        bool all_same = true;
+        for (size_t i = 1; i < task.seq_tokens.size(); ++i) {
+            if (task.seq_tokens[i][k] != token) {
+                all_same = false;
+                break;
+            }
+        }
+        if (!all_same) {
+            break;
+        }
+        ++task.common_prefix;
+    }
+    task.required_tokens = task.common_prefix;
+    for (auto& seq : task.seq_tokens) {
+        task.required_tokens += seq.size() - task.common_prefix;
+    }
+    return true;
+}
+
+//
+// Calculates score for multiple choice tasks with single correct answer from prompt.
+// Commonly used LLM evaluation metrics of this type are
+//   * ARC
+//   * HellaSwag
+//   * MMLU
+//   * TruthfulQA
+//
+// Validation datasets for these 4 tests can be found at
+//     https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp
+// The data for these datasets was extracted from
+//     git@hf.co:datasets/allenai/ai2_arc
+//     https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
+//     git@hf.co:datasets/Stevross/mmlu
+//     https://huggingface.co/datasets/truthful_qa
+//
+static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
+
+    std::istringstream strstream(params.prompt);
+    uint32_t n_task;
+    strstream.read((char *)&n_task, sizeof(n_task));
+    if (strstream.fail() || n_task == 0) {
+        printf("%s: no tasks\n", __func__);
+        return;
+    }
+    printf("%s: there are %u tasks in prompt\n", __func__, n_task);
+    std::vector<uint32_t> task_pos(n_task);
+    strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
+    if (strstream.fail()) {
+        printf("%s: failed to raad task positions from prompt\n", __func__);
+        return;
+    }
+
+    std::vector<multiple_choice_task> tasks;
+    if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
+        // Use all tasks
+        tasks.resize(n_task);
+        printf("%s: reading tasks", __func__);
+        int n_dot = n_task/100;
+        int i = 0;
+        for (auto& task : tasks) {
+            ++i;
+            if (!task.deserialize(strstream)) {
+                printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
+                return;
+            }
+            if (i%n_dot == 0) printf(".");
+        }
+        printf("done\n");
+    }
+    else {
+        printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
+        std::mt19937 rng(1);
+        std::vector<int> aux(n_task);
+        for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
+        float scale = 1.f/(1.f + (float)std::mt19937::max());
+        tasks.resize(params.multiple_choice_tasks);
+        for (auto& task : tasks) {
+            int j = (int)(scale * rng() * aux.size());
+            int idx = aux[j];
+            aux[j] = aux.back();
+            aux.pop_back();
+            strstream.seekg(task_pos[idx], std::ios::beg);
+            if (!task.deserialize(strstream)) {
+                printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
+                return;
+            }
+        }
+        n_task = params.multiple_choice_tasks;
+    }
+
+    // This is needed as usual for LLaMA models
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+    printf("%s: preparing task data", __func__);
+    fflush(stdout);
+    if (n_task > 500) {
+        printf("...");
+        fflush(stdout);
+        std::atomic<int> counter(0);
+        std::atomic<int> n_bad(0);
+        auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
+            int num_tasks = tasks.size();
+            int n_bad_local = 0;
+            while (true) {
+                int first = counter.fetch_add(K_TOKEN_CHUNK);
+                if (first >= num_tasks) {
+                    if (n_bad_local > 0) n_bad += n_bad_local;
+                    break;
+                }
+                int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
+                for (int i = first; i < last; ++i) {
+                    if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
+                }
+            }
+        };
+        size_t max_thread = std::thread::hardware_concurrency();
+        max_thread = std::min(max_thread, (tasks.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK);
+        std::vector<std::thread> workers(max_thread-1);
+        for (auto& w : workers) w = std::thread(prepare);
+        prepare();
+        for (auto& w : workers) w.join();
+        printf("done\n");
+        fflush(stdout);
+        int nbad = n_bad;
+        if (nbad > 0) {
+            printf("%s: found %d malformed tasks\n", __func__, nbad);
+            return;
+        }
+    } else {
+        int n_dot = n_task/100;
+        int i_task = 0;
+        for (auto& task : tasks) {
+            ++i_task;
+            if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
+                return;
+            }
+            if (i_task%n_dot == 0) {
+                printf(".");
+                fflush(stdout);
+            }
+        }
+        printf("done\n");
+    }
+
+    printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
+
+    printf("\ntask\tacc_norm\n");
+
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_ctx   = llama_n_ctx(ctx);
+    const int n_batch = params.n_batch;
+
+    const int max_tasks_per_batch = 32;
+    const int max_seq = 4*max_tasks_per_batch;
+
+    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
+
+    std::vector<float> tok_logits(n_vocab);
+    std::vector<float> batch_logits(n_vocab*n_ctx);
+
+    std::vector<std::pair<size_t, llama_token>> eval_pairs;
+    std::vector<float> eval_results;
+    std::vector<std::thread> workers(std::thread::hardware_concurrency());
+    std::vector<int> batch_indeces;
+
+    int n_done = 0;
+    int n_correct = 0;
+    int n_tot_answers = 0;
+
+    for (size_t i0 = 0; i0 < tasks.size(); i0++) {
+        int n_cur = 0;
+
+        size_t i1 = i0;
+        size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
+
+        llama_batch_clear(batch);
+
+        // batch as much tasks as possible into the available context
+        // each task has 4 unique seuqnce ids - one for each ending
+        // the common prefix is shared among the 4 sequences to save tokens
+        // we extract logits only from the last common token and from all ending tokens of each sequence
+        int s0 = 0;
+        while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
+            auto& cur_task = tasks[i1];
+
+            int num_answers = cur_task.seq_tokens.size();
+            if (s0 + num_answers > max_seq) {
+                break;
+            }
+
+            if (int(batch_indeces.size()) != num_answers) {
+                batch_indeces.resize(num_answers);
+            }
+            for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
+
+            for (size_t i = 0; i < cur_task.common_prefix; ++i) {
+                //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
+                llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
+            }
+            batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
+
+            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
+                for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) {
+                    llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
+                }
+            }
+
+            s0 += num_answers;
+
+            cur_task.i_batch = i_batch;
+            i_batch += cur_task.required_tokens;
+
+            n_cur += cur_task.required_tokens;
+            if (++i1 == tasks.size()) {
+                break;
+            }
+        }
+
+        if (i0 == i1) {
+            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+            return;
+        }
+
+        llama_kv_cache_clear(ctx);
+
+        // decode all tasks [i0, i1)
+        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
+            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+            return;
+        }
+
+        // Compute log-probs in parallel
+        // First we collect all tasks
+        eval_pairs.clear();
+        for (size_t i = i0; i < i1; ++i) {
+            auto& cur_task = tasks[i];
+            size_t li = cur_task.common_prefix;
+            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
+                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
+                    eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]));
+                }
+                ++li;
+            }
+        }
+        // Then we do the actual calculation
+        compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
+
+        size_t ir = 0;
+
+        // compute the logprobs for each ending of the decoded tasks
+        for (size_t i = i0; i < i1; ++i) {
+            auto & cur_task = tasks[i];
+            //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
+            //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
+            //    if (cur_task.mc1.labels[j] == 1) {
+            //        printf("%d", j+1);
+            //    }
+            //}
+            //printf("\n    common_prefix: %zu\n", cur_task.common_prefix);
+
+            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float));
+
+            const auto first_probs = softmax(tok_logits);
+
+            cur_task.log_probs.resize(cur_task.seq_tokens.size());
+            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
+                size_t count = 1;
+                float  log_prob  = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
+                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
+                    //printf("        %zu  %g\n", ir, eval_results[ir]);
+                    ++count;
+                    log_prob += eval_results[ir++];
+                }
+                cur_task.log_probs[s] = log_prob / count;
+                //printf("        Final: %g\n", log_prob / count);
+                //printf("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
+            }
+
+            // Find the ending with maximum logprob
+            size_t logprob_max_idx = 0;
+            float  logprob_max_val = cur_task.log_probs[0];
+            for (size_t s = 1; s < cur_task.log_probs.size(); s++) {
+                if (cur_task.log_probs[s] > logprob_max_val) {
+                    logprob_max_val = cur_task.log_probs[s];
+                    logprob_max_idx = s;
+                }
+            }
+
+            n_tot_answers += cur_task.log_probs.size();
+            if (cur_task.mc1.labels[logprob_max_idx] == 1) {
+                ++n_correct;
+            }
+            ++n_done;
+
+            // Print the accumulated accuracy mean x 100
+            printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
+            fflush(stdout);
+        }
+
+        i0 = i1 - 1;
+    }
+
+    llama_batch_free(batch);
+
+    if (n_done < 100) return;
+
+    float p = 1.f*n_correct/n_done;
+    float sigma = sqrt(p*(1-p)/(n_done-1));
+    printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+    p = 1.f*n_done/n_tot_answers;
+    sigma = sqrt(p*(1-p)/(n_done-1));
+    printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+
+    printf("\n");
+}
+
 
 int main(int argc, char ** argv) {
     gpt_params params;
@@ -1091,6 +1474,8 @@ int main(int argc, char ** argv) {
         hellaswag_score(ctx, params);
     } else if (params.winogrande) {
         winogrande_score(ctx, params);
+    } else if (params.multiple_choice) {
+        multiple_choice_score(ctx, params);
     } else {
         results = perplexity(ctx, params);
     }

From 6c5629d4d2d15557c38a0e609b30c1a42abad80d Mon Sep 17 00:00:00 2001
From: bobqianic <129547291+bobqianic@users.noreply.github.com>
Date: Sun, 21 Jan 2024 15:17:35 +0000
Subject: [PATCH 092/138] add `#include <string>` to unicode.h (#5051)

Co-authored-by: Jared Van Bortel <jared@nomic.ai>
---
 unicode.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/unicode.h b/unicode.h
index aeca879ea..844eff3da 100644
--- a/unicode.h
+++ b/unicode.h
@@ -2,8 +2,9 @@
 
 #include <cassert>
 #include <stdexcept>
-#include <vector>
+#include <string>
 #include <unordered_map>
+#include <vector>
 
 static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
 {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},

From 05490fad7f7f60ff2bed9ad05cd81b44e82ccde3 Mon Sep 17 00:00:00 2001
From: kuronekosaiko <EvanChanJ@163.com>
Date: Mon, 22 Jan 2024 00:28:14 +0800
Subject: [PATCH 093/138] add safetensors support to convert-lora-to-ggml.py
 (#5062)

* add safetensors support to convert-lora-to-ggml.py

* Update convert-lora-to-ggml.py

Remove white space in line 69.
---
 convert-lora-to-ggml.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py
index 4904bf128..9a9936dec 100755
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -59,7 +59,14 @@ if __name__ == '__main__':
     input_model = os.path.join(sys.argv[1], "adapter_model.bin")
     output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
 
-    model = torch.load(input_model, map_location="cpu")
+    if os.path.exists(input_model):
+        model = torch.load(input_model, map_location="cpu")
+    else:
+        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
+        # lazy import load_file only if lora is in safetensors format.
+        from safetensors.torch import load_file
+        model = load_file(input_model, device="cpu")
+
     arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
 
     if arch_name not in gguf.MODEL_ARCH_NAMES.values():

From 504dc37be8446fb09b1ede70300250ad41be32a2 Mon Sep 17 00:00:00 2001
From: iSma <ismail.senhaji@gmail.com>
Date: Sun, 21 Jan 2024 22:37:13 +0100
Subject: [PATCH 094/138] Revert LLAMA_NATIVE to OFF in flake.nix (#5066)

---
 .devops/nix/package.nix | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 43bdbd755..1ef2c6cd9 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -159,7 +159,7 @@ effectiveStdenv.mkDerivation (
 
     cmakeFlags =
       [
-        (cmakeBool "LLAMA_NATIVE" true)
+        (cmakeBool "LLAMA_NATIVE" false)
         (cmakeBool "LLAMA_BUILD_SERVER" true)
         (cmakeBool "BUILD_SHARED_LIBS" true)
         (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)

From 3466c6ebcf668cac453f891b493ead19283347a8 Mon Sep 17 00:00:00 2001
From: Shijie <821898965@qq.com>
Date: Mon, 22 Jan 2024 15:33:19 +0800
Subject: [PATCH 095/138] llama : add more qwen2 models (#5071)

---
 llama.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 909ad4ad8..9ad74d735 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1325,8 +1325,10 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_0_5B,
     MODEL_1B,
     MODEL_3B,
+    MODEL_4B,
     MODEL_7B,
     MODEL_8B,
     MODEL_13B,
@@ -2892,9 +2894,9 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
-                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_7B; break;
-                    case 40: model.type = e_model::MODEL_13B; break;
+                    case 40: model.type = hparams.n_head == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
                     case 80: model.type = e_model::MODEL_70B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }

From 57744932c64266359ee905518de7e096c0295d8c Mon Sep 17 00:00:00 2001
From: bobqianic <129547291+bobqianic@users.noreply.github.com>
Date: Mon, 22 Jan 2024 08:55:05 +0000
Subject: [PATCH 096/138] ci : fix Windows CI by updating Intel SDE version
 (#5053)

---
 .github/workflows/build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 367df07a7..c3aa6f992 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -295,7 +295,7 @@ jobs:
       OPENBLAS_VERSION: 0.3.23
       OPENCL_VERSION: 2023.04.17
       CLBLAST_VERSION: 1.6.0
-      SDE_VERSION: 9.21.1-2023-04-24
+      SDE_VERSION: 9.33.0-2024-01-07
 
     strategy:
       matrix:
@@ -400,7 +400,7 @@ jobs:
         id: cmake_test_sde
         if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
         run: |
-          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
+          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
           # for some weird reason windows tar doesn't like sde tar.xz
           7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
           7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar

From 66d575c45c5a370d668f9c3283cdf348e2329fa2 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Mon, 22 Jan 2024 12:43:33 +0200
Subject: [PATCH 097/138] llama : add Q3_K_XS (#5060)

* Add Q3_K_XS - intermediate size between Q2_K and Q3_K_S

* Q3_K_XS: quanize first 1/8 of ffn_down layers with Q4_K

Together with an importance matrix, this brings perplexity
for LLaMA-v2-70B below the perplexity of the former Q2_K
with a 800 MB smaller quantized model size.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 examples/quantize/quantize.cpp |  1 +
 llama.cpp                      | 62 +++++++++++++++++++++++++---------
 llama.h                        |  1 +
 3 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 2ae046933..f4786157e 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+    { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization"   , },
     { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
     { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
     { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
diff --git a/llama.cpp b/llama.cpp
index 9ad74d735..c56d31163 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2661,6 +2661,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
 
         default: return "unknown, may not work";
     }
@@ -8765,9 +8766,13 @@ struct quantize_state_internal {
     const llama_model_quantize_params * params;
 
     int n_attention_wv    = 0;
-    int n_feed_forward_w2 = 0;
+    int n_ffn_down        = 0;
+    int n_ffn_gate        = 0;
+    int n_ffn_up          = 0;
     int i_attention_wv    = 0;
-    int i_feed_forward_w2 = 0;
+    int i_ffn_down        = 0;
+    int i_ffn_gate        = 0;
+    int i_ffn_up          = 0;
 
     int n_k_quantized     = 0;
     int n_fallback        = 0;
@@ -8870,8 +8875,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             ++qs.i_attention_wv;
         }
         else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
-            ++qs.i_feed_forward_w2;
+            if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
+            ++qs.i_ffn_down;
         }
         else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
     } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -8908,18 +8913,21 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+            new_type = GGML_TYPE_Q2_K;
+        }
     } else if (name.find("ffn_down") != std::string::npos) {
         const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
         int i_layer, n_layer;
         if (n_expert == 1) {
-            i_layer = qs.i_feed_forward_w2;
-            n_layer = qs.n_feed_forward_w2;
+            i_layer = qs.i_ffn_down;
+            n_layer = qs.n_ffn_down;
         } else {
             // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
-            // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
             // for getting the current layer as I initially thought, and we need to resort to parsing the
             // tensor name.
-            n_layer = qs.n_feed_forward_w2 / n_expert;
+            n_layer = qs.n_ffn_down / n_expert;
             if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
                 throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
             }
@@ -8928,7 +8936,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             }
         }
         if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8958,11 +8966,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
             new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
         }
-        ++qs.i_feed_forward_w2;
+        ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                     new_type = GGML_TYPE_Q5_K;
                 }
@@ -8980,6 +8989,20 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
+    else if (name.find("ffn_gate") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        ++qs.i_ffn_gate;
+    }
+    else if (name.find("ffn_up") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        ++qs.i_ffn_up;
+    }
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
     //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
     //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -9034,8 +9057,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_ALL_F32:     quantized_type = GGML_TYPE_F32;  break;
 
         // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
         case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9103,12 +9127,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down") != std::string::npos) {
-            ++qs.n_feed_forward_w2;
+            ++qs.n_ffn_down;
+        }
+        else if (name.find("ffn_gate") != std::string::npos) {
+            ++qs.n_ffn_gate;
+        }
+        else if (name.find("ffn_up") != std::string::npos) {
+            ++qs.n_ffn_up;
         }
     }
-    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
-        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
+    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
+                __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
     }
 
     size_t total_size_org = 0;
diff --git a/llama.h b/llama.h
index e268d7a1d..bb6054557 100644
--- a/llama.h
+++ b/llama.h
@@ -107,6 +107,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_XS       = 22, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };

From 152d9d05e097e35f1cac21262946d57faec7542a Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Mon, 22 Jan 2024 12:11:01 +0100
Subject: [PATCH 098/138] finetune : print sample-start/include-sample-start
 (#5072)

This commit adds `--sample-start` and `--include-sample-start` to the
output from the main function in finetune.cpp.

The motivation for this is that even though these are set explicitly by
the user via the command line, if one forgets to set them then it is
useful to have their values printed out. Otherwise it is possible to go
through the whole training process before realizing that the values are
not what one expected.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 examples/finetune/finetune.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 11fcbf443..b7e19c5fe 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1800,6 +1800,8 @@ int main(int argc, char ** argv) {
     std::vector<size_t> train_samples_begin;
     std::vector<size_t> train_samples_size;
     printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
+    printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
+    printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
     tokenize_file(lctx,
             params.common.fn_train_data,
             params.common.sample_start,

From d6bd4d46ddb6926087c11e0f6633ab1c81da58c3 Mon Sep 17 00:00:00 2001
From: compilade <113953597+compilade@users.noreply.github.com>
Date: Mon, 22 Jan 2024 06:21:52 -0500
Subject: [PATCH 099/138] llama : support StableLM 2 1.6B (#5052)

* llama : support StableLM 2 1.6B

* convert : fix Qwen's set_vocab wrongly naming all special tokens [PAD{id}]

* convert : refactor Qwen's set_vocab to use it for StableLM 2 too

* nix : add tiktoken to llama-python-extra

* convert : use presence of tokenizer.json to determine StableLM tokenizer loader

It's a less arbitrary heuristic than the vocab size.
---
 .devops/nix/package.nix |   1 +
 convert-hf-to-gguf.py   | 106 +++++++++++++++++++++++-----------------
 llama.cpp               |  18 +++++++
 3 files changed, 79 insertions(+), 46 deletions(-)

diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 1ef2c6cd9..e8534956f 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -73,6 +73,7 @@ let
     ps: [
       ps.numpy
       ps.sentencepiece
+      ps.tiktoken
       ps.torchWithoutCuda
       ps.transformers
     ]
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 4d995ef78..7a0a8c3db 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -289,6 +289,58 @@ class Model:
         special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
+    def _set_vocab_qwen(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) == 2
+            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.special_tokens
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                pad_token = f"[PAD{i}]".encode("utf-8")
+                tokens.append(bytearray(pad_token))
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.CONTROL)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        if len(special_vocab.special_token_ids) == 0:
+            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
+            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_sentencepiece(self):
         from sentencepiece import SentencePieceProcessor
 
@@ -877,6 +929,13 @@ class PersimmonModel(Model):
 
 
 class StableLMModel(Model):
+    def set_vocab(self):
+        if (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
+            self._set_vocab_qwen()
+
     def set_gguf_parameters(self):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]
@@ -922,52 +981,7 @@ class QwenModel(Model):
         return parts
 
     def set_vocab(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokens: list[bytearray] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams["vocab_size"]
-        assert max(tokenizer.get_vocab().values()) < vocab_size
-
-        merges = []
-        vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[self.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert len(merged) == 2
-            merges.append(' '.join(map(self.token_bytes_to_string, merged)))
-
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.special_tokens
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode("utf-8")
-                tokens.append(bytearray(pad_token))
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.CONTROL)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.merges = merges
-        special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab.add_to_gguf(self.gguf_writer)
+        self._set_vocab_qwen()
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_name("Qwen")
diff --git a/llama.cpp b/llama.cpp
index c56d31163..8c906a22f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2877,6 +2877,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                }
@@ -3700,6 +3701,11 @@ static bool llm_load_tensors(
                         layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
                         layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
+                        // optional bias tensors, present in Stable LM 2 1.6B
+                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     false);
+                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, false);
+                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, false);
+
                         layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                         layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
 
@@ -5598,12 +5604,24 @@ struct llm_build_context {
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
 
                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
 
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
 
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,

From 15bceec2d73d4166340b46b27677c45ac1b4cdad Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Mon, 22 Jan 2024 14:18:43 +0200
Subject: [PATCH 100/138] imatrix : keep intermediate imatrix results (#5077)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 examples/imatrix/imatrix.cpp | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 5687476cd..ea06fcdbf 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -26,6 +26,7 @@ struct StatParams {
     std::string ofile = "imatrix.dat";
     int         n_output_frequency = 10;
     int         verbosity = 1;
+    int         keep_every = 0;
     bool        collect_output_weight = false;
 };
 
@@ -42,6 +43,9 @@ private:
     int                                    m_last_call = 0;
     std::vector<float>                     m_src1_data;
     std::vector<int>                       m_ids; // the expert ids from ggml_mul_mat_id
+                                                  //
+    void save_imatrix(const char * file_name) const;
+    void keep_imatrix(int ncall) const;
 };
 
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -117,6 +121,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                 if (m_last_call % m_params.n_output_frequency == 0) {
                     save_imatrix();
                 }
+                if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
+                    keep_imatrix(m_last_call);
+                }
             }
         }
     } else {
@@ -143,6 +150,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             if (m_last_call % m_params.n_output_frequency == 0) {
                 save_imatrix();
             }
+            if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
+                keep_imatrix(m_last_call);
+            }
         }
     }
 
@@ -150,7 +160,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 }
 
 void IMatrixCollector::save_imatrix() const {
-    const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
+    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
+}
+
+void IMatrixCollector::keep_imatrix(int ncall) const {
+    auto file_name = m_params.ofile;
+    if (file_name.empty()) file_name = "imatrix.dat";
+    file_name += ".at_";
+    file_name += std::to_string(ncall);
+    save_imatrix(file_name.c_str());
+}
+
+void IMatrixCollector::save_imatrix(const char * fname) const {
     std::ofstream out(fname, std::ios::binary);
     int n_entries = m_stats.size();
     out.write((const char*)&n_entries, sizeof(n_entries));
@@ -400,6 +421,8 @@ int main(int argc, char ** argv) {
             sparams.verbosity = std::stoi(argv[++iarg]);
         } else if (arg == "--no-ppl") {
             compute_ppl = false;
+        } else if (arg == "--keep-imatrix") {
+            sparams.keep_every = std::stoi(argv[++iarg]);
         } else {
             args.push_back(argv[iarg]);
         }

From f7276f7500f7ea588836dd1fc6f126334c517878 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sat, 13 Jan 2024 17:10:19 +0000
Subject: [PATCH 101/138] workflows: nix-ci: rebuild on flake.lock updates

---
 .github/workflows/nix-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
index 845b93bfb..4592b9607 100644
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -5,10 +5,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix', 'flake.lock']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix', 'flake.lock']
 
 jobs:
   nix-eval:

From f4dd059259d0234913b9e9780e1662811744c09d Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sat, 13 Jan 2024 17:16:54 +0000
Subject: [PATCH 102/138] workflows: nix-build-aarch64: rate limit

---
 .github/workflows/nix-ci-aarch64.yml | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml
index be7c26d40..0c6cf5f09 100644
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -2,13 +2,20 @@ name: Nix aarch64 builds
 
 on:
   workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
+    # 1.5h instead of minutes with the cold cache).
+    #
+    # randint(0, 59), randint(0, 23)
+    - cron: '26 12 * * *'
+  # But also rebuild if we touched any of the Nix expressions:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+    paths: ['**/*.nix', 'flake.lock']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+    paths: ['**/*.nix', 'flake.lock']
 
 jobs:
   nix-build-aarch64:

From fe8b3c0d4b0d806e8b46660e24eaf4b90b8b385f Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sat, 13 Jan 2024 17:38:32 +0000
Subject: [PATCH 103/138] workflows: nix-ci: drop the redundant "paths" filter

---
 .github/workflows/nix-ci.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
index 4592b9607..d19c7a576 100644
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -5,10 +5,8 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix', 'flake.lock']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix', 'flake.lock']
 
 jobs:
   nix-eval:

From 7251870780e2d572dd6f239d7a0bfe438c82fa74 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sat, 13 Jan 2024 17:45:01 +0000
Subject: [PATCH 104/138] nix: refactor the cleanSource rules

---
 .devops/nix/package.nix | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index e8534956f..c25d99f01 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -115,14 +115,22 @@ effectiveStdenv.mkDerivation (
     pname = "llama-cpp${pnameSuffix}";
     version = llamaVersion;
 
+    # Note: none of the files discarded here are visible in the sandbox or
+    # affect the output hash. This also means they can be modified without
+    # triggering a rebuild.
     src = lib.cleanSourceWith {
       filter =
         name: type:
-        !(builtins.any (_: _) [
+        let
+          noneOf = builtins.all (x: !x);
+          baseName = baseNameOf name;
+        in
+        noneOf [
           (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-          (name == "README.md") # Ignore *.md changes whe computing outPaths
-          (lib.hasPrefix "." name) # Skip hidden files and directories
-        ]);
+          (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
+          (lib.hasPrefix "." baseName) # Skip hidden files and directories
+          (baseName == "flake.lock")
+        ];
       src = lib.cleanSource ../../.;
     };
 

From 5e97ec91ae3038720a5b15cde4c52d2a53ec2137 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sun, 21 Jan 2024 03:15:13 +0000
Subject: [PATCH 105/138] nix: add a comment about makeScope

---
 .devops/nix/scope.nix | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix
index 7932ac1e8..d295995a4 100644
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -4,6 +4,10 @@
   llamaVersion ? "0.0.0",
 }:
 
+# We're using `makeScope` instead of just writing out an attrset
+# because it allows users to apply overlays later using `overrideScope'`.
+# Cf. https://noogle.dev/f/lib/makeScope
+
 lib.makeScope newScope (
   self: {
     inherit llamaVersion;

From 28603cd2833cedd4434f398d847f87fc83546dbb Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sun, 21 Jan 2024 03:29:38 +0000
Subject: [PATCH 106/138] nix: add a comment on the many nixpkgs-with-cuda
 instances

---
 .devops/nix/nixpkgs-instances.nix | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.devops/nix/nixpkgs-instances.nix b/.devops/nix/nixpkgs-instances.nix
index 6e9872b28..4a2f81c4b 100644
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -7,6 +7,18 @@
     { system, ... }:
     {
       _module.args = {
+        # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
+        # again, the below creates several nixpkgs instances which the
+        # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
+        #
+        # This is currently "slow" and "expensive", on a certain scale.
+        # This also isn't "right" in that this hinders dependency injection at
+        # the level of flake inputs. This might get removed in the foreseeable
+        # future.
+        #
+        # Note that you can use these expressions without Nix
+        # (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).
+
         pkgsCuda = import inputs.nixpkgs {
           inherit system;
           # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,

From b2d80e105a59b54822edf7ce7f3ed5f317e96e21 Mon Sep 17 00:00:00 2001
From: Someone Serge <sergei.kozlukov@aalto.fi>
Date: Sun, 21 Jan 2024 03:41:37 +0000
Subject: [PATCH 107/138] flake.nix: add a comment about flakes vs nix

---
 flake.nix | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/flake.nix b/flake.nix
index ec62c773a..a776ba024 100644
--- a/flake.nix
+++ b/flake.nix
@@ -1,3 +1,17 @@
+# The flake interface to llama.cpp's Nix expressions. The flake is used as a
+# more discoverable entry-point, as well as a way to pin the dependencies and
+# expose default outputs, including the outputs built by the CI.
+
+# For more serious applications involving some kind of customization  you may
+# want to consider consuming the overlay, or instantiating `llamaPackages`
+# directly:
+#
+# ```nix
+# pkgs.callPackage ${llama-cpp-root}/.devops/nix/scope.nix { }`
+# ```
+
+# Cf. https://jade.fyi/blog/flakes-arent-real/ for a more detailed exposition
+# of the relation between Nix and the Nix Flakes.
 {
   description = "Port of Facebook's LLaMA model in C/C++";
 

From 3ce7e8f8e7ccfce07e5947ac5f1f3f4628cf68ea Mon Sep 17 00:00:00 2001
From: XiaotaoChen <chenxiaotao1234@gmail.com>
Date: Mon, 22 Jan 2024 21:09:35 +0800
Subject: [PATCH 108/138] llava : MobileVLM support (#4954)

* MobileVLM native implementation

* delete depthwise_conv_2d and permute_cpy relative code, replace the two by the existed functions, and opt ldp definition, support LLAMA_PERF option for CMake

* move android script to example/llava directory

* Fix the editor config checks

---------

Co-authored-by: Chenxiaotao03 <chenxiaotao03@meituan.com>
---
 CMakeLists.txt                                |   7 +
 examples/llava/MobileVLM-README.md            | 131 ++++++
 examples/llava/android/adb_run.sh             |  53 +++
 examples/llava/android/build_64.sh            |   8 +
 examples/llava/clip.cpp                       | 391 +++++++++++++++++-
 .../llava/convert-image-encoder-to-gguf.py    |   6 +-
 ggml.c                                        | 141 ++++++-
 ggml.h                                        |  24 ++
 8 files changed, 737 insertions(+), 24 deletions(-)
 create mode 100644 examples/llava/MobileVLM-README.md
 create mode 100755 examples/llava/android/adb_run.sh
 create mode 100755 examples/llava/android/build_64.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b3b1396b..5a333ff52 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,6 +108,13 @@ option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STA
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)
 
+
+# add perf arguments
+option(LLAMA_PERF                            "llama: enable perf"                               OFF)
+if (LLAMA_PERF)
+    add_definitions(-DGGML_PERF)
+endif()
+
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 
diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
new file mode 100644
index 000000000..c6258eba6
--- /dev/null
+++ b/examples/llava/MobileVLM-README.md
@@ -0,0 +1,131 @@
+# MobileVLM
+
+Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
+
+for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
+
+The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
+
+## Usage
+Build with cmake or run `make llava-cli` to build it.
+
+After building, run: `./llava-cli` to see the usage. For example:
+
+```sh
+./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+    --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
+    --image path/to/an/image.jpg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
+```
+
+## Model conversion
+
+- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+
+```sh
+git clone https://huggingface.co/mtgv/MobileVLM-1.7B
+
+git clone https://huggingface.co/openai/clip-vit-large-patch14-336
+```
+
+2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+
+```sh
+python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
+```
+
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
+
+```sh
+python ./examples/llava/convert-image-encoder-to-gguf \
+    -m path/to/clip-vit-large-patch14-336 \
+    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
+    --output-dir path/to/MobileVLM-1.7B \
+    --projector-type ldp
+```
+
+4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+
+```sh
+python ./convert.py path/to/MobileVLM-1.7B
+```
+
+5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
+```sh
+./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+```
+
+Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
+
+## Android compile and run
+### compile
+refer to `examples/llava/android/build_64.sh`
+```sh
+mkdir examples/llava/android/build_64
+cd examples/llava/android/build_64
+../build_64.sh
+```
+### run on Android
+refer to `android/adb_run.sh`, modify resources' `name` and `path`
+
+## some result on Android with `Snapdragon 888` chip
+### case 1
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/demo.jpg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
+```
+**output**
+```sh
+encode_image_with_clip: image encoded in 21148.71 ms by CLIP (  146.87 ms per image patch)
+ Susan Wise Bauer
+llama_print_timings:        load time =   23574.72 ms
+llama_print_timings:      sample time =       1.24 ms /     6 runs   (    0.21 ms per token,  4850.44 tokens per second)
+llama_print_timings: prompt eval time =   12460.15 ms /   246 tokens (   50.65 ms per token,    19.74 tokens per second)
+llama_print_timings:        eval time =     424.86 ms /     6 runs   (   70.81 ms per token,    14.12 tokens per second)
+llama_print_timings:       total time =   34731.93 ms
+```
+### case 2
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/cat.jpeg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
+```
+
+**output**
+```sh
+encode_image_with_clip: image encoded in 21149.51 ms by CLIP (  146.87 ms per image patch)
+ The image depicts a cat sitting in the grass near some tall green plants.
+llama_print_timings:        load time =   23257.32 ms
+llama_print_timings:      sample time =       5.25 ms /    18 runs   (    0.29 ms per token,  3430.53 tokens per second)
+llama_print_timings: prompt eval time =   11900.73 ms /   232 tokens (   51.30 ms per token,    19.49 tokens per second)
+llama_print_timings:        eval time =    1279.03 ms /    18 runs   (   71.06 ms per token,    14.07 tokens per second)
+llama_print_timings:       total time =   34570.79 ms
+```
+
+## Minor shortcomings
+The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
+
+## TODO
+
+- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
+- [ ] Optimize LDP projector performance
+
+      - Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
+      - Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
+- [ ] run MobileVLM on `Jetson Orin`
+- [ ] Support more model variants, such as `MobileVLM-3B`.
+
+
+## contributor
+```sh
+zhangjidong05, yangyang260, huyiming03, chenxiaotao03
+```
diff --git a/examples/llava/android/adb_run.sh b/examples/llava/android/adb_run.sh
new file mode 100755
index 000000000..f73623ae3
--- /dev/null
+++ b/examples/llava/android/adb_run.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
+projector_name="mmproj-model-f16.gguf"
+llama_name="ggml-model-q4_k.gguf"
+img_dir="/Users/cxt/model/llm"
+img_name="demo.jpg"
+prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
+# img_name="cat.jpeg"
+# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
+
+program_dir="build_64/bin"
+binName="llava-cli"
+n_threads=4
+
+
+deviceDir="/data/local/tmp"
+saveDir="output"
+if [ ! -d ${saveDir} ]; then
+    mkdir ${saveDir}
+fi
+
+
+function android_run() {
+    # # copy resource into device
+    # adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
+    # adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
+    adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
+    # copy program into device
+    adb push ${program_dir}/${binName} ${deviceDir}/${binName}
+    adb shell "chmod 0777 ${deviceDir}/${binName}"
+
+    # run
+    adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
+                                                 -m ${deviceDir}/${llama_name} \
+                                                 --mmproj ${deviceDir}/${projector_name} \
+                                                 -t ${n_threads} \
+                                                 --image ${deviceDir}/${img_name} \
+                                                 -p \"${prompt}\" \
+                                                 > ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt"
+    adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
+                                                 -m ${deviceDir}/${llama_name} \
+                                                 --mmproj ${deviceDir}/${projector_name} \
+                                                 -t ${n_threads} \
+                                                 --image ${deviceDir}/${img_name} \
+                                                 -p \"${prompt}\" \
+                                                 >> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
+    adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir}
+}
+
+android_run
+
+echo "android_run is Done!"
diff --git a/examples/llava/android/build_64.sh b/examples/llava/android/build_64.sh
new file mode 100755
index 000000000..71b6fd3f7
--- /dev/null
+++ b/examples/llava/android/build_64.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+cmake ../../../../ \
+-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+-DCMAKE_BUILD_TYPE=Release \
+-DANDROID_ABI="arm64-v8a" \
+-DANDROID_PLATFORM=android-23 $1
+
+make -j4
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 2ae8853d3..6161fd858 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -12,6 +12,7 @@
 #include <regex>
 #include <stdexcept>
 #include <vector>
+#include <sstream>
 
 #include "clip.h"
 #include "ggml.h"
@@ -67,6 +68,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_PATCH_SIZE "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
 #define KEY_IMAGE_STD "clip.vision.image_std"
+#define KEY_PROJ_TYPE "clip.projector_type"
 
 //
 // tensor name constants
@@ -89,6 +91,21 @@ static std::string format(const char * fmt, ...) {
 #define TN_TEXT_PROJ "text_projection.weight"
 #define TN_VIS_PROJ "visual_projection.weight"
 #define TN_LLAVA_PROJ "mm.%d.%s"
+#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
+#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+
+
+enum projector_type {
+    PROJECTOR_TYPE_MLP,
+    PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_UNKNOWN,
+};
+
+static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
+    { PROJECTOR_TYPE_MLP,           "mlp"     },
+    { PROJECTOR_TYPE_LDP,          "ldp"    },
+};
+
 
 //
 // utilities to get data from a gguf file
@@ -129,6 +146,91 @@ static std::string get_ftype(int ftype) {
     return ggml_type_name(static_cast<ggml_type>(ftype));
 }
 
+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+        default:                return format("unknown type %d", type);
+    }
+}
+
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    std::string result;
+    for (size_t pos = 0; ; pos += search.length()) {
+        auto new_pos = s.find(search, pos);
+        if (new_pos == std::string::npos) {
+            result += s.substr(pos, s.size() - pos);
+            break;
+        }
+        result += s.substr(pos, new_pos - pos) + replace;
+        pos = new_pos;
+    }
+    s = std::move(result);
+}
+
+static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\");
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???";
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", ";
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+    }
+}
+
+static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
+    size_t tensor_size = ggml_nbytes(tensor);
+    printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%d, %d, %d, %d], type: %d\n",
+            prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
+            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->type);
+}
+
+static projector_type clip_projector_type_from_string(const std::string & name) {
+    for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
+        if (kv.second == name) {
+            return kv.first;
+        }
+    }
+    return PROJECTOR_TYPE_UNKNOWN;
+}
+
 //
 // image data
 //
@@ -205,6 +307,32 @@ struct clip_vision_model {
     struct ggml_tensor * mm_0_b;
     struct ggml_tensor * mm_2_w;
     struct ggml_tensor * mm_2_b;
+
+    // MobileVLM projection
+    struct ggml_tensor * mm_model_mlp_1_w;
+    struct ggml_tensor * mm_model_mlp_1_b;
+    struct ggml_tensor * mm_model_mlp_3_w;
+    struct ggml_tensor * mm_model_mlp_3_b;
+    struct ggml_tensor * mm_model_block_1_block_0_0_w;
+    struct ggml_tensor * mm_model_block_1_block_0_1_w;
+    struct ggml_tensor * mm_model_block_1_block_0_1_b;
+    struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
+    struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
+    struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
+    struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
+    struct ggml_tensor * mm_model_block_1_block_2_0_w;
+    struct ggml_tensor * mm_model_block_1_block_2_1_w;
+    struct ggml_tensor * mm_model_block_1_block_2_1_b;
+    struct ggml_tensor * mm_model_block_2_block_0_0_w;
+    struct ggml_tensor * mm_model_block_2_block_0_1_w;
+    struct ggml_tensor * mm_model_block_2_block_0_1_b;
+    struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
+    struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
+    struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
+    struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
+    struct ggml_tensor * mm_model_block_2_block_2_0_w;
+    struct ggml_tensor * mm_model_block_2_block_2_1_w;
+    struct ggml_tensor * mm_model_block_2_block_2_1_b;
 };
 
 struct clip_ctx {
@@ -213,6 +341,7 @@ struct clip_ctx {
     bool has_llava_projector = false;
 
     struct clip_vision_model vision_model;
+    projector_type proj_type = PROJECTOR_TYPE_MLP;
 
     float image_mean[3];
     float image_std[3];
@@ -430,16 +559,135 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             free(patches_data);
         }
 
+        // shape [1, 576, 1024]
+        // ne is whcn, ne = [1024, 576, 1, 1]
         embeddings = ggml_get_rows(ctx0, embeddings, patches);
 
-        // mm projection 0
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+        // print_tensor_info(embeddings, "embeddings");
 
-        embeddings = ggml_gelu(ctx0, embeddings);
+        // llava projector
+        if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
 
-        embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+            embeddings = ggml_gelu(ctx0, embeddings);
+
+            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+        }
+        else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+            // MobileVLM projector
+            int n_patch = 24;
+            struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
+            mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
+            mlp_1 = ggml_gelu(ctx0, mlp_1);
+            struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
+            mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
+            // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
+
+            // block 1
+            struct ggml_tensor * block_1 = nullptr;
+            {
+                // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
+                mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
+                mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
+                // stride = 1, padding = 1, bias is nullptr
+                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1);
+
+                // layer norm
+                // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+
+                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                // hardswish
+                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                // pointwise conv
+                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
+                block_1 = ggml_relu(ctx0, block_1);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
+                block_1 = ggml_hardsigmoid(ctx0, block_1);
+                // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
+                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                int w = block_1->ne[0], h = block_1->ne[1];
+                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+
+                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
+                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                // residual
+                block_1 = ggml_add(ctx0, mlp_3, block_1);
+            }
+
+            // block_2
+            {
+                // stride = 2
+                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, nullptr, 2, 2, 1, 1, 1, 1);
+
+                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                // layer norm
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                // hardswish
+                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                // not sure the parameters is right for globalAvgPooling
+                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                // pointwise conv
+                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
+                block_1 = ggml_relu(ctx0, block_1);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
+                block_1 = ggml_hardsigmoid(ctx0, block_1);
+
+                // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                int w = block_1->ne[0], h = block_1->ne[1];
+                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
+                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+
+                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
+                block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
+                // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
+            }
+            embeddings = block_1;
+        }
+        else {
+            GGML_ASSERT(false);
+        }
     }
 
     // build the graph
@@ -485,16 +733,55 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         printf("\n");
     }
     const int n_tensors = gguf_get_n_tensors(ctx);
+
     // kv
-    if (verbosity >= 3) {
-        const int n_kv = gguf_get_n_kv(ctx);
+    const int n_kv = gguf_get_n_kv(ctx);
+    printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+        __func__, n_kv, n_tensors, fname);
+    {
+        std::map<enum ggml_type, uint32_t> n_type;
 
-        for (int i = 0; i < n_kv; ++i) {
-            const char * key = gguf_get_key(ctx, i);
+        uint32_t n_type_max = 0;
+        enum ggml_type type_max = GGML_TYPE_F32;
 
-            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
+        for (int i = 0; i < n_tensors; i++) {
+            enum ggml_type type = gguf_get_tensor_type(ctx, i);
+
+            n_type[type]++;
+
+            if (n_type_max < n_type[type]) {
+                n_type_max = n_type[type];
+                type_max   = type;
+            }
+        }
+
+        printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        for (int i = 0; i < n_kv; i++) {
+            const char * name           = gguf_get_key(ctx, i);
+            const enum gguf_type type   = gguf_get_kv_type(ctx, i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
+                : gguf_type_name(type);
+
+            std::string value          = gguf_kv_to_str(ctx, i);
+            const size_t MAX_VALUE_LEN = 40;
+            if (value.size() > MAX_VALUE_LEN) {
+                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+            }
+            replace_all(value, "\n", "\\n");
+
+            printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+        }
+
+        // print type counts
+        for (auto & kv : n_type) {
+            if (kv.second == 0) {
+                continue;
+            }
+
+            printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
         }
-        printf("\n");
     }
 
     // data
@@ -503,20 +790,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name(ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);
+            enum ggml_type type = gguf_get_tensor_type(ctx, i);
             struct ggml_tensor * cur = ggml_get_tensor(meta, name);
             size_t tensor_size = ggml_nbytes(cur);
             buffer_size += tensor_size;
             if (verbosity >= 3) {
-                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
-                       ggml_n_dims(cur), cur->name, tensor_size, offset);
+                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%d, %d, %d, %d], type: %d\n", __func__, i,
+                       ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], type);
             }
         }
     }
 
+
+
     buffer_size += n_tensors * 128 /* CLIP PADDING */;
 
     clip_ctx * new_clip = new clip_ctx;
 
+    // update projector type
+    {
+        int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
+        if (idx != -1) {
+            const std::string proj_type = gguf_get_val_str(ctx, idx);
+            new_clip->proj_type = clip_projector_type_from_string(proj_type);
+        }
+        else {
+            new_clip->proj_type = PROJECTOR_TYPE_MLP;
+        }
+    }
+
 #ifdef GGML_USE_CUBLAS
     new_clip->backend = ggml_backend_cuda_init(0);
     printf("%s: CLIP using CUDA backend\n", __func__);
@@ -661,10 +963,45 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
         vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
         vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-        vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
-        vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-        vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-        vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+
+        // LLaVA projection
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+            vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+            vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+            vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+            vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+        }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
+            // MobileVLM projection
+            vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
+            vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
+            vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
+            vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
+            vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
+            vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
+            vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
+            vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
+            vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
+            vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
+            vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
+            vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
+            vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
+            vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
+            vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
+            vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
+            vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
+            vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
+            vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
+            vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
+            vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
+            vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
+            vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
+            vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
+        }
+        else {
+            std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
+            throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
+        }
 
         vision_model.layers.resize(hparams.n_layer);
         for (int il = 0; il < hparams.n_layer; ++il) {
@@ -1100,13 +1437,25 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
 }
 
 int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
-    return ctx->vision_model.mm_2_b->ne[0];
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+        return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
+    }
+    else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+        return ctx->vision_model.mm_2_b->ne[0];
+    }
+    else {
+        std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
+        throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
+    }
 }
 
 int clip_n_patches(const struct clip_ctx * ctx) {
     auto & params = ctx->vision_model.hparams;
-
-    return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+        n_patches /= 4;
+    }
+    return n_patches;
 }
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py
index 03688e0ea..f5a3c9b46 100644
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False,
 ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
                 help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
     fout.add_description("vision-only CLIP model")
 elif has_llava_projector:
     fout.add_description("image encoder for LLaVA")
+    # add projector type
+    fout.add_string("clip.projector_type", args.projector_type)
 else:
     fout.add_description("two-tower CLIP model")
 
@@ -218,7 +221,8 @@ if has_llava_projector:
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
         name = get_tensor_name(name)
-        if data.ndim == 2:
+        # pw and dw conv ndim==4
+        if data.ndim == 2 or data.ndim == 4:
             data = data.squeeze().numpy().astype(np.float16)
         else:
             data = data.squeeze().numpy().astype(np.float32)
diff --git a/ggml.c b/ggml.c
index cbf2d4bdd..a7a88e382 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1418,6 +1418,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+// TODO: optimize performance
+inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 
 static const float GELU_COEF_A     = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1776,9 +1779,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "GELU",
     "GELU_QUICK",
     "SILU",
+    "HARDSWISH",
+    "HARDSIGMOID",
 };
 
-static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
+static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
 
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -3945,6 +3950,20 @@ struct ggml_tensor * ggml_silu_back(
     return result;
 }
 
+// ggml hardswish
+struct ggml_tensor * ggml_hardswish(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
+}
+
+// ggml hardsigmoid
+struct ggml_tensor * ggml_hardsigmoid(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
+}
+
 // ggml_norm
 
 static struct ggml_tensor * ggml_norm_impl(
@@ -5344,6 +5363,33 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
     return result;
 }
 
+// ggml_conv_depthwise
+struct ggml_tensor * ggml_conv_depthwise_2d(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
+    struct ggml_tensor * c,
+    int                  s0,
+    int                  s1,
+    int                  p0,
+    int                  p1,
+    int                  d0,
+    int                  d1) {
+
+    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
+    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
+                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
+                                        s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
+
+    struct ggml_tensor * result =
+        ggml_mul_mat(ctx,
+                ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1),                       // [OC，1, KH, KW] => [1, OC, 1, KH * KW]
+                ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
+
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
+
+    return result;
+}
 // ggml_conv_2d
 
 // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@@ -9333,6 +9379,87 @@ static void ggml_compute_forward_silu_back(
     }
 }
 
+
+static void ggml_compute_forward_hardswish_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_hardswish_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+static void ggml_compute_forward_hardswish(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_hardswish_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+static void ggml_compute_forward_hardsigmoid_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_hardsigmoid_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_hardsigmoid(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+
 // ggml_compute_forward_norm
 
 static void ggml_compute_forward_norm_f32(
@@ -12349,6 +12476,7 @@ static void ggml_compute_forward_im2col(
     }
 }
 
+
 // ggml_compute_forward_conv_transpose_2d
 
 static void ggml_compute_forward_conv_transpose_2d(
@@ -13917,6 +14045,14 @@ static void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_silu(params, src0, dst);
             } break;
+        case GGML_UNARY_OP_HARDSWISH:
+            {
+                ggml_compute_forward_hardswish(params, src0, dst);
+            } break;
+        case GGML_UNARY_OP_HARDSIGMOID:
+            {
+                ggml_compute_forward_hardsigmoid(params, src0, dst);
+            } break;
         default:
             {
                 GGML_ASSERT(false);
@@ -16330,6 +16466,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_ELU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
+                case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
                     {
                         n_tasks = 1;
                     } break;
@@ -16562,7 +16700,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             // distribute new work or execute it direct if 1T
             while (++node_n < cgraph->n_nodes) {
                 GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
-
                 struct ggml_tensor * node = cgraph->nodes[node_n];
                 const int n_tasks = ggml_get_n_tasks(node, n_threads);
 
diff --git a/ggml.h b/ggml.h
index de8162b81..dca7bd9ce 100644
--- a/ggml.h
+++ b/ggml.h
@@ -489,6 +489,8 @@ extern "C" {
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
+        GGML_UNARY_OP_HARDSWISH,
+        GGML_UNARY_OP_HARDSIGMOID,
 
         GGML_UNARY_OP_COUNT,
     };
@@ -1032,6 +1034,16 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // hardswish(x) = x * relu6(x + 3) / 6
+    GGML_API struct ggml_tensor * ggml_hardswish(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // hardsigmoid(x) = relu6(x + 3) / 6
+    GGML_API struct ggml_tensor * ggml_hardsigmoid(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
@@ -1483,6 +1495,18 @@ extern "C" {
             int                  d1,
             bool                 is_2D);
 
+    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            int                  s0,
+            int                  s1,
+            int                  p0,
+            int                  p1,
+            int                  d0,
+            int                  d1);
+
     GGML_API struct ggml_tensor * ggml_conv_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,

From 780e24a22eb595b705cbe8284771e9ceff1c4dd2 Mon Sep 17 00:00:00 2001
From: Reinforce-II <fate@eastal.com>
Date: Mon, 22 Jan 2024 21:15:08 +0800
Subject: [PATCH 109/138] ggml : parallelize FP32 conversion when using BLAS
 (#5045)

* make GGML_TASK_INIT phase can be run in multithread

* multithreaded dequantize in mul_mat when using blas library

* minor fixes

* update outdated comment
* fix coding style

* simplify code

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ggml.c | 198 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 150 insertions(+), 48 deletions(-)

diff --git a/ggml.c b/ggml.c
index a7a88e382..f85045c9c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -7810,6 +7810,9 @@ static void ggml_compute_forward_acc_f32(
     bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
@@ -9952,11 +9955,30 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        if (params->ith != 0) {
-            return;
-        }
+        const int64_t ne_plane      = ne01*ne00;
+        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        UNUSED(desired_wsize);
 
         if (params->type == GGML_TASK_INIT) {
+            if (type != GGML_TYPE_F32) {
+                assert(params->wsize >= desired_wsize);
+                // parallelize by src0 rows
+                for (int64_t i13 = 0; i13 < ne13; i13++) {
+                    for (int64_t i12 = 0; i12 < ne12; i12++) {
+                        // broadcast src0 into src1 across 2nd,3rd dimension
+                        const int64_t i03 = i13/r3;
+                        const int64_t i02 = i12/r2;
+
+                        const void           *       x        = (char *)  src0->data    + i02*nb02          + i03*nb03;
+                              float          * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                              ggml_to_float_t  const to_float = type_traits[type].to_float;
+
+                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+                        }
+                    }
+                }
+            }
             return;
         }
 
@@ -9964,9 +9986,14 @@ static void ggml_compute_forward_mul_mat(
             return;
         }
 
+        // perform sgemm, parallelization controlled by blas lib
+        if (ith != 0) {
+            return;
+        }
+
+        const int64_t tgemm0 = ggml_perf_time_us();
         for (int64_t i13 = 0; i13 < ne13; i13++) {
             for (int64_t i12 = 0; i12 < ne12; i12++) {
-                // broadcast src0 into src1 across 2nd,3rd dimension
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
@@ -9975,17 +10002,7 @@ static void ggml_compute_forward_mul_mat(
                       float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
-                            float * const wdata    = params->wdata;
-                    ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
-                        id += ne00;
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                    x = wdata;
+                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -9995,6 +10012,7 @@ static void ggml_compute_forward_mul_mat(
                          0.0f,    d, ne01);
             }
         }
+        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
 
         //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
@@ -10003,6 +10021,9 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10167,6 +10188,9 @@ static void ggml_compute_forward_mul_mat_id(
     #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
 
    if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         char * wdata = params->wdata;
         if (src1->type != vec_dot_type) {
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10352,6 +10376,9 @@ static void ggml_compute_forward_out_prod_f32(
             return;
         }
 #endif
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10535,6 +10562,9 @@ static void ggml_compute_forward_out_prod_q_f32(
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10719,6 +10749,9 @@ static void ggml_compute_forward_set_f32(
     bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
@@ -11043,6 +11076,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
 
@@ -11077,6 +11113,9 @@ static void ggml_compute_forward_get_rows_back_f32(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
 
@@ -11214,6 +11253,9 @@ static void ggml_compute_forward_diag_mask_f32(
     GGML_ASSERT(n_past >= 0);
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -12184,6 +12226,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12278,6 +12323,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12502,6 +12550,9 @@ static void ggml_compute_forward_conv_transpose_2d(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@@ -14116,6 +14167,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
 
     const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
     if (!inplace && params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
         return;
     }
@@ -16409,8 +16463,9 @@ struct ggml_compute_state_shared {
     const int n_threads;
 
     // synchronization primitives
-    atomic_int n_active; // num active threads
-    atomic_int node_n;   // active graph node
+    atomic_int n_active;  // num active threads
+    atomic_int node_n;    // active graph node
+    atomic_int node_task; // active graph node task phase
 
     bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
     void * abort_callback_data;
@@ -16658,6 +16713,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
     return n_tasks;
 }
 
+static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_node_n = * node_n;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * node_n = atomic_load(&state->shared->node_n);
+        if (* node_n != last_node_n) break;
+    }
+}
+
+static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_task_phase = * task_phase;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * task_phase = atomic_load(&state->shared->node_task);
+        if (* task_phase != last_task_phase) break;
+    }
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
@@ -16668,7 +16751,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     set_numa_thread_affinity(state->ith, n_threads);
 
-    int node_n = -1;
+    int node_n     = -1;
+    int task_phase = GGML_TASK_FINALIZE;
 
     while (true) {
         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -16708,13 +16792,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
                 params.nth = n_tasks;
 
-                /* INIT */
-                if (GGML_OP_HAS_INIT[node->op]) {
-                    params.type = GGML_TASK_INIT;
-                    ggml_compute_forward(&params, node);
-                }
-
                 if (n_tasks == 1) {
+                    /* INIT */
+                    if (GGML_OP_HAS_INIT[node->op]) {
+                        params.type = GGML_TASK_INIT;
+                        ggml_compute_forward(&params, node);
+                    }
+
                     // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                     // they do something more efficient than spinning (?)
                     params.type = GGML_TASK_COMPUTE;
@@ -16735,38 +16819,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 }
             }
 
-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_n,   node_n);
+            task_phase = GGML_TASK_INIT;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_n,    node_n);
+            atomic_store(&state->shared->node_task, task_phase);
         } else {
-            // wait for other threads to finish
-            const int last = node_n;
-
-            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
-
-            while (true) {
-                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-                //       depending on the workload and the operating system.
-                //       since it is not clear what is the best approach, it should potentially become user-configurable
-                //       ref: https://github.com/ggerganov/ggml/issues/291
-                // UPD:  adding the do_yield flag seems to resolve the issue universally
-                if (do_yield) {
-                    sched_yield();
-                }
-
-                node_n = atomic_load(&state->shared->node_n);
-                if (node_n != last) break;
-            };
+            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
         }
 
         // check if we should stop
         if (node_n >= cgraph->n_nodes) break;
 
-        /* COMPUTE */
+        /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
         const int n_tasks = ggml_get_n_tasks(node, n_threads);
 
         struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_COMPUTE,
+            /*.type  =*/ GGML_TASK_INIT,
             /*.ith   =*/ state->ith,
             /*.nth   =*/ n_tasks,
             /*.wsize =*/ cplan->work_size,
@@ -16774,8 +16844,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         };
 
         if (state->ith < n_tasks) {
+            if (GGML_OP_HAS_INIT[node->op]) {
+                ggml_compute_forward(&params, node);
+            }
+        }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_COMPUTE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+            //       depending on the workload and the operating system.
+            //       since it is not clear what is the best approach, it should potentially become user-configurable
+            //       ref: https://github.com/ggerganov/ggml/issues/291
+            // UPD:  adding the do_yield flag seems to resolve the issue universally
+            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
+            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
+        }
+
+        if (state->ith < n_tasks) {
+            params.type = GGML_TASK_COMPUTE;
             ggml_compute_forward(&params, node);
         }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_FINALIZE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+        }
     }
 
     return GGML_EXIT_SUCCESS;
@@ -16832,8 +16933,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                     if (ggml_compute_forward_mul_mat_use_blas(node)) {
                         if (node->src[0]->type != GGML_TYPE_F32) {
-                            // here we need memory just for single 2D matrix from src0
-                            cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                            // here we need memory for fully dequantized matrix from src0
+                            cur = ggml_type_size(GGML_TYPE_F32)*ggml_nelements(node->src[0]);
                         }
                     } else
 #endif
@@ -16987,6 +17088,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.n_threads               =*/ n_threads,
         /*.n_active                =*/ n_threads,
         /*.node_n                  =*/ -1,
+        /*.node_task               =*/ GGML_TASK_FINALIZE,
         /*.abort_callback          =*/ NULL,
         /*.abort_callback_data     =*/ NULL,
     };

From 6f9939d119b2d004c264952eb510bd106455531e Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Mon, 22 Jan 2024 16:10:14 +0200
Subject: [PATCH 110/138] KL-divergence (#5076)

* kl-divergence: be able to save all logits to a file

* Add ability to compute KL-divergence

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 common/common.cpp                  |   9 +
 common/common.h                    |   3 +
 examples/perplexity/perplexity.cpp | 319 ++++++++++++++++++++++++++++-
 3 files changed, 329 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 0e4b8bab2..0a7096171 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -672,6 +672,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             if (params.logdir.back() != DIRECTORY_SEPARATOR) {
                 params.logdir += DIRECTORY_SEPARATOR;
             }
+        } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.logits_file = argv[i];
         } else if (arg == "--perplexity" || arg == "--all-logits") {
             params.logits_all = true;
         } else if (arg == "--ppl-stride") {
@@ -716,6 +722,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.multiple_choice_tasks = std::stoi(argv[i]);
+        } else if (arg == "--kl-divergence") {
+            params.kl_divergence = true;
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
@@ -967,6 +975,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
     printf("  --multiple-choice     compute multiple choice score over random tasks from datafile supplied with -f\n");
     printf("  --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
+    printf("  --kl-divergence       computes KL-divergence to logits provided via --kl-divergence-base");
     printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
     printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
diff --git a/common/common.h b/common/common.h
index c69ad7e94..214a379b5 100644
--- a/common/common.h
+++ b/common/common.h
@@ -91,6 +91,7 @@ struct gpt_params {
     std::string input_suffix      = "";  // string to suffix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
     std::string logdir            = "";  // directory in which to save YAML log files
+    std::string logits_file       = "";  // file for saving *all* logits
 
     std::vector<llama_model_kv_override> kv_overrides;
 
@@ -111,6 +112,8 @@ struct gpt_params {
     bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
     size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
 
+    bool   kl_divergence   = false; // compute KL-divergence
+
     bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
     bool random_prompt     = false; // do not randomize prompt if none provided
     bool use_color         = false; // use color to distinguish generations and inputs
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b7ef9a084..1b7f85f49 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -112,6 +112,43 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
     return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
 }
 
+static inline int nearest_int(float fval) {
+    //assert(fval <= 4194303.f);
+    float val = fval + 12582912.f;
+    int i; memcpy(&i, &val, sizeof(int));
+    return (i & 0x007fffff) - 0x00400000;
+}
+
+static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
+    float max_logit = logits[0];
+    float min_logit = logits[0];
+    for (int i = 1; i < n_vocab; ++i) {
+        max_logit = std::max(max_logit, logits[i]);
+        min_logit = std::min(min_logit, logits[i]);
+    }
+    min_logit = std::max(min_logit, max_logit - 16);
+    double sum_exp = 0.0;
+    for (int i = 0; i < n_vocab; ++i) {
+        sum_exp += expf(logits[i] - max_logit);
+    }
+    const float log_sum_exp = log(sum_exp);
+    const float min_log_prob = min_logit - max_logit - log_sum_exp;
+    const float scale = (max_logit - min_logit)/65535.f;
+    float * d = (float *)log_prob;
+    d[0] = scale;
+    d[1] = min_log_prob;
+    log_prob += 4;
+    if (scale) {
+        const float inv_scale = 1/scale;
+        for (int i = 0; i < n_vocab; ++i) {
+            log_prob[i] = logits[i] > min_logit ? nearest_int(inv_scale*(logits[i] - min_logit)) : 0;
+        }
+    } else {
+        std::memset(log_prob, 0, n_vocab*sizeof(uint16_t));
+    }
+    return max_logit + log_sum_exp - logits[tok];
+}
+
 static void process_logits(
     int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
     double & nll, double & nll2, float * logit_history, float * prob_history
@@ -147,6 +184,114 @@ static void process_logits(
     }
 }
 
+static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token,
+        std::vector<std::thread> & workers, std::vector<uint16_t> & log_probs, double & nll, double & nll2) {
+    std::mutex mutex;
+    const int nv = 2*((n_vocab + 1)/2) + 4;
+    int counter = 0;
+    auto compute = [&mutex, &counter, &log_probs, &nll, &nll2, n_vocab, logits, tokens, n_token, nv] () {
+        double local_nll  = 0;
+        double local_nll2 = 0;
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex);
+            int i = counter++;
+            if (i >= n_token) {
+                nll += local_nll; nll2 += local_nll2;
+                break;
+            }
+            lock.unlock();
+            const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
+            local_nll += v;
+            local_nll2 += v*v;
+        }
+    };
+    for (auto & w : workers) {
+        w = std::thread(compute);
+    }
+    compute();
+    for (auto & w : workers) {
+        w.join();
+    }
+    out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t));
+}
+
+struct kl_divergence_result {
+    double sum_nll  = 0;
+    double sum_nll2 = 0;
+    double sum_kld  = 0;
+    double sum_kld2 = 0;
+    double sum_nll_diff  = 0;
+    double sum_nll_diff2 = 0;
+    size_t count = 0;
+};
+
+static void log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
+    float max_logit = logits[0];
+    for (int i = 1; i < n_vocab; ++i) {
+        max_logit = std::max(max_logit, logits[i]);
+    }
+    double sum_exp = 0.0;
+    for (int i = 0; i < n_vocab; ++i) {
+        sum_exp += expf(logits[i] - max_logit);
+    }
+    const float log_sum_exp = log(sum_exp);
+    const float * d = (const float *)base_log_prob;
+    const float scale = d[0];
+    const float min_log_prob = d[1];
+    base_log_prob += 4;
+    float nll = max_logit + log_sum_exp - logits[tok];
+    kld.sum_nll  += nll;
+    kld.sum_nll2 += nll*nll;
+    nll += (scale*base_log_prob[tok] + min_log_prob);
+    kld.sum_nll_diff  += nll;
+    kld.sum_nll_diff2 += nll*nll;
+    max_logit += log_sum_exp;
+    double sum = 0;
+    for (int i = 0; i < n_vocab; ++i) {
+        const float p_log_base = scale*base_log_prob[i] + min_log_prob;
+        if (p_log_base > -16.f) {
+            const float p_base = expf(p_log_base);
+            sum += p_base * (p_log_base - logits[i] + max_logit);
+        }
+    }
+    kld.sum_kld  += sum;
+    kld.sum_kld2 += sum*sum;
+    ++kld.count;
+}
+
+static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
+        std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld) {
+    std::mutex mutex;
+    const int nv = 2*((n_vocab + 1)/2) + 4;
+    int counter = 0;
+    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv] () {
+        kl_divergence_result local_kld;
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex);
+            int i = counter++;
+            if (i >= n_token) {
+                kld.sum_nll  += local_kld.sum_nll;
+                kld.sum_nll2 += local_kld.sum_nll2;
+                kld.sum_kld  += local_kld.sum_kld;
+                kld.sum_kld2 += local_kld.sum_kld2;
+                kld.sum_nll_diff  += local_kld.sum_nll_diff;
+                kld.sum_nll_diff2 += local_kld.sum_nll_diff2;
+                kld.count += local_kld.count;
+                break;
+            }
+            lock.unlock();
+            log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+        }
+    };
+    for (auto & w : workers) {
+        w = std::thread(compute);
+    }
+    compute();
+    for (auto & w : workers) {
+        w.join();
+    }
+}
+
 static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
     // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
@@ -294,6 +439,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
 
+    std::ofstream logits_stream;
+    if (!params.logits_file.empty()) {
+        logits_stream.open(params.logits_file.c_str());
+        if (!logits_stream.is_open()) {
+            fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
+            return {};
+        }
+        fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
+        logits_stream.write("_logits_", 8);
+        logits_stream.write((const char *)&n_ctx, sizeof(n_ctx));
+    }
+
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
@@ -336,6 +493,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
+    std::vector<uint16_t> log_probs;
+    if (!params.logits_file.empty()) {
+        logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
+        logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
+        logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
+        const int nv = 2*((n_vocab + 1)/2) + 4;
+        log_probs.resize(n_ctx * nv);
+    }
+
     for (int i = 0; i < n_chunk; ++i) {
         const int start =     i * n_ctx;
         const int end   = start + n_ctx;
@@ -398,8 +564,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         // process the entire prompt.
         const int first = n_ctx/2;
         const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
+        if (!params.logits_file.empty()) {
+            process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+                    workers, log_probs, nll, nll2);
+        } else {
+            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
+        }
         count += n_ctx - first - 1;
 
         // perplexity is e^(average negative log-likelihood)
@@ -1414,6 +1585,148 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
     printf("\n");
 }
 
+static void kl_divergence(llama_context * ctx, const gpt_params & params) {
+    if (params.logits_file.empty()) {
+        fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
+        return;
+    }
+    std::ifstream in(params.logits_file.c_str(), std::ios::binary);
+    if (!in) {
+        fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
+        return;
+    }
+    {
+        char check[9]; check[8] = 0;
+        in.read(check, 8);
+        if (in.fail() || strncmp("_logits_", check, 8) != 0) {
+            fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
+            return;
+        }
+    }
+
+    uint32_t n_ctx;
+    in.read((char *)&n_ctx, sizeof(n_ctx));
+    if (n_ctx > llama_n_ctx(ctx)) {
+        fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
+                __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
+    }
+
+    int n_vocab, n_chunk;
+    in.read((char *)&n_vocab, sizeof(n_vocab));
+    in.read((char *)&n_chunk, sizeof(n_chunk));
+    if (in.fail()) {
+        fprintf(stderr, "%s: failed rwading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
+        return;
+    }
+    if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
+        fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+    }
+
+    std::vector<llama_token> tokens(n_ctx * n_chunk);
+    if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
+        fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
+        return;
+    }
+
+    const int n_batch = params.n_batch;
+    const int num_batches = (n_ctx + n_batch - 1)/n_batch;
+    const int nv = 2*((n_vocab + 1)/2) + 4;
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+    std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
+    std::vector<float> logits;
+    if (num_batches > 1) {
+        logits.reserve(n_ctx * n_vocab);
+    }
+
+    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
+
+    auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) {
+        if (count < 1) {
+            return std::make_pair(0., 0.);
+        }
+        double f = sum/count;
+        double df = sum2/count - f*f;
+        df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
+        return std::make_pair(f, df);
+    };
+
+    kl_divergence_result kld;
+
+    for (int i = 0; i < n_chunk; ++i) {
+        const int start =     i * n_ctx;
+        const int end   = start + n_ctx;
+
+        const auto t_start = std::chrono::high_resolution_clock::now();
+
+        if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
+            fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
+            return;
+        }
+
+        // clear the KV cache
+        llama_kv_cache_clear(ctx);
+
+        for (int j = 0; j < num_batches; ++j) {
+            const int batch_start = start + j * n_batch;
+            const int batch_size  = std::min(end - batch_start, n_batch);
+
+            // save original token and restore it after eval
+            const auto token_org = tokens[batch_start];
+
+            // add BOS token for the first batch of each chunk
+            if (add_bos && j == 0) {
+                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+            }
+
+            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+
+            // restore the original token in case it was set to BOS
+            tokens[batch_start] = token_org;
+
+            if (num_batches > 1) {
+                const auto * batch_logits = llama_get_logits(ctx);
+                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            }
+        }
+
+        const auto t_end = std::chrono::high_resolution_clock::now();
+
+        if (i == 0) {
+            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            int total_seconds = (int)(t_total * n_chunk);
+            if (total_seconds >= 60*60) {
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                total_seconds = total_seconds % (60*60);
+            }
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+
+            printf("\nchunk        PPL          ln(PPL(Q)/PPL(base))          KL-Divergence\n");
+        }
+
+        const int first = n_ctx/2;
+        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+                workers, log_probs_uint16, kld);
+
+        auto ppl           = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
+        auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
+        auto kl_div        = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
+
+        printf("%4d    %10.4lf    %10.5lf ± %10.5f    %10.5f ± %10.5lf\n", i+1, exp(ppl.first),
+                log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second);
+
+        fflush(stdout);
+
+        logits.clear();
+    }
+    printf("\n");
+
+}
 
 int main(int argc, char ** argv) {
     gpt_params params;
@@ -1476,6 +1789,8 @@ int main(int argc, char ** argv) {
         winogrande_score(ctx, params);
     } else if (params.multiple_choice) {
         multiple_choice_score(ctx, params);
+    } else if (params.kl_divergence) {
+        kl_divergence(ctx, params);
     } else {
         results = perplexity(ctx, params);
     }

From 011e8ec577fd135cbc02993d3ea9840c516d6a1c Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Mon, 22 Jan 2024 23:42:41 +0100
Subject: [PATCH 111/138] llama : fix not enough space in buffer with Qwen
 (#5086)

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 8c906a22f..f6f1ec0f4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4440,9 +4440,9 @@ static struct ggml_tensor * llm_build_kv(
 
     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
+    ggml_build_forward_expand(graph, q_cur);
     ggml_build_forward_expand(graph, k_cur);
     ggml_build_forward_expand(graph, v_cur);
-    ggml_build_forward_expand(graph, q_cur);
 
     llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
 

From 125d03a5036a02a983c8e98c2cdc126e061afb8e Mon Sep 17 00:00:00 2001
From: Michael Coppola <m18coppola@gmail.com>
Date: Tue, 23 Jan 2024 01:51:27 -0500
Subject: [PATCH 112/138] llama.vim : added api key support (#5090)

Co-authored-by: Michael Coppola <info@michaeljcoppola.com>
---
 examples/llama.vim | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index f03fadfb7..1b5ad6ba0 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -6,7 +6,7 @@
 " Similarly, you could add an insert mode keybind with
 " inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
 "
-" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
+" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
 " let g:llama_api_url = "192.168.1.10:8080"
 " llama_overrides can also be set through buffer/window scopes. For instance
 " autocmd filetype python let b:llama_overrides = {"temp": 0.2}
@@ -82,6 +82,9 @@ func llama#doLlamaGen()
    endif
    let l:querydata.prompt = join(l:buflines, "\n")
    let l:curlcommand = copy(s:curlcommand)
+   if exists("g:llama_api_key")
+       call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
+   endif
    let l:curlcommand[2] = json_encode(l:querydata)
    let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
 endfunction

From 2bed4aa3f37cb4e39e16e9ec7b595a7738fd5faf Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 23 Jan 2024 08:11:39 +0100
Subject: [PATCH 113/138] devops : add intel oneapi dockerfile (#5068)

Co-authored-by: Xuan Son Nguyen <xuanson.nguyen@snowpack.eu>
---
 .devops/main-intel.Dockerfile | 26 ++++++++++++++++++++++++++
 .github/workflows/docker.yml  |  1 +
 CMakeLists.txt                |  5 +++++
 3 files changed, 32 insertions(+)
 create mode 100644 .devops/main-intel.Dockerfile

diff --git a/.devops/main-intel.Dockerfile b/.devops/main-intel.Dockerfile
new file mode 100644
index 000000000..e1e6acc24
--- /dev/null
+++ b/.devops/main-intel.Dockerfile
@@ -0,0 +1,26 @@
+ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
+ARG UBUNTU_VERSION=22.04
+
+FROM intel/hpckit:$ONEAPI_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
+RUN mkdir build && \
+    cd build && \
+    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
+    cmake --build . --config Release --target main server
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/build/bin/main /main
+COPY --from=build /app/build/bin/server /server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/main" ]
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 87904b75e..825b8f503 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -35,6 +35,7 @@ jobs:
           - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
     steps:
       - name: Check out the repo
         uses: actions/checkout@v3
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5a333ff52..af3665129 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -478,6 +478,11 @@ function(get_flags CCID CCVER)
         if (CCVER VERSION_GREATER_EQUAL 8.1.0)
             set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
         endif()
+    elseif (CCID MATCHES "Intel")
+        # enable max optimization level when using Intel compiler
+        set(C_FLAGS   -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+        set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+        add_link_options(-fuse-ld=lld -static-intel)
     endif()
 
     set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)

From 89758723c75ba594e401f6513751beeba7ca1d28 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 23 Jan 2024 14:12:57 +0200
Subject: [PATCH 114/138] minor : clean-up some warnings and style (#5094)

* minor : clean-up some warnings and style

ggml-ci

* ggml : add comment
---
 common/common.cpp                  | 10 +++----
 examples/llava/clip.cpp            | 47 ++++++++++++------------------
 examples/perplexity/perplexity.cpp |  4 +--
 ggml.c                             |  9 +++---
 ggml.h                             |  1 -
 llama.cpp                          | 24 +++++++--------
 6 files changed, 42 insertions(+), 53 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 0a7096171..6b07f1197 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -216,12 +216,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             }
             // store the external file name in params
             params.prompt_file = argv[i];
-            file.seekg(0, std::ios::end);
-            size_t size = file.tellg();
-            file.seekg(0, std::ios::beg);
-            params.prompt.resize(size);
-            file.read((char *)params.prompt.data(), size);
-            fprintf(stderr, "Read %zu bytes from binary file %s\n", size, argv[i]);
+            std::ostringstream ss;
+            ss << file.rdbuf();
+            params.prompt = ss.str();
+            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
         } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 6161fd858..4a0338a37 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2,18 +2,6 @@
 // so there might be still unnecessary artifacts hanging around
 // I'll gradually clean and extend it
 
-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <map>
-#include <regex>
-#include <stdexcept>
-#include <vector>
-#include <sstream>
-
 #include "clip.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -30,6 +18,19 @@
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
 
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <regex>
+#include <stdexcept>
+#include <vector>
+#include <sstream>
+#include <cinttypes>
+
 static std::string format(const char * fmt, ...) {
     va_list ap;
     va_list ap2;
@@ -217,9 +218,9 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
 
 static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
     size_t tensor_size = ggml_nbytes(tensor);
-    printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%d, %d, %d, %d], type: %d\n",
+    printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
             prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
-            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->type);
+            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
 
 static projector_type clip_projector_type_from_string(const std::string & name) {
@@ -592,7 +593,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
                 mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
                 // stride = 1, padding = 1, bias is nullptr
-                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1);
+                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
 
                 // layer norm
                 // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
@@ -640,7 +641,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             // block_2
             {
                 // stride = 2
-                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, nullptr, 2, 2, 1, 1, 1, 1);
+                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
 
                 // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                 // layer norm
@@ -741,18 +742,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     {
         std::map<enum ggml_type, uint32_t> n_type;
 
-        uint32_t n_type_max = 0;
-        enum ggml_type type_max = GGML_TYPE_F32;
-
         for (int i = 0; i < n_tensors; i++) {
             enum ggml_type type = gguf_get_tensor_type(ctx, i);
 
             n_type[type]++;
-
-            if (n_type_max < n_type[type]) {
-                n_type_max = n_type[type];
-                type_max   = type;
-            }
         }
 
         printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
@@ -795,14 +788,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             size_t tensor_size = ggml_nbytes(cur);
             buffer_size += tensor_size;
             if (verbosity >= 3) {
-                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%d, %d, %d, %d], type: %d\n", __func__, i,
-                       ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], type);
+                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                       __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
             }
         }
     }
 
-
-
     buffer_size += n_tensors * 128 /* CLIP PADDING */;
 
     clip_ctx * new_clip = new clip_ctx;
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 1b7f85f49..de6d3eb41 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1202,11 +1202,11 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
 }
 
-static bool deserialize_string(std::istream& in, std::string& str) {
+static bool deserialize_string(std::istream & in, std::string & str) {
     uint32_t size;
     if (!in.read((char *)&size, sizeof(size)).fail()) {
         str.resize(size);
-        if (!in.read((char *)str.data(), size).fail()) return true;
+        if (!in.read((char *)&str[0], size).fail()) return true;
     }
     return false;
 }
diff --git a/ggml.c b/ggml.c
index f85045c9c..ca98fde8a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5368,14 +5368,12 @@ struct ggml_tensor * ggml_conv_depthwise_2d(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     struct ggml_tensor * b,
-    struct ggml_tensor * c,
     int                  s0,
     int                  s1,
     int                  p0,
     int                  p1,
     int                  d0,
     int                  d1) {
-
     struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
     struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
                                         ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
@@ -9991,7 +9989,7 @@ static void ggml_compute_forward_mul_mat(
             return;
         }
 
-        const int64_t tgemm0 = ggml_perf_time_us();
+        //const int64_t tgemm0 = ggml_perf_time_us();
         for (int64_t i13 = 0; i13 < ne13; i13++) {
             for (int64_t i12 = 0; i12 < ne12; i12++) {
                 const int64_t i03 = i13/r3;
@@ -16934,7 +16932,10 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     if (ggml_compute_forward_mul_mat_use_blas(node)) {
                         if (node->src[0]->type != GGML_TYPE_F32) {
                             // here we need memory for fully dequantized matrix from src0
-                            cur = ggml_type_size(GGML_TYPE_F32)*ggml_nelements(node->src[0]);
+                            // take into account that src0 can be broadcasted into src1[2,3]
+                            cur = ggml_type_size(GGML_TYPE_F32)
+                                * node->src[0]->ne[0]*node->src[0]->ne[1]
+                                * node->src[1]->ne[2]*node->src[1]->ne[3];
                         }
                     } else
 #endif
diff --git a/ggml.h b/ggml.h
index dca7bd9ce..1c4976271 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1499,7 +1499,6 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            struct ggml_tensor  * c,
             int                  s0,
             int                  s1,
             int                  p0,
diff --git a/llama.cpp b/llama.cpp
index f6f1ec0f4..582e82260 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2300,18 +2300,18 @@ struct llama_model_loader {
             }
 
             switch (type_max) {
-                case GGML_TYPE_F32:  ftype = LLAMA_FTYPE_ALL_F32;       break;
-                case GGML_TYPE_F16:  ftype = LLAMA_FTYPE_MOSTLY_F16;    break;
-                case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0;   break;
-                case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1;   break;
-                case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0;   break;
-                case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1;   break;
-                case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0;   break;
-                case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K;   break;
-                case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
-                case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
-                case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
-                case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K;   break;
+                case GGML_TYPE_F32:     ftype = LLAMA_FTYPE_ALL_F32;        break;
+                case GGML_TYPE_F16:     ftype = LLAMA_FTYPE_MOSTLY_F16;     break;
+                case GGML_TYPE_Q4_0:    ftype = LLAMA_FTYPE_MOSTLY_Q4_0;    break;
+                case GGML_TYPE_Q4_1:    ftype = LLAMA_FTYPE_MOSTLY_Q4_1;    break;
+                case GGML_TYPE_Q5_0:    ftype = LLAMA_FTYPE_MOSTLY_Q5_0;    break;
+                case GGML_TYPE_Q5_1:    ftype = LLAMA_FTYPE_MOSTLY_Q5_1;    break;
+                case GGML_TYPE_Q8_0:    ftype = LLAMA_FTYPE_MOSTLY_Q8_0;    break;
+                case GGML_TYPE_Q2_K:    ftype = LLAMA_FTYPE_MOSTLY_Q2_K;    break;
+                case GGML_TYPE_Q3_K:    ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M;  break;
+                case GGML_TYPE_Q4_K:    ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;  break;
+                case GGML_TYPE_Q5_K:    ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M;  break;
+                case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
                 case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
                 case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
                 default:

From 9ecdd12e95aee20d6dfaf5f5a0f0ce5ac1fb2747 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Tue, 23 Jan 2024 13:31:56 +0100
Subject: [PATCH 115/138] CUDA: more info when no device code (#5088)

---
 ggml-cuda.cu | 89 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 54 insertions(+), 35 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index ec3837fb8..7f460449e 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -13,6 +13,10 @@
 #include <map>
 #include <array>
 
+// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
+#define STRINGIZE_IMPL(...) #__VA_ARGS__
+#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
+
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
@@ -584,13 +588,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
 [[noreturn]]
-static __device__ void bad_arch() {
-    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+static __device__ void no_device_code(
+    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+           file_name, line, function_name, arch);
+    (void) arch_list;
+#else
+    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+           file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     __trap();
 
-    (void) bad_arch; // suppress unused function warning
+    (void) no_device_code; // suppress unused function warning
 }
 
+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
@@ -617,7 +636,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
     return a;
 #else
     (void) a;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }
 
@@ -638,7 +657,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
     return x;
 #else
     (void) x;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }
 
@@ -2421,7 +2440,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
     }
 #else
     (void) vx; (void) y; (void) k;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_PASCAL
 }
 
@@ -2452,7 +2471,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2489,7 +2508,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2524,7 +2543,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2569,7 +2588,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2590,7 +2609,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2620,7 +2639,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2655,7 +2674,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2692,7 +2711,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2732,7 +2751,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2757,7 +2776,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2790,7 +2809,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2823,7 +2842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2863,7 +2882,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2896,7 +2915,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2926,7 +2945,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2957,7 +2976,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -3823,7 +3842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -4006,7 +4025,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;
 
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -4501,7 +4520,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4570,7 +4589,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4637,7 +4656,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4704,7 +4723,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4771,7 +4790,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4838,7 +4857,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4907,7 +4926,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4976,7 +4995,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -5043,7 +5062,7 @@ mul_mat_q5_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -5112,7 +5131,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -5835,7 +5854,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
     }
 #else
     (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }
 

From 44879ee885f48ecf4675dd216b373dce0a6f3690 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Tue, 23 Jan 2024 15:17:20 +0200
Subject: [PATCH 116/138] Additional KL-divergence statistics (#5081)

* perplexity: add top-token probability

* perplexity: add additional KL-divergence statistics

* perplexity: a better organized KL-divergence statistics output

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 examples/perplexity/perplexity.cpp | 71 +++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 10 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index de6d3eb41..8d2204969 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -222,13 +222,18 @@ struct kl_divergence_result {
     double sum_kld2 = 0;
     double sum_nll_diff  = 0;
     double sum_nll_diff2 = 0;
+    size_t n_same_top = 0;
     size_t count = 0;
 };
 
-static void log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
+static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
     float max_logit = logits[0];
+    int imax = 0;
     for (int i = 1; i < n_vocab; ++i) {
-        max_logit = std::max(max_logit, logits[i]);
+        if (logits[i] > max_logit) {
+            max_logit = logits[i];
+            imax = i;
+        }
     }
     double sum_exp = 0.0;
     for (int i = 0; i < n_vocab; ++i) {
@@ -247,8 +252,14 @@ static void log_softmax(int n_vocab, const float * logits, const uint16_t * base
     kld.sum_nll_diff2 += nll*nll;
     max_logit += log_sum_exp;
     double sum = 0;
+    int imax_base = -1;
+    float p_log_base_max = 0;
     for (int i = 0; i < n_vocab; ++i) {
         const float p_log_base = scale*base_log_prob[i] + min_log_prob;
+        if (i == 0 || p_log_base > p_log_base_max) {
+            p_log_base_max = p_log_base;
+            imax_base = i;
+        }
         if (p_log_base > -16.f) {
             const float p_base = expf(p_log_base);
             sum += p_base * (p_log_base - logits[i] + max_logit);
@@ -257,14 +268,17 @@ static void log_softmax(int n_vocab, const float * logits, const uint16_t * base
     kld.sum_kld  += sum;
     kld.sum_kld2 += sum*sum;
     ++kld.count;
+    if (imax == imax_base) ++kld.n_same_top;
+    return sum;
 }
 
 static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
-        std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld) {
+        std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
+        float * kld_values) {
     std::mutex mutex;
     const int nv = 2*((n_vocab + 1)/2) + 4;
     int counter = 0;
-    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv] () {
+    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () {
         kl_divergence_result local_kld;
         while (true) {
             std::unique_lock<std::mutex> lock(mutex);
@@ -276,11 +290,13 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
                 kld.sum_kld2 += local_kld.sum_kld2;
                 kld.sum_nll_diff  += local_kld.sum_nll_diff;
                 kld.sum_nll_diff2 += local_kld.sum_nll_diff2;
+                kld.n_same_top += local_kld.n_same_top;
                 kld.count += local_kld.count;
                 break;
             }
             lock.unlock();
-            log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+            double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+            kld_values[i] = (float)v;
         }
     };
     for (auto & w : workers) {
@@ -1615,7 +1631,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     in.read((char *)&n_vocab, sizeof(n_vocab));
     in.read((char *)&n_chunk, sizeof(n_chunk));
     if (in.fail()) {
-        fprintf(stderr, "%s: failed rwading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
+        fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
         return;
     }
     if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
@@ -1634,6 +1650,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
     std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
+    std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
     std::vector<float> logits;
     if (num_batches > 1) {
         logits.reserve(n_ctx * n_vocab);
@@ -1652,6 +1669,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     };
 
     kl_divergence_result kld;
+    auto kld_ptr = kld_values.data();
 
     for (int i = 0; i < n_chunk; ++i) {
         const int start =     i * n_ctx;
@@ -1705,20 +1723,24 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
             }
             fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
 
-            printf("\nchunk        PPL          ln(PPL(Q)/PPL(base))          KL-Divergence\n");
+            printf("\nchunk        PPL          ln(PPL(Q)/PPL(base))          KL-Divergence           Same top\n");
         }
 
         const int first = n_ctx/2;
         const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
         process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                workers, log_probs_uint16, kld);
+                workers, log_probs_uint16, kld, kld_ptr);
+        kld_ptr += n_ctx - 1 - first;
 
         auto ppl           = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
         auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
         auto kl_div        = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
+        auto p_top = 1.*kld.n_same_top/kld.count;
+        auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));
 
-        printf("%4d    %10.4lf    %10.5lf ± %10.5f    %10.5f ± %10.5lf\n", i+1, exp(ppl.first),
-                log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second);
+        printf("%4d    %10.4lf    %10.5lf ± %10.5f    %10.5f ± %10.5lf    %.5f ± %.5f\n", i+1, exp(ppl.first),
+                log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second,
+                p_top, d_p_top);
 
         fflush(stdout);
 
@@ -1726,6 +1748,35 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     }
     printf("\n");
 
+    if (kld.count < 100) return; // we do not wish to do statistics on so few values
+
+    std::sort(kld_values.begin(), kld_values.end());
+
+    printf("===== KL-divergence statistics\n");
+    auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
+    printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second);
+    auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
+                                               : kld_values[kld_values.size()/2];
+    printf("Median : %10.6f\n", kld_median);
+
+    auto percentile = [&kld_values] (float fraction) {
+        if (fraction <= 0) return kld_values.front();
+        if (fraction >= 1) return kld_values.back();
+        float p = fraction*(kld_values.size() - 1);
+        size_t ip = size_t(p); p -= ip;
+        return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)];
+    };
+
+    printf("Maximum: %10.6f\n", kld_values.back());
+    printf("KLD_99 : %10.6f\n", percentile(0.99f));
+    printf("KLD_95 : %10.6f\n", percentile(0.95f));
+    printf("KLD_90 : %10.6f\n", percentile(0.90f));
+
+    printf("Minimum: %10.6f\n", kld_values.front());
+    printf("KLD_01 : %10.6f\n", percentile(0.01f));
+    printf("KLD_05 : %10.6f\n", percentile(0.05f));
+    printf("KLD_10 : %10.6f\n", percentile(0.10f));
+
 }
 
 int main(int argc, char ** argv) {

From 26d607608d794efa56df3bdb6043a2f94c1d632c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 23 Jan 2024 15:50:56 +0200
Subject: [PATCH 117/138] metal : disable support for MUL_MAT F32 x F16

---
 ggml-metal.m | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 912ddc83f..4b3eb4914 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -668,7 +668,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
             return true;
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
-            return ctx->support_simdgroup_reduction;
+            return ctx->support_simdgroup_reduction &&
+                (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32);
         case GGML_OP_CPY:
         case GGML_OP_DUP:
         case GGML_OP_CONT:

From 1387ea21178f9f154944013d4dd9764b54c69deb Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Wed, 24 Jan 2024 12:48:14 +0100
Subject: [PATCH 118/138] llama : pre-allocate input tensors in a separate
 buffer (#5100)

---
 ggml-alloc.c |   4 +-
 llama.cpp    | 345 ++++++++++++++++++++++++---------------------------
 2 files changed, 167 insertions(+), 182 deletions(-)

diff --git a/ggml-alloc.c b/ggml-alloc.c
index 89b85d348..60141a34d 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
         } else {
-            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                    __func__, size, max_avail);
+            fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
+                    __func__, tensor->name, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
             return;
         }
diff --git a/llama.cpp b/llama.cpp
index 582e82260..114046db9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1669,6 +1669,9 @@ struct llama_context {
         for (ggml_backend_t backend : backends) {
             ggml_backend_free(backend);
         }
+
+        ggml_backend_buffer_free(buf_input);
+        ggml_free(ctx_input);
     }
 
     llama_cparams cparams;
@@ -1715,8 +1718,14 @@ struct llama_context {
     // allocator for the input tensors
     ggml_tallocr * alloc = nullptr;
 
-    // temporary buffer for copying data to/from the backend
-    std::vector<no_init<uint8_t>> buf_copy;
+    // input tensors
+    ggml_backend_buffer_t buf_input = nullptr;
+    ggml_context * ctx_input = nullptr;
+    struct ggml_tensor * inp_tokens;    // I32 [n_batch]
+    struct ggml_tensor * inp_embd;      // F32 [n_embd, n_batch]
+    struct ggml_tensor * inp_pos;       // I32 [n_batch]
+    struct ggml_tensor * inp_KQ_mask;   // F32 [n_ctx, n_batch]
+    struct ggml_tensor * inp_K_shift;   // I32 [n_ctx]
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -4089,22 +4098,24 @@ static struct ggml_tensor * llm_build_inp_embd(
         const llama_hparams & hparams,
           const llama_batch & batch,
          struct ggml_tensor * tok_embd,
+         struct ggml_tensor * inp_tokens,
+         struct ggml_tensor * inp_embd,
          const llm_build_cb & cb) {
     const int64_t n_embd = hparams.n_embd;
 
     struct ggml_tensor * inpL;
 
     if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
+        struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0);
         cb(inp_tokens, "inp_tokens", -1);
 
-        inpL = ggml_get_rows(ctx, tok_embd, inp_tokens);
+        inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v);
     } else {
 #ifdef GGML_USE_MPI
         GGML_ASSERT(false && "not implemented");
 #endif
 
-        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+        inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0);
     }
 
     return inpL;
@@ -4118,6 +4129,7 @@ static void llm_build_k_shift(
       const llama_cparams & cparams,
      const llama_kv_cache & kv,
        struct ggml_cgraph * graph,
+       struct ggml_tensor * K_shift,
             llm_rope_type   type,
                   int64_t   n_ctx,
                   float     freq_base,
@@ -4134,9 +4146,6 @@ static void llm_build_k_shift(
     const float   beta_fast     = cparams.yarn_beta_fast;
     const float   beta_slow     = cparams.yarn_beta_slow;
 
-    struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
-    cb(K_shift, "K_shift", -1);
-
     int rope_type = 0;
 
     switch (type) {
@@ -4457,6 +4466,7 @@ static struct ggml_tensor * llm_build_kv(
 
 struct llm_build_context {
     const llama_model    & model;
+    const llama_context  & lctx;
     const llama_hparams  & hparams;
     const llama_cparams  & cparams;
     const llama_batch    & batch;
@@ -4503,6 +4513,7 @@ struct llm_build_context {
     const llm_build_cb & cb,
                   bool   worst_case) :
         model            (lctx.model),
+        lctx             (lctx),
         hparams          (model.hparams),
         cparams          (lctx.cparams),
         batch            (batch),
@@ -4563,20 +4574,20 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         cb(inp_pos, "inp_pos", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         // shift the entire K-cache if needed
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
         }
 
         for (int il = 0; il < n_layer; ++il) {
@@ -4747,20 +4758,20 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         cb(inp_pos, "inp_pos", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         // shift the entire K-cache if needed
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
         }
 
         for (int il = 0; il < n_layer; ++il) {
@@ -4868,20 +4879,20 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         cb(inp_pos, "inp_pos", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         // shift the entire K-cache if needed
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
         }
 
         for (int il = 0; il < n_layer; ++il) {
@@ -4990,15 +5001,15 @@ struct llm_build_context {
         struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         cb(inp_pos, "inp_pos", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
@@ -5087,19 +5098,19 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         cb(inp_pos, "inp_pos", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
         }
 
         for (int il = 0; il < n_layer; ++il) {
@@ -5294,11 +5305,11 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         for (int il = 0; il < n_layer; ++il) {
@@ -5384,11 +5395,11 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         inpL = llm_build_norm(ctx0, inpL, hparams,
@@ -5477,11 +5488,11 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         for (int il = 0; il < n_layer; ++il) {
@@ -5573,20 +5584,20 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         cb(inp_pos, "inp_pos", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         // shift the entire K-cache if needed
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
         }
 
         for (int il = 0; il < n_layer; ++il) {
@@ -5696,20 +5707,20 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         cb(inp_pos, "inp_pos", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         // shift the entire K-cache if needed
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
         }
 
         for (int il = 0; il < n_layer; ++il) {
@@ -5810,20 +5821,20 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         cb(inp_pos, "inp_pos", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         // shift the entire K-cache if needed
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
         }
 
         for (int il = 0; il < n_layer; ++il) {
@@ -5931,20 +5942,20 @@ struct llm_build_context {
         struct ggml_tensor * ffn_output;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         cb(inp_pos, "inp_pos", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         // shift the entire K-cache if needed
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
         }
 
         for (int il = 0; il < n_layer; ++il) {
@@ -6053,20 +6064,20 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         cb(inp_pos, "inp_pos", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         // shift the entire K-cache if needed
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
         }
 
         for (int il = 0; il < n_layer; ++il) {
@@ -6160,15 +6171,15 @@ struct llm_build_context {
         struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         cb(inp_pos, "inp_pos", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
@@ -6258,20 +6269,20 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         cb(inp_pos, "inp_pos", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
         // shift the entire K-cache if needed
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
         }
 
         for (int il = 0; il < n_layer; ++il) {
@@ -6365,15 +6376,7 @@ static struct ggml_cgraph * llama_build_graph(
     // check if we should build the worst-case graph (for memory measurement)
     const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
 
-    // keep track of the input that has already been allocated
-    bool alloc_inp_tokens   = false;
-    bool alloc_inp_embd     = false;
-    bool alloc_inp_pos      = false;
-    bool alloc_inp_KQ_mask  = false;
-    bool alloc_inp_K_shift  = false;
-
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
-    // TODO: improve handling of input and output tensors, then replace this with ggml_set_name
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
         if (il >= 0) {
             ggml_format_name(cur, "%s-%d", name, il);
@@ -6381,127 +6384,79 @@ static struct ggml_cgraph * llama_build_graph(
             ggml_set_name(cur, name);
         }
 
-
         if (!lctx.cparams.offload_kqv) {
             if (strcmp(name, "kqv_merged_cont") == 0) {
                 // all nodes between the KV store and the attention output are run on the CPU
                 ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
             }
         }
-
-        //
-        // allocate input tensors and set input data
-        //
-
-        if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
-            ggml_tallocr_alloc(lctx.alloc, cur);
-
-            if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
-                const int64_t n_tokens = cur->ne[0];
-
-                ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
-            }
-
-            alloc_inp_tokens = true;
-        }
-
-        if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
-            ggml_tallocr_alloc(lctx.alloc, cur);
-
-            if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) {
-                const int64_t n_embd   = cur->ne[0];
-                const int64_t n_tokens = cur->ne[1];
-
-                ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
-            }
-
-            alloc_inp_embd = true;
-        }
-
-        if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
-            ggml_tallocr_alloc(lctx.alloc, cur);
-
-            if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) {
-                const int64_t n_tokens = cur->ne[0];
-
-                static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
-                ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
-            }
-
-            alloc_inp_pos = true;
-        }
-
-        if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
-            ggml_tallocr_alloc(lctx.alloc, cur);
-
-            if (!ggml_tallocr_is_measure(lctx.alloc)) {
-                const int64_t n_kv     = cur->ne[0];
-                const int64_t n_tokens = cur->ne[1];
-
-                float * data;
-                if (ggml_backend_buffer_is_host(cur->buffer)) {
-                    data = (float *) cur->data;
-                } else {
-                    lctx.buf_copy.resize(ggml_nbytes(cur));
-                    data = (float *) lctx.buf_copy.data();
-                }
-
-                for (int h = 0; h < 1; ++h) {
-                    for (int j = 0; j < n_tokens; ++j) {
-                        const llama_pos    pos    = batch.pos[j];
-                        const llama_seq_id seq_id = batch.seq_id[j][0];
-
-                        for (int i = 0; i < n_kv; ++i) {
-                            float f;
-                            if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
-                                f = -INFINITY;
-                            } else {
-                                f = 0;
-                            }
-                            data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
-                        }
-                    }
-                }
-
-                if (data != cur->data) {
-                    ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
-                }
-            }
-
-            alloc_inp_KQ_mask = true;
-        }
-
-        if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
-            ggml_tallocr_alloc(lctx.alloc, cur);
-
-            if (!ggml_tallocr_is_measure(lctx.alloc)) {
-                const int64_t n_ctx = cur->ne[0];
-
-                int32_t * data;
-                if (ggml_backend_buffer_is_host(cur->buffer)) {
-                    data = (int32_t *) cur->data;
-                } else {
-                    lctx.buf_copy.resize(ggml_nbytes(cur));
-                    data = (int32_t *) lctx.buf_copy.data();
-                }
-
-                for (int i = 0; i < n_ctx; ++i) {
-                    data[i] = lctx.kv_self.cells[i].delta;
-                }
-
-                if (data != cur->data) {
-                    ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
-                }
-            }
-
-            alloc_inp_K_shift = true;
-        }
     };
 
     struct ggml_cgraph * result = NULL;
 
     struct llm_build_context llm(lctx, batch, cb, worst_case);
 
+    //
+    // set input data
+    //
+
+    if (!ggml_tallocr_is_measure(lctx.alloc)) {
+        if (batch.token) {
+            const int64_t n_tokens = batch.n_tokens;
+
+            ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
+        }
+
+        if (batch.embd) {
+            const int64_t n_embd   = llm.n_embd;
+            const int64_t n_tokens = batch.n_tokens;
+
+            ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+        }
+
+        if (batch.pos) {
+            const int64_t n_tokens = batch.n_tokens;
+
+            ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+        }
+
+        {
+            const int64_t n_kv     = llm.n_kv;
+            const int64_t n_tokens = batch.n_tokens;
+
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+            float * data = (float *) lctx.inp_KQ_mask->data;
+
+            for (int h = 0; h < 1; ++h) {
+                for (int j = 0; j < n_tokens; ++j) {
+                    const llama_pos    pos    = batch.pos[j];
+                    const llama_seq_id seq_id = batch.seq_id[j][0];
+
+                    for (int i = 0; i < n_kv; ++i) {
+                        float f;
+                        if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
+                            f = -INFINITY;
+                        } else {
+                            f = 0;
+                        }
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+                    }
+                }
+            }
+        }
+
+        if (llm.do_rope_shift) {
+            const int64_t n_ctx = llm.n_ctx;
+
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
+            int32_t * data = (int32_t *) lctx.inp_K_shift->data;
+
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = lctx.kv_self.cells[i].delta;
+            }
+        }
+    }
+
     llm.init();
 
     switch (model.arch) {
@@ -9964,6 +9919,35 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
 
+        // graph inputs
+        {
+            ggml_init_params init_params = {
+                /* .mem_size   */ ggml_tensor_overhead()*5,
+                /* .mem_buffer */ nullptr,
+                /* .no_alloc   */ true,
+            };
+            ctx->ctx_input = ggml_init(init_params);
+
+            ctx->inp_tokens  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+            ctx->inp_embd    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
+            ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+            ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
+            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
+
+            ggml_set_name(ctx->inp_tokens,  "inp_tokens");
+            ggml_set_name(ctx->inp_embd,    "inp_embd");
+            ggml_set_name(ctx->inp_pos,     "inp_pos");
+            ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
+            ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
+
+            ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
+
+            LLAMA_LOG_INFO("%s: %10s input buffer size   = %8.2f MiB\n", __func__,
+                    ggml_backend_buffer_name(ctx->buf_input),
+                    ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
+        }
+
+        // scheduler and compute buffers
         {
             // buffer types used for the compute buffer of each backend
             std::vector<ggml_backend_buffer_type_t> backend_buft;
@@ -9990,9 +9974,6 @@ struct llama_context * llama_new_context_with_model(
 
             // initialize scheduler with the worst-case graph
             ggml_backend_sched_init_measure(ctx->sched, gf);
-            // note: the number of splits during measure is higher than during inference due to the kv shift
-            int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
             ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
 
             for (ggml_backend_t backend : ctx->backends) {
@@ -10001,6 +9982,10 @@ struct llama_context * llama_new_context_with_model(
                         ggml_backend_buffer_name(buf),
                         ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
             }
+
+            // note: the number of splits during measure is higher than during inference due to the kv shift
+            int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
+            LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
         }
     }
 

From bf63d695b804b1c995c7ae4427a8a86936ea6d25 Mon Sep 17 00:00:00 2001
From: Michael Hueschen <m@mhueschen.dev>
Date: Mon, 22 Jan 2024 03:17:05 -0700
Subject: [PATCH 119/138] nix: add cc to devShell LD_LIBRARY_PATH

this fixes the error I encountered when trying to run the convert.py
script in a venv:

```
$ nix develop

[...]$ source .venv/bin/activate
(.venv)
[...]$ pip3 install -r requirements.txt
<... clipped ...>
[...]$ python3 ./convert.py
Traceback (most recent call last):
  File "/home/mhueschen/projects-reference/llama.cpp/./convert.py", line 40, in <module>
    from sentencepiece import SentencePieceProcessor
  File "/home/mhueschen/projects-reference/llama.cpp/.venv/lib/python3.11/site-packages/sentencepiece/__init__.py", line 13, in <module>
    from . import _sentencepiece
ImportError: libstdc++.so.6: cannot open shared object file: No such file or directory
```

however, I am not sure this is the cleanest way to address this linker
issue...
---
 .devops/nix/package.nix | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index c25d99f01..91ddb8890 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -221,10 +221,16 @@ effectiveStdenv.mkDerivation (
         ;
 
       shell = mkShell {
+        NIX_LD_LIBRARY_PATH = lib.makeLibraryPath [
+          effectiveStdenv.cc.cc
+        ];
         name = "shell-${finalAttrs.finalPackage.name}";
         description = "contains numpy and sentencepiece";
         buildInputs = [ llama-python ];
         inputsFrom = [ finalAttrs.finalPackage ];
+        shellHook = ''
+          export LD_LIBRARY_PATH=$NIX_LD_LIBRARY_PATH
+        '';
       };
 
       shell-extra = mkShell {

From c9b316c78fba31e65879a2ec91cbafd341b88cce Mon Sep 17 00:00:00 2001
From: Michael Hueschen <m@mhueschen.dev>
Date: Mon, 22 Jan 2024 16:44:10 -0700
Subject: [PATCH 120/138] nix-shell: use addToSearchPath

thx to @SomeoneSerge for the suggestion!
---
 .devops/nix/package.nix | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 91ddb8890..a868a9a61 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -221,15 +221,12 @@ effectiveStdenv.mkDerivation (
         ;
 
       shell = mkShell {
-        NIX_LD_LIBRARY_PATH = lib.makeLibraryPath [
-          effectiveStdenv.cc.cc
-        ];
         name = "shell-${finalAttrs.finalPackage.name}";
         description = "contains numpy and sentencepiece";
         buildInputs = [ llama-python ];
         inputsFrom = [ finalAttrs.finalPackage ];
         shellHook = ''
-          export LD_LIBRARY_PATH=$NIX_LD_LIBRARY_PATH
+          addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
         '';
       };
 

From cd4fddb29f81d6a1f6d51a0c016bc6b486d68def Mon Sep 17 00:00:00 2001
From: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Date: Wed, 24 Jan 2024 16:18:15 -0600
Subject: [PATCH 121/138] cuda : fix 2-bit quants on amd hip (#5105)

* cuda : fix 2-bit quants on amd hip

* use __low2float intrinsic function for new quants
---
 ggml-cuda.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 7f460449e..05e5d18ab 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -4283,7 +4283,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
         q8 += 8;
         aux32 >>= 7;
     }
-    const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f;
+    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
     return d * sumi;
 #else
     // iqs is 0...15
@@ -4294,7 +4294,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
     const uint8_t  * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
     const uint8_t  * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
     const uint32_t aux32 = q2[2] | (q2[3] << 16);
-    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f;
+    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
     const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
     const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
     const int8_t * q8 = bq8_1[ib32].qs + 16*il;
@@ -4339,7 +4339,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
         }
         q8 += 8;
     }
-    const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f;
+    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
     return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
 #else
     assert(false);

From ddc5a5033f948dc7ab0a3a6ec2d914d13c274077 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 25 Jan 2024 11:26:17 +0200
Subject: [PATCH 122/138] metal : show compile log messages

---
 ggml-metal.m | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 4b3eb4914..60fef1a19 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -277,6 +277,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
             NSURL * libURL = [NSURL fileURLWithPath:libPath];
             GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
             ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
+            if (error) {
+                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                return NULL;
+            }
         } else {
             GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
@@ -315,13 +319,12 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                 //[options setFastMathEnabled:false];
 
                 ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+                if (error) {
+                    GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                    return NULL;
+                }
             }
         }
-
-        if (error) {
-            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-            return NULL;
-        }
     }
 
     // print MTL GPU family:

From faa3526a1eba458120987ed8269e5616385a76f4 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Thu, 25 Jan 2024 17:58:53 +0200
Subject: [PATCH 123/138] Fix Q3_K_XS for MoE models (#5113)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 llama.cpp | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 114046db9..6a7506e85 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8829,6 +8829,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     auto use_more_bits = [](int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
+    const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+    auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
+        if (n_expert > 1) {
+            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
+            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
+            // for getting the current layer as I initially thought, and we need to resort to parsing the
+            // tensor name.
+            n_layer /= n_expert;
+            if (sscanf(name, "blk.%d.", &i_layer) != 1) {
+                throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
+            }
+            if (i_layer < 0 || i_layer >= n_layer) {
+                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
+            }
+        }
+        return std::make_pair(i_layer, n_layer);
+    };
 
     if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
         int nx = tensor->ne[0];
@@ -8890,24 +8907,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = GGML_TYPE_Q2_K;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
-        const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
-        int i_layer, n_layer;
-        if (n_expert == 1) {
-            i_layer = qs.i_ffn_down;
-            n_layer = qs.n_ffn_down;
-        } else {
-            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
-            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
-            // for getting the current layer as I initially thought, and we need to resort to parsing the
-            // tensor name.
-            n_layer = qs.n_ffn_down / n_expert;
-            if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
-                throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
-            }
-            if (i_layer < 0 || i_layer >= n_layer) {
-                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
-            }
-        }
+        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+        int i_layer = info.first, n_layer = info.second;
         if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
@@ -8963,13 +8964,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
     else if (name.find("ffn_gate") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) {
+        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
+        int i_layer = info.first, n_layer = info.second;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
             new_type = GGML_TYPE_Q2_K;
         }
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) {
+        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
+        int i_layer = info.first, n_layer = info.second;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
             new_type = GGML_TYPE_Q2_K;
         }
         ++qs.i_ffn_up;

From 256d1bb0ddce6a0a21f5a7503019bdd5c1933cba Mon Sep 17 00:00:00 2001
From: Valentin Konovalov <valle.ketsujin@gmail.com>
Date: Thu, 25 Jan 2024 12:05:51 -0500
Subject: [PATCH 124/138] android : use release cmake build type by default
 (#5123)

---
 examples/llama.android/app/build.gradle.kts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts
index 7815a8025..aadbe22c9 100644
--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -30,6 +30,7 @@ android {
         }
         externalNativeBuild {
             cmake {
+                arguments += "-DCMAKE_BUILD_TYPE=Release"
                 cppFlags += listOf()
                 arguments += listOf()
             }

From d292f4f2047963f558dd516f1baaa71793e9acf2 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 25 Jan 2024 14:51:24 -0500
Subject: [PATCH 125/138] examples : make pydantic scripts pass mypy and
 support py3.8 (#5099)

---
 .../pydantic-models-to-grammar-examples.py    |  43 +----
 examples/pydantic_models_to_grammar.py        | 166 +++++++++---------
 2 files changed, 88 insertions(+), 121 deletions(-)

diff --git a/examples/pydantic-models-to-grammar-examples.py b/examples/pydantic-models-to-grammar-examples.py
index cbf376652..160966649 100644
--- a/examples/pydantic-models-to-grammar-examples.py
+++ b/examples/pydantic-models-to-grammar-examples.py
@@ -1,14 +1,14 @@
 # Function calling example using pydantic models.
 import datetime
+import importlib
 import json
 from enum import Enum
-from typing import Union, Optional
+from typing import Optional, Union
 
 import requests
 from pydantic import BaseModel, Field
-
-import importlib
-from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
+from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model,
+                                        create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
 
 
 # Function to get completion on the llama.cpp server with grammar.
@@ -35,7 +35,7 @@ class SendMessageToUser(BaseModel):
         print(self.message)
 
 
-# Enum for the calculator function.
+# Enum for the calculator tool.
 class MathOperation(Enum):
     ADD = "add"
     SUBTRACT = "subtract"
@@ -43,7 +43,7 @@ class MathOperation(Enum):
     DIVIDE = "divide"
 
 
-# Very simple calculator tool for the agent.
+# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
 class Calculator(BaseModel):
     """
     Perform a math operation on two numbers.
@@ -148,37 +148,6 @@ def get_current_datetime(output_format: Optional[str] = None):
     return datetime.datetime.now().strftime(output_format)
 
 
-# Enum for the calculator tool.
-class MathOperation(Enum):
-    ADD = "add"
-    SUBTRACT = "subtract"
-    MULTIPLY = "multiply"
-    DIVIDE = "divide"
-
-
-
-# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
-class Calculator(BaseModel):
-    """
-    Perform a math operation on two numbers.
-    """
-    number_one: Union[int, float] = Field(..., description="First number.")
-    operation: MathOperation = Field(..., description="Math operation to perform.")
-    number_two: Union[int, float] = Field(..., description="Second number.")
-
-    def run(self):
-        if self.operation == MathOperation.ADD:
-            return self.number_one + self.number_two
-        elif self.operation == MathOperation.SUBTRACT:
-            return self.number_one - self.number_two
-        elif self.operation == MathOperation.MULTIPLY:
-            return self.number_one * self.number_two
-        elif self.operation == MathOperation.DIVIDE:
-            return self.number_one / self.number_two
-        else:
-            raise ValueError("Unknown operation.")
-
-
 # Example function to get the weather
 def get_current_weather(location, unit):
     """Get the current weather in a given location"""
diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py
index 848c1c367..9acc7cc6d 100644
--- a/examples/pydantic_models_to_grammar.py
+++ b/examples/pydantic_models_to_grammar.py
@@ -1,15 +1,21 @@
+from __future__ import annotations
+
 import inspect
 import json
+import re
 from copy import copy
-from inspect import isclass, getdoc
-from types import NoneType
+from enum import Enum
+from inspect import getdoc, isclass
+from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
 
 from docstring_parser import parse
-from pydantic import BaseModel, create_model, Field
-from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias
-from enum import Enum
-from typing import get_type_hints, Callable
-import re
+from pydantic import BaseModel, Field, create_model
+
+if TYPE_CHECKING:
+    from types import GenericAlias
+else:
+    # python 3.8 compat
+    from typing import _GenericAlias as GenericAlias
 
 
 class PydanticDataType(Enum):
@@ -43,7 +49,7 @@ class PydanticDataType(Enum):
     SET = "set"
 
 
-def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
+def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
     if isclass(pydantic_type) and issubclass(pydantic_type, str):
         return PydanticDataType.STRING.value
     elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
@@ -57,22 +63,22 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
 
     elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
         return format_model_and_field_name(pydantic_type.__name__)
-    elif get_origin(pydantic_type) == list:
+    elif get_origin(pydantic_type) is list:
         element_type = get_args(pydantic_type)[0]
         return f"{map_pydantic_type_to_gbnf(element_type)}-list"
-    elif get_origin(pydantic_type) == set:
+    elif get_origin(pydantic_type) is set:
         element_type = get_args(pydantic_type)[0]
         return f"{map_pydantic_type_to_gbnf(element_type)}-set"
-    elif get_origin(pydantic_type) == Union:
+    elif get_origin(pydantic_type) is Union:
         union_types = get_args(pydantic_type)
         union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
         return f"union-{'-or-'.join(union_rules)}"
-    elif get_origin(pydantic_type) == Optional:
+    elif get_origin(pydantic_type) is Optional:
         element_type = get_args(pydantic_type)[0]
         return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
     elif isclass(pydantic_type):
         return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
-    elif get_origin(pydantic_type) == dict:
+    elif get_origin(pydantic_type) is dict:
         key_type, value_type = get_args(pydantic_type)
         return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
     else:
@@ -106,7 +112,6 @@ def get_members_structure(cls, rule_name):
         return f"{cls.__name__.lower()} ::= " + " | ".join(members)
     if cls.__annotations__ and cls.__annotations__ != {}:
         result = f'{rule_name} ::= "{{"'
-        type_list_rules = []
         # Modify this comprehension
         members = [
             f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param_type)}'
@@ -116,27 +121,25 @@ def get_members_structure(cls, rule_name):
 
         result += '"," '.join(members)
         result += '  "}"'
-        return result, type_list_rules
-    elif rule_name == "custom-class-any":
+        return result
+    if rule_name == "custom-class-any":
         result = f"{rule_name} ::= "
         result += "value"
-        type_list_rules = []
-        return result, type_list_rules
-    else:
-        init_signature = inspect.signature(cls.__init__)
-        parameters = init_signature.parameters
-        result = f'{rule_name} ::=  "{{"'
-        type_list_rules = []
-        # Modify this comprehension too
-        members = [
-            f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param.annotation)}'
-            for name, param in parameters.items()
-            if name != "self" and param.annotation != inspect.Parameter.empty
-        ]
+        return result
 
-        result += '", "'.join(members)
-        result += '  "}"'
-        return result, type_list_rules
+    init_signature = inspect.signature(cls.__init__)
+    parameters = init_signature.parameters
+    result = f'{rule_name} ::=  "{{"'
+    # Modify this comprehension too
+    members = [
+        f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param.annotation)}'
+        for name, param in parameters.items()
+        if name != "self" and param.annotation != inspect.Parameter.empty
+    ]
+
+    result += '", "'.join(members)
+    result += '  "}"'
+    return result
 
 
 def regex_to_gbnf(regex_pattern: str) -> str:
@@ -269,7 +272,7 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
 
 def generate_gbnf_rule_for_type(
     model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None
-) -> Tuple[str, list]:
+) -> tuple[str, list[str]]:
     """
     Generate GBNF rule for a given field type.
 
@@ -283,7 +286,7 @@ def generate_gbnf_rule_for_type(
     :param field_info: Additional information about the field (optional).
 
     :return: Tuple containing the GBNF type and a list of additional rules.
-    :rtype: Tuple[str, list]
+    :rtype: tuple[str, list]
     """
     rules = []
 
@@ -321,8 +324,7 @@ def generate_gbnf_rule_for_type(
         gbnf_type, rules = model_name + "-" + field_name, rules
 
     elif gbnf_type.startswith("custom-class-"):
-        nested_model_rules, field_types = get_members_structure(field_type, gbnf_type)
-        rules.append(nested_model_rules)
+        rules.append(get_members_structure(field_type, gbnf_type))
     elif gbnf_type.startswith("custom-dict-"):
         key_type, value_type = get_args(field_type)
 
@@ -341,14 +343,14 @@ def generate_gbnf_rule_for_type(
         union_rules = []
 
         for union_type in union_types:
-            if isinstance(union_type, _GenericAlias):
+            if isinstance(union_type, GenericAlias):
                 union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
                     model_name, field_name, union_type, False, processed_models, created_rules
                 )
                 union_rules.append(union_gbnf_type)
                 rules.extend(union_rules_list)
 
-            elif not issubclass(union_type, NoneType):
+            elif not issubclass(union_type, type(None)):
                 union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
                     model_name, field_name, union_type, False, processed_models, created_rules
                 )
@@ -424,14 +426,10 @@ def generate_gbnf_rule_for_type(
     else:
         gbnf_type, rules = gbnf_type, []
 
-    if gbnf_type not in created_rules:
-        return gbnf_type, rules
-    else:
-        if gbnf_type in created_rules:
-            return gbnf_type, rules
+    return gbnf_type, rules
 
 
-def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool):
+def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[BaseModel]], created_rules: dict[str, list[str]]) -> tuple[list[str], bool]:
     """
 
     Generate GBnF Grammar
@@ -452,7 +450,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
     ```
     """
     if model in processed_models:
-        return []
+        return [], False
 
     processed_models.add(model)
     model_name = format_model_and_field_name(model.__name__)
@@ -518,7 +516,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
 
 
 def generate_gbnf_grammar_from_pydantic_models(
-    models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None,
+    models: list[type[BaseModel]], outer_object_name: str | None = None, outer_object_content: str | None = None,
     list_of_outputs: bool = False
 ) -> str:
     """
@@ -528,7 +526,7 @@ def generate_gbnf_grammar_from_pydantic_models(
     * grammar.
 
     Args:
-        models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from.
+        models (list[type[BaseModel]]): A list of Pydantic models to generate the grammar from.
         outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
         outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
         list_of_outputs (str, optional): Allows a list of output objects
@@ -543,9 +541,9 @@ def generate_gbnf_grammar_from_pydantic_models(
         # root ::= UserModel | PostModel
         # ...
     """
-    processed_models = set()
+    processed_models: set[type[BaseModel]] = set()
     all_rules = []
-    created_rules = {}
+    created_rules: dict[str, list[str]] = {}
     if outer_object_name is None:
         for model in models:
             model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules)
@@ -608,7 +606,7 @@ def get_primitive_grammar(grammar):
     Returns:
         str: GBNF primitive grammar string.
     """
-    type_list = []
+    type_list: list[type[object]] = []
     if "string-list" in grammar:
         type_list.append(str)
     if "boolean-list" in grammar:
@@ -666,14 +664,14 @@ triple-quotes ::= "'''" """
 
 
 def generate_markdown_documentation(
-    pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
+    pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
     documentation_with_field_description=True
 ) -> str:
     """
     Generate markdown documentation for a list of Pydantic models.
 
     Args:
-        pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
+        pydantic_models (list[type[BaseModel]]): list of Pydantic model classes.
         model_prefix (str): Prefix for the model section.
         fields_prefix (str): Prefix for the fields section.
         documentation_with_field_description (bool): Include field descriptions in the documentation.
@@ -731,7 +729,7 @@ def generate_markdown_documentation(
 
 
 def generate_field_markdown(
-    field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
+    field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
     documentation_with_field_description=True
 ) -> str:
     """
@@ -739,8 +737,8 @@ def generate_field_markdown(
 
     Args:
         field_name (str): Name of the field.
-        field_type (Type[Any]): Type of the field.
-        model (Type[BaseModel]): Pydantic model class.
+        field_type (type[Any]): Type of the field.
+        model (type[BaseModel]): Pydantic model class.
         depth (int): Indentation depth in the documentation.
         documentation_with_field_description (bool): Include field descriptions in the documentation.
 
@@ -798,7 +796,7 @@ def generate_field_markdown(
     return field_text
 
 
-def format_json_example(example: dict, depth: int) -> str:
+def format_json_example(example: dict[str, Any], depth: int) -> str:
     """
     Format a JSON example into a readable string with indentation.
 
@@ -819,14 +817,14 @@ def format_json_example(example: dict, depth: int) -> str:
 
 
 def generate_text_documentation(
-    pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
+    pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
     documentation_with_field_description=True
 ) -> str:
     """
     Generate text documentation for a list of Pydantic models.
 
     Args:
-        pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
+        pydantic_models (list[type[BaseModel]]): List of Pydantic model classes.
         model_prefix (str): Prefix for the model section.
         fields_prefix (str): Prefix for the fields section.
         documentation_with_field_description (bool): Include field descriptions in the documentation.
@@ -885,7 +883,7 @@ def generate_text_documentation(
 
 
 def generate_field_text(
-    field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
+    field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
     documentation_with_field_description=True
 ) -> str:
     """
@@ -893,8 +891,8 @@ def generate_field_text(
 
     Args:
         field_name (str): Name of the field.
-        field_type (Type[Any]): Type of the field.
-        model (Type[BaseModel]): Pydantic model class.
+        field_type (type[Any]): Type of the field.
+        model (type[BaseModel]): Pydantic model class.
         depth (int): Indentation depth in the documentation.
         documentation_with_field_description (bool): Include field descriptions in the documentation.
 
@@ -1017,8 +1015,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
     pydantic_model_list,
     grammar_file_path="./generated_grammar.gbnf",
     documentation_file_path="./generated_grammar_documentation.md",
-    outer_object_name: str = None,
-    outer_object_content: str = None,
+    outer_object_name: str | None = None,
+    outer_object_content: str | None = None,
     model_prefix: str = "Output Model",
     fields_prefix: str = "Output Fields",
     list_of_outputs: bool = False,
@@ -1053,8 +1051,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
 
 def generate_gbnf_grammar_and_documentation(
     pydantic_model_list,
-    outer_object_name: str = None,
-    outer_object_content: str = None,
+    outer_object_name: str | None = None,
+    outer_object_content: str | None = None,
     model_prefix: str = "Output Model",
     fields_prefix: str = "Output Fields",
     list_of_outputs: bool = False,
@@ -1086,9 +1084,9 @@ def generate_gbnf_grammar_and_documentation(
 
 
 def generate_gbnf_grammar_and_documentation_from_dictionaries(
-    dictionaries: List[dict],
-    outer_object_name: str = None,
-    outer_object_content: str = None,
+    dictionaries: list[dict[str, Any]],
+    outer_object_name: str | None = None,
+    outer_object_content: str | None = None,
     model_prefix: str = "Output Model",
     fields_prefix: str = "Output Fields",
     list_of_outputs: bool = False,
@@ -1098,7 +1096,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
     Generate GBNF grammar and documentation from a list of dictionaries.
 
     Args:
-        dictionaries (List[dict]): List of dictionaries representing Pydantic models.
+        dictionaries (list[dict]): List of dictionaries representing Pydantic models.
         outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
         outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
         model_prefix (str): Prefix for the model section in the documentation.
@@ -1120,7 +1118,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
     return grammar, documentation
 
 
-def create_dynamic_model_from_function(func: Callable):
+def create_dynamic_model_from_function(func: Callable[..., Any]):
     """
     Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method.
 
@@ -1135,6 +1133,7 @@ def create_dynamic_model_from_function(func: Callable):
     sig = inspect.signature(func)
 
     # Parse the docstring
+    assert func.__doc__ is not None
     docstring = parse(func.__doc__)
 
     dynamic_fields = {}
@@ -1157,7 +1156,6 @@ def create_dynamic_model_from_function(func: Callable):
                 f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
 
         # Add parameter details to the schema
-        param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
         param_docs.append((param.name, param_doc))
         if param.default == inspect.Parameter.empty:
             default_value = ...
@@ -1166,10 +1164,10 @@ def create_dynamic_model_from_function(func: Callable):
         dynamic_fields[param.name] = (
             param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
     # Creating the dynamic model
-    dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
+    dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)  # type: ignore[call-overload]
 
-    for param_doc in param_docs:
-        dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description
+    for name, param_doc in param_docs:
+        dynamic_model.model_fields[name].description = param_doc.description
 
     dynamic_model.__doc__ = docstring.short_description
 
@@ -1182,16 +1180,16 @@ def create_dynamic_model_from_function(func: Callable):
     return dynamic_model
 
 
-def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
+def add_run_method_to_dynamic_model(model: type[BaseModel], func: Callable[..., Any]):
     """
     Add a 'run' method to a dynamic Pydantic model, using the provided function.
 
     Args:
-        model (Type[BaseModel]): Dynamic Pydantic model class.
+        model (type[BaseModel]): Dynamic Pydantic model class.
         func (Callable): Function to be added as a 'run' method to the model.
 
     Returns:
-        Type[BaseModel]: Pydantic model class with the added 'run' method.
+        type[BaseModel]: Pydantic model class with the added 'run' method.
     """
 
     def run_method_wrapper(self):
@@ -1204,15 +1202,15 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
     return model
 
 
-def create_dynamic_models_from_dictionaries(dictionaries: List[dict]):
+def create_dynamic_models_from_dictionaries(dictionaries: list[dict[str, Any]]):
     """
     Create a list of dynamic Pydantic model classes from a list of dictionaries.
 
     Args:
-        dictionaries (List[dict]): List of dictionaries representing model structures.
+        dictionaries (list[dict]): List of dictionaries representing model structures.
 
     Returns:
-        List[Type[BaseModel]]: List of generated dynamic Pydantic model classes.
+        list[type[BaseModel]]: List of generated dynamic Pydantic model classes.
     """
     dynamic_models = []
     for func in dictionaries:
@@ -1249,7 +1247,7 @@ def list_to_enum(enum_name, values):
     return Enum(enum_name, {value: value for value in values})
 
 
-def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]:
+def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: str = "CustomModel") -> type[Any]:
     """
     Convert a dictionary to a Pydantic model class.
 
@@ -1258,9 +1256,9 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
         model_name (str): Name of the generated Pydantic model.
 
     Returns:
-        Type[BaseModel]: Generated Pydantic model class.
+        type[BaseModel]: Generated Pydantic model class.
     """
-    fields = {}
+    fields: dict[str, Any] = {}
 
     if "properties" in dictionary:
         for field_name, field_data in dictionary.get("properties", {}).items():
@@ -1277,7 +1275,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
                     if items != {}:
                         array = {"properties": items}
                         array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
-                        fields[field_name] = (List[array_type], ...)
+                        fields[field_name] = (List[array_type], ...)  # type: ignore[valid-type]
                     else:
                         fields[field_name] = (list, ...)
                 elif field_type == "object":

From 5eaf9964fc797d4585c214db32a463d557f3ed33 Mon Sep 17 00:00:00 2001
From: l3utterfly <gc.pthzfoldr@gmail.com>
Date: Fri, 26 Jan 2024 05:06:22 +0900
Subject: [PATCH 126/138] llama : dynamic temperature sampling (#4972)

* implemented dynamic temperature sampling from koboldcpp

* removed trailing whitespace

* removed unused temp parameter in llama_sample_entropy

* exposed exponent_val in dynamic temp sampler

* added debug check for printf statements

* use nullptr in llama_sample_softmax call during llama_sample_entropy

this avoids counting the time taken stats twice

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* return earlier if there is only 1 candiate (i.e. max_entropy == 0)

* reformat 't' case in llama_sample_queue

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* check for one or zero candidates case in llama_sample_entropy

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
---
 common/sampling.cpp | 12 +++++++-
 common/sampling.h   |  2 ++
 llama.cpp           | 67 +++++++++++++++++++++++++++++++++++++++++++++
 llama.h             |  8 ++++++
 4 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index dd1ffeb1b..efd7eab6e 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -129,6 +129,8 @@ static void sampler_queue(
     const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
 
     const float         temp              = params.temp;
+    const float         dynatemp_range    = params.dynatemp_range;
+    const float         dynatemp_exponent = params.dynatemp_exponent;
     const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
     const float         top_p             = params.top_p;
     const float         min_p             = params.min_p;
@@ -143,7 +145,15 @@ static void sampler_queue(
             case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
             case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
             case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case 't': llama_sample_temp     (ctx_main, &cur_p, temp); break;
+            case 't':
+                if (dynatemp_range > 0) {
+                    float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
+                    float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
+                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
+                } else {
+                    llama_sample_temp(ctx_main, &cur_p, temp);
+                }
+                break;
             default : break;
         }
     }
diff --git a/common/sampling.h b/common/sampling.h
index 2ee180376..88899c094 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -18,6 +18,8 @@ typedef struct llama_sampling_params {
     float       tfs_z                 = 1.00f;    // 1.0 = disabled
     float       typical_p             = 1.00f;    // 1.0 = disabled
     float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
+    float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
     int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
     float       penalty_repeat        = 1.10f;    // 1.0 = disabled
     float       penalty_freq          = 0.00f;    // 0.0 = disabled
diff --git a/llama.cpp b/llama.cpp
index 6a7506e85..823d42d7f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8151,6 +8151,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
     }
 }
 
+void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // no need to do anything if there is only one (or zero) candidates
+    if(candidates_p->size <= 1) {
+        return;
+    }
+
+    // Calculate maximum possible entropy
+    float max_entropy = -logf(1.0f / candidates_p->size);
+
+    llama_sample_softmax(nullptr, candidates_p);
+
+    // Calculate entropy of the softmax probabilities
+    float entropy = 0.0f;
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        float prob = candidates_p->data[i].p;
+        if (prob > 0.0f) { // Ensure no log(0)
+            entropy -= prob * logf(prob);
+        }
+    }
+
+    // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above)
+    float normalized_entropy = entropy / max_entropy;
+
+    // Map the normalized entropy to the desired temperature range using the power function
+    float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
+
+#ifdef DEBUG
+    LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
+    LLAMA_LOG_INFO("Entropy: %f\n", entropy);
+    LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
+    LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
+    LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
+    LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
+#endif
+
+    // Apply the dynamically calculated temperature scaling
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        candidates_p->data[i].logit /= dyn_temp;
+    }
+
+    // Re-compute softmax probabilities after scaling logits with dynamic temperature
+    double max_l_double = candidates_p->data[0].logit;
+    double cum_sum_double = 0.0;
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        double p = exp(candidates_p->data[i].logit - max_l_double);
+        candidates_p->data[i].p = p; // Store the scaled probability
+        cum_sum_double += p;
+    }
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
+    }
+
+#ifdef DEBUG
+    // Print the updated top 25 probabilities after temperature scaling
+    LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
+    for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
+        LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
+    }
+#endif
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
 void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
     const int64_t t_start_sample_us = ggml_time_us();
 
diff --git a/llama.h b/llama.h
index bb6054557..7b3634aa6 100644
--- a/llama.h
+++ b/llama.h
@@ -775,6 +775,14 @@ extern "C" {
                            float   p,
                           size_t   min_keep);
 
+    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
+    LLAMA_API void llama_sample_entropy(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates_p,
+                           float   min_temp,
+                           float   max_temp,
+                           float   exponent_val);
+
     LLAMA_API void llama_sample_temp(
             struct llama_context * ctx,
           llama_token_data_array * candidates,

From fe54033b69b83164cabb5f3ed92dc0ff7ea47605 Mon Sep 17 00:00:00 2001
From: XiaotaoChen <chenxiaotao1234@gmail.com>
Date: Fri, 26 Jan 2024 04:14:32 +0800
Subject: [PATCH 127/138] readme : add MobileVLM 1.7B/3B to the supported
 models list (#5107)

Co-authored-by: Chenxiaotao03 <chenxiaotao03@meituan.com>
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index cbfba01bc..c2a33022c 100644
--- a/README.md
+++ b/README.md
@@ -112,6 +112,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
 - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
+- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 
 
 **Bindings:**

From 1182cf4d4f6ee383b92695c2e3fe438086dcdba7 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Fri, 26 Jan 2024 09:14:39 +0200
Subject: [PATCH 128/138] Another bucket sort (#5109)

* Initial bucket sort

* Bucket sort: slightly better version

* Bucket sort: another minor improvement

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 llama.cpp | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 823d42d7f..b03b67e16 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7956,10 +7956,57 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
         auto comp = [](const llama_token_data & a, const llama_token_data & b) {
             return a.logit > b.logit;
         };
-        if (k == (int) candidates->size) {
-            std::sort(candidates->data, candidates->data + candidates->size, comp);
-        } else {
+        if (k <= 128) {
             std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
+        } else {
+            constexpr int   nbuckets     = 128;
+            constexpr float bucket_low   = -10.0f;
+            constexpr float bucket_high  =  10.0f;
+            constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
+            constexpr float bucker_inter = -bucket_low * bucket_scale;
+
+            std::vector<int> bucket_idx(candidates->size);
+            std::vector<int> histo(nbuckets, 0);
+
+            for (int i = 0; i < (int)candidates->size; ++i) {
+                const float val = candidates->data[i].logit;
+                int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+                ib = std::max(0, std::min(nbuckets-1, ib));
+                bucket_idx[i] = ib;
+                ++histo[ib];
+            }
+            int nhave = 0;
+            int ib = nbuckets - 1;
+            for ( ; ib >= 0; --ib) {
+                nhave += histo[ib];
+                if (nhave >= k) break;
+            }
+            std::vector<llama_token_data> tmp_tokens(nhave);
+            auto ptr = tmp_tokens.data();
+            std::vector<llama_token_data*> bucket_ptrs;
+            bucket_ptrs.reserve(nbuckets - ib);
+            for (int j = nbuckets - 1; j >= ib; --j) {
+                bucket_ptrs.push_back(ptr);
+                ptr += histo[j];
+            }
+            for (int i = 0; i < (int)candidates->size; ++i) {
+                int j = bucket_idx[i];
+                if (j >= ib) {
+                    *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
+                }
+            }
+
+            ptr = tmp_tokens.data();
+            int ndone = 0;
+            for (int j = nbuckets-1; j > ib; --j) {
+                std::sort(ptr, ptr + histo[j], comp);
+                ptr += histo[j];
+                ndone += histo[j];
+            }
+            std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
+
+            std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
+
         }
         candidates->sorted = true;
     }

From aad0b01d7380a7cdfe0dd42307b18c7b6bac9575 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 26 Jan 2024 10:52:33 +0200
Subject: [PATCH 129/138] readme : update hot topics

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c2a33022c..76e48ce8a 100644
--- a/README.md
+++ b/README.md
@@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
+- ⚠️ Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
 - New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
 - Collecting Apple Silicon performance stats:
   - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
   - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
-- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
 
 ----

From 38b431de232d1b736b5af19b8c7d72f7075a70bc Mon Sep 17 00:00:00 2001
From: Riceball LEE <snowyu.lee@gmail.com>
Date: Fri, 26 Jan 2024 17:10:28 +0800
Subject: [PATCH 130/138] gguf : fix "general.alignment" type in gguf_reader.py
 (#5136)

---
 gguf-py/gguf/gguf_reader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index 8682765ed..5b6d4ba6b 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -107,7 +107,7 @@ class GGUFReader:
         offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
         new_align = self.fields.get('general.alignment')
         if new_align is not None:
-            if new_align.types != [GGUFValueType.UINT64]:
+            if new_align.types != [GGUFValueType.UINT32]:
                 raise ValueError('Bad type for general.alignment field')
             self.alignment = new_align.parts[-1][0]
         padding = offs % self.alignment

From 6dd3c28c9cd1ef74b49d79f47d668759346a3c6c Mon Sep 17 00:00:00 2001
From: Paul Tsochantaris <ptsochantaris@icloud.com>
Date: Fri, 26 Jan 2024 12:16:07 +0000
Subject: [PATCH 131/138] metal : remove unused `n_buffers` and `buffers`
 (#5129)

---
 ggml-metal.m | 73 ++++++++++++----------------------------------------
 1 file changed, 16 insertions(+), 57 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 60fef1a19..ab3c84f7f 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -26,15 +26,6 @@
 
 #define GGML_METAL_MAX_KERNELS 256
 
-struct ggml_metal_buffer {
-    const char * name;
-
-    void   * data;
-    size_t   size;
-
-    id<MTLBuffer> metal;
-};
-
 struct ggml_metal_kernel {
     id<MTLFunction>             function;
     id<MTLComputePipelineState> pipeline;
@@ -172,9 +163,6 @@ struct ggml_metal_context {
 
     dispatch_queue_t d_queue;
 
-    int n_buffers;
-    struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
-
     struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
 
     bool support_simdgroup_reduction;
@@ -242,24 +230,20 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     // Show all the Metal device instances in the system
     NSArray * devices = MTLCopyAllDevices();
     for (id<MTLDevice> device in devices) {
-        NSString * s = [device name];
-        GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
+        GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
     }
     [devices release]; // since it was created by a *Copy* C method
 #endif
 
     // Pick and show default Metal device
     id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-    NSString * s = [device name];
-    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
+    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     // Configure context
     struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
     ctx->device = device;
     ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
     ctx->queue  = [ctx->device newCommandQueue];
-    ctx->n_buffers = 0;
-
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
     // load library
@@ -534,10 +518,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 static void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 
-    for (int i = 0; i < ctx->n_buffers; ++i) {
-        [ctx->buffers[i].metal release];
-    }
-
     for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
         if (ctx->kernels[i].pipeline) {
             [ctx->kernels[i].pipeline release];
@@ -580,51 +560,30 @@ struct ggml_backend_metal_buffer_context {
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
 //
-static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
+static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
     //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
     const int64_t tsize = ggml_nbytes(t);
 
     ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
 
-    // compatibility with ggml-backend
-    if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
-        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
-
-        // find the view that contains the tensor fully
-        for (int i = 0; i < buf_ctx->n_buffers; ++i) {
-            const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
-
-            //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
-            if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
-                *offs = (size_t) ioffs;
-
-                //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
-
-                return buf_ctx->buffers[i].metal;
-            }
-        }
-
-        GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
-
-        return nil;
-    }
+    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
 
     // find the view that contains the tensor fully
-    for (int i = 0; i < ctx->n_buffers; ++i) {
-        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
+    for (int i = 0; i < buf_ctx->n_buffers; ++i) {
+        const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
 
-        //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
-        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
+        //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
             *offs = (size_t) ioffs;
 
-            //GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
+            //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
 
-            return ctx->buffers[i].metal;
+            return buf_ctx->buffers[i].metal;
         }
     }
 
-    GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);
+    GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
 
     return nil;
 }
@@ -817,9 +776,9 @@ static bool ggml_metal_graph_compute(
             const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
             const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
 
-            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
-            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
-            id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;
+            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
+            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
+            id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
 
             //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
             //if (src0) {
@@ -1601,7 +1560,7 @@ static bool ggml_metal_graph_compute(
                                 struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
 
                                 size_t offs_src_cur = 0;
-                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
+                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
 
                                 [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
                             }
@@ -1746,7 +1705,7 @@ static bool ggml_metal_graph_compute(
                                 struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
 
                                 size_t offs_src_cur = 0;
-                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
+                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
 
                                 [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
                             }

From 413e7b0559f922bd4de5e9eec548829d111651b1 Mon Sep 17 00:00:00 2001
From: crasm <crasm@git.vczf.net>
Date: Fri, 26 Jan 2024 07:18:00 -0500
Subject: [PATCH 132/138] ci : add model tests + script wrapper (#4586)

* scripts : add lib.sh and lib_test.sh

* scripts : stub out new ci-run.sh script

* scripts : switch to PascalCase for functions

This looks a little odd at first, but I find it very useful as a
convention to know if a command is part of our code vs a builtin.

* scripts : add some fancy conversion from snake_case to PascalCase

* Add venv to ci/run.sh

* Revert scripts work

* scripts : add wrapper script for local use of ci/run.sh

* Simplify .gitignore for tests, clang-tidy fixes

* Label all ctest tests

* ci : ctest uses -L main

* Attempt at writing ctest_with_model

* Update test-model-load-cancel

* ci : add ctest_with_model for debug and release

ggml-ci

* Fix gg_get_model function

ggml-ci

* got stuck on CMake

* Add get_model.cpp to tests/CMakeLists.txt

ggml-ci

* Fix README.md output for ctest_with_model

ggml-ci

* workflows : use `-L main` for all ctest

ggml-ci

* Fixes

* GG_RUN_CTEST_MODELFILE => LLAMACPP_TESTMODELFILE
* Always show warning rather than failing if model file variable is not
  set

* scripts : update usage text for ci-run.sh
---
 .github/workflows/build.yml      | 12 ++---
 .gitignore                       | 19 +-------
 Makefile                         |  7 ++-
 ci/run.sh                        | 81 +++++++++++++++++++++++++++-----
 scripts/ci-run.sh                | 50 ++++++++++++++++++++
 tests/.gitignore                 |  2 +
 tests/CMakeLists.txt             | 14 ++++--
 tests/get-model.cpp              | 21 +++++++++
 tests/get-model.h                |  2 +
 tests/test-autorelease.cpp       | 12 ++---
 tests/test-model-load-cancel.cpp | 27 +++++++++++
 11 files changed, 199 insertions(+), 48 deletions(-)
 create mode 100755 scripts/ci-run.sh
 create mode 100644 tests/.gitignore
 create mode 100644 tests/get-model.cpp
 create mode 100644 tests/get-model.h
 create mode 100644 tests/test-model-load-cancel.cpp

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c3aa6f992..d22a041a6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -72,7 +72,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest --verbose --timeout 900
+          ctest -L main --verbose --timeout 900
 
   ubuntu-latest-cmake-sanitizer:
     runs-on: ubuntu-latest
@@ -107,7 +107,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest --verbose --timeout 900
+          ctest -L main --verbose --timeout 900
 
   ubuntu-latest-cmake-mpi:
     runs-on: ubuntu-latest
@@ -141,7 +141,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest --verbose
+          ctest -L main --verbose
 
   # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
   #       how to debug it.
@@ -202,7 +202,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest --verbose --timeout 900
+          ctest -L main --verbose --timeout 900
 
   macOS-latest-cmake-ios:
     runs-on: macos-latest
@@ -394,7 +394,7 @@ jobs:
         if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
         run: |
           cd build
-          ctest -C Release --verbose --timeout 900
+          ctest -L main -C Release --verbose --timeout 900
 
       - name: Test (Intel SDE)
         id: cmake_test_sde
@@ -406,7 +406,7 @@ jobs:
           7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
           $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
           cd build
-          & $sde -future -- ctest -C Release --verbose --timeout 900
+          & $sde -future -- ctest -L main -C Release --verbose --timeout 900
 
       - name: Determine tag name
         id: tag
diff --git a/.gitignore b/.gitignore
index 5ab81445d..cb0069bfb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,7 +27,7 @@
 lcov-report/
 gcovr-report/
 
-build*/
+build*
 out/
 tmp/
 
@@ -89,20 +89,3 @@ examples/jeopardy/results.txt
 
 poetry.lock
 poetry.toml
-
-# Test binaries
-/tests/test-grammar-parser
-/tests/test-llama-grammar
-/tests/test-double-float
-/tests/test-grad0
-/tests/test-opt
-/tests/test-quantize-fns
-/tests/test-quantize-perf
-/tests/test-sampling
-/tests/test-tokenizer-0-llama
-/tests/test-tokenizer-0-falcon
-/tests/test-tokenizer-1-llama
-/tests/test-tokenizer-1-bpe
-/tests/test-rope
-/tests/test-backend-ops
-/tests/test-autorelease
diff --git a/Makefile b/Makefile
index a8658a596..82c89e87a 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
-	tests/test-backend-ops tests/test-autorelease
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -748,5 +748,8 @@ tests/test-c.o: tests/test-c.c llama.h
 tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
diff --git a/ci/run.sh b/ci/run.sh
index 791b17a19..2427e55a2 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -22,9 +22,9 @@ mkdir -p "$2"
 OUT=$(realpath "$1")
 MNT=$(realpath "$2")
 
-rm -v $OUT/*.log
-rm -v $OUT/*.exit
-rm -v $OUT/*.md
+rm -f "$OUT/*.log"
+rm -f "$OUT/*.exit"
+rm -f "$OUT/*.md"
 
 sd=`dirname $0`
 cd $sd/../
@@ -94,7 +94,7 @@ function gg_run_ctest_debug {
     (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time make -j                                          ) 2>&1 | tee -a $OUT/${ci}-make.log
 
-    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
 
     set +e
 }
@@ -123,9 +123,9 @@ function gg_run_ctest_release {
     (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     if [ -z ${GG_BUILD_LOW_PERF} ]; then
-        (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
     else
-        (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
     fi
 
     set +e
@@ -141,6 +141,61 @@ function gg_sum_ctest_release {
     gg_printf '```\n'
 }
 
+function gg_get_model {
+    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
+    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
+    if [[ -s $gguf_3b ]]; then
+        echo -n "$gguf_3b"
+    elif [[ -s $gguf_7b ]]; then
+        echo -n "$gguf_7b"
+    else
+        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
+        exit 1
+    fi
+}
+
+function gg_run_ctest_with_model_debug {
+    cd ${SRC}
+
+    local model; model=$(gg_get_model)
+    cd build-ci-debug
+    set -e
+    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    set +e
+    cd ..
+}
+
+function gg_run_ctest_with_model_release {
+    cd ${SRC}
+
+    local model; model=$(gg_get_model)
+    cd build-ci-release
+    set -e
+    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    set +e
+    cd ..
+}
+
+function gg_sum_ctest_with_model_debug {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs ctest with model files in debug mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+    gg_printf '```\n'
+}
+
+function gg_sum_ctest_with_model_release {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs ctest with model files in release mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+    gg_printf '```\n'
+}
+
 # open_llama_3b_v2
 
 function gg_run_open_llama_3b_v2 {
@@ -183,8 +238,6 @@ function gg_run_open_llama_3b_v2 {
 
     wiki_test_60="${path_wiki}/wiki.test-60.raw"
 
-    ./bin/test-autorelease ${model_f16}
-
     ./bin/quantize ${model_f16} ${model_q8_0} q8_0
     ./bin/quantize ${model_f16} ${model_q4_0} q4_0
     ./bin/quantize ${model_f16} ${model_q4_1} q4_1
@@ -507,14 +560,18 @@ function gg_sum_open_llama_7b_v2 {
 ## main
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
     rm -rf ${SRC}/models-mnt
-
     mnt_models=${MNT}/models
     mkdir -p ${mnt_models}
     ln -sfn ${mnt_models} ${SRC}/models-mnt
 
-    python3 -m pip install -r ${SRC}/requirements.txt
-    python3 -m pip install --editable gguf-py
+    # Create a fresh python3 venv and enter it
+    python3 -m venv "$MNT/venv"
+    source "$MNT/venv/bin/activate"
+
+    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
+    pip install --editable gguf-py --disable-pip-version-check
 fi
 
 ret=0
@@ -529,6 +586,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
         else
             test $ret -eq 0 && gg_run open_llama_7b_v2
         fi
+        test $ret -eq 0 && gg_run ctest_with_model_debug
+        test $ret -eq 0 && gg_run ctest_with_model_release
     fi
 fi
 
diff --git a/scripts/ci-run.sh b/scripts/ci-run.sh
new file mode 100755
index 000000000..06b5d9c6e
--- /dev/null
+++ b/scripts/ci-run.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+set -euo pipefail
+this=$(realpath "$0"); readonly this
+cd "$(dirname "$this")"
+shellcheck "$this"
+
+if (( $# != 1 && $# != 2  )); then
+    cat >&2 <<'EOF'
+usage:
+    ci-run.sh <tmp_dir> [<cache_dir>]
+
+This script wraps ci/run.sh:
+* If <tmp_dir> is a ramdisk, you can reduce writes to your SSD. If <tmp_dir> is not a ramdisk, keep in mind that total writes will increase by the size of <cache_dir>.
+    (openllama_3b_v2: quantized models are about 30GB)
+* Persistent model and data files are synced to and from <cache_dir>,
+    excluding generated .gguf files.
+    (openllama_3b_v2: persistent files are about 6.6GB)
+* <cache_dir> defaults to  ~/.cache/llama.cpp
+EOF
+    exit 1
+fi
+
+cd .. # => llama.cpp repo root
+
+tmp="$1"
+mkdir -p "$tmp"
+tmp=$(realpath "$tmp")
+echo >&2 "Using tmp=$tmp"
+
+cache="${2-$HOME/.cache/llama.cpp}"
+mkdir -p "$cache"
+cache=$(realpath "$cache")
+echo >&2 "Using cache=$cache"
+
+_sync() {
+    local from="$1"; shift
+    local to="$1"; shift
+
+    echo >&2 "Syncing from $from to $to"
+    mkdir -p "$from" "$to"
+    rsync -a "$from" "$to" --delete-during "$@"
+}
+
+_sync "$(realpath .)/" "$tmp/llama.cpp"
+_sync "$cache/ci-mnt/models/" "$tmp/llama.cpp/ci-mnt/models/"
+
+cd "$tmp/llama.cpp"
+bash ci/run.sh ci-out ci-mnt
+
+_sync 'ci-mnt/models/' "$cache/ci-mnt/models/" --exclude='*.gguf' -P
diff --git a/tests/.gitignore b/tests/.gitignore
new file mode 100644
index 000000000..59be43b99
--- /dev/null
+++ b/tests/.gitignore
@@ -0,0 +1,2 @@
+*
+!*.*
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d7aaab843..3e40a78cd 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,6 +1,6 @@
 function(llama_build_executable source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
-    add_executable(${TEST_TARGET} ${source})
+    add_executable(${TEST_TARGET} ${source} get-model.cpp)
     install(TARGETS ${TEST_TARGET} RUNTIME)
     target_link_libraries(${TEST_TARGET} PRIVATE common)
 endfunction()
@@ -8,14 +8,20 @@ endfunction()
 function(llama_test_executable name source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
+    set_property(TEST ${name} PROPERTY LABELS "main")
 endfunction()
 
 function(llama_build_and_test_executable source)
+    llama_build_and_test_executable_with_label(${source} "main")
+endfunction()
+
+function(llama_build_and_test_executable_with_label source label)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
-    add_executable(${TEST_TARGET} ${source})
+    add_executable(${TEST_TARGET} ${source} get-model.cpp)
     install(TARGETS ${TEST_TARGET} RUNTIME)
     target_link_libraries(${TEST_TARGET} PRIVATE common)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
+    set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label})
 endfunction()
 
 # llama_build_and_test_executable(test-double-float.cpp) # SLOW
@@ -49,10 +55,12 @@ llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp)
 # llama_build_and_test_executable(test-opt.cpp) # SLOW
 llama_build_and_test_executable(test-backend-ops.cpp)
-llama_build_and_test_executable(test-autorelease.cpp)
 
 llama_build_and_test_executable(test-rope.cpp)
 
+llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model")
+llama_build_and_test_executable_with_label(test-autorelease.cpp "model")
+
 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)
 add_executable(${TEST_TARGET} test-c.c)
diff --git a/tests/get-model.cpp b/tests/get-model.cpp
new file mode 100644
index 000000000..4edb685f0
--- /dev/null
+++ b/tests/get-model.cpp
@@ -0,0 +1,21 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "get-model.h"
+
+char * get_model_or_exit(int argc, char *argv[]) {
+    char * model_path;
+    if (argc > 1) {
+        model_path = argv[1];
+
+    } else {
+        model_path = getenv("LLAMACPP_TEST_MODELFILE");
+        if (!model_path || strlen(model_path) == 0) {
+            fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test. Set LLAMACPP_TEST_MODELFILE=<gguf_model_path> to silence this warning and run this test.\n\033[0m");
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    return model_path;
+}
diff --git a/tests/get-model.h b/tests/get-model.h
new file mode 100644
index 000000000..81a3a0fef
--- /dev/null
+++ b/tests/get-model.h
@@ -0,0 +1,2 @@
+#pragma once
+char * get_model_or_exit(int, char*[]);
diff --git a/tests/test-autorelease.cpp b/tests/test-autorelease.cpp
index 289c6ba6c..36a23c0bb 100644
--- a/tests/test-autorelease.cpp
+++ b/tests/test-autorelease.cpp
@@ -5,19 +5,15 @@
 #include <thread>
 
 #include "llama.h"
+#include "get-model.h"
 
 // This creates a new context inside a pthread and then tries to exit cleanly.
 int main(int argc, char ** argv) {
-    if (argc < 2) {
-        printf("Usage: %s model.gguf\n", argv[0]);
-        return 0; // intentionally return success
-    }
+    auto * model_path = get_model_or_exit(argc, argv);
 
-    const std::string fname = argv[1];
-
-    std::thread([&fname]() {
+    std::thread([&model_path]() {
         llama_backend_init(false);
-        auto * model = llama_load_model_from_file(fname.c_str(), llama_model_default_params());
+        auto * model = llama_load_model_from_file(model_path, llama_model_default_params());
         auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
         llama_free(ctx);
         llama_free_model(model);
diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp
new file mode 100644
index 000000000..7ea4bbacc
--- /dev/null
+++ b/tests/test-model-load-cancel.cpp
@@ -0,0 +1,27 @@
+#include "llama.h"
+#include "get-model.h"
+
+#include <cstdlib>
+
+int main(int argc, char *argv[] ) {
+    auto * model_path = get_model_or_exit(argc, argv);
+    auto * file = fopen(model_path, "r");
+    if (file == nullptr) {
+        fprintf(stderr, "no model at '%s' found\n", model_path);
+        return EXIT_FAILURE;
+    }
+
+    fprintf(stderr, "using '%s'\n", model_path);
+    fclose(file);
+
+    llama_backend_init(false);
+    auto params = llama_model_params{};
+    params.use_mmap = false;
+    params.progress_callback = [](float progress, void * ctx){
+        (void) ctx;
+        return progress > 0.50;
+    };
+    auto * model = llama_load_model_from_file(model_path, params);
+    llama_backend_free();
+    return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE;
+}

From 48c857aa10aea73210a4a72da3f1a6f99269e75d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Fri, 26 Jan 2024 13:42:20 +0100
Subject: [PATCH 133/138] server : refactored the task processing logic (#5065)

* server: add llama_server_queue struct

* server: add llama_server_response_event

* server: add comments

* server: move all mutexes away from server.cpp

* server: correct multitask response

* server: only add back deferred tasks when one slot is available

* server: fix a race condition cause by "request_completion"
---
 Makefile                       |   2 +-
 examples/server/CMakeLists.txt |   2 +-
 examples/server/oai.hpp        | 208 ++++++++
 examples/server/server.cpp     | 849 ++++++---------------------------
 examples/server/utils.hpp      | 507 ++++++++++++++++++++
 5 files changed, 876 insertions(+), 692 deletions(-)
 create mode 100644 examples/server/oai.hpp
 create mode 100644 examples/server/utils.hpp

diff --git a/Makefile b/Makefile
index 82c89e87a..b8858b412 100644
--- a/Makefile
+++ b/Makefile
@@ -619,7 +619,7 @@ embedding: examples/embedding/embedding.cpp                   ggml.o llama.o $(C
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index 81709e448..cc13b2d63 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp json.hpp httplib.h)
+add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
diff --git a/examples/server/oai.hpp b/examples/server/oai.hpp
new file mode 100644
index 000000000..bc5db6eef
--- /dev/null
+++ b/examples/server/oai.hpp
@@ -0,0 +1,208 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <set>
+#include <mutex>
+#include <condition_variable>
+#include <unordered_map>
+
+#include "json.hpp"
+#include "utils.hpp"
+
+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
+
+using json = nlohmann::json;
+
+inline static json oaicompat_completion_params_parse(
+    const json &body /* openai api json semantics */)
+{
+    json llama_params;
+
+    llama_params["__oaicompat"] = true;
+
+    // Map OpenAI parameters to llama.cpp parameters
+    //
+    // For parameters that are defined by the OpenAI documentation (e.g.
+    // temperature), we explicitly specify OpenAI's intended default; we
+    // need to do that because sometimes OpenAI disagrees with llama.cpp
+    //
+    // https://platform.openai.com/docs/api-reference/chat/create
+    llama_sampling_params default_sparams;
+    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
+    llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
+    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
+    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
+    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
+    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
+    llama_params["logit_bias"]        = json_value(body, "logit_bias",json::object());
+    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
+    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
+    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
+    llama_params["stream"]            = json_value(body, "stream", false);
+    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
+    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
+    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
+    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
+    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
+    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
+    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
+    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);
+
+    if (body.count("grammar") != 0) {
+        llama_params["grammar"] = json_value(body, "grammar", json::object());
+    }
+
+    // Handle 'stop' field
+    if (body.contains("stop") && body["stop"].is_string()) {
+        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
+    } else {
+        llama_params["stop"] = json_value(body, "stop", json::array());
+    }
+
+    // Ensure there is ChatML-specific end sequence among stop words
+    llama_params["stop"].push_back("<|im_end|>");
+
+    return llama_params;
+}
+
+inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
+{
+    json result = response.result_json;
+
+    bool stopped_word        = result.count("stopped_word") != 0;
+    bool stopped_eos         = json_value(result, "stopped_eos", false);
+    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
+    std::string content      = json_value(result, "content", std::string(""));
+
+    std::string finish_reason = "length";
+    if (stopped_word || stopped_eos) {
+        finish_reason = "stop";
+    }
+
+    json choices =
+        streaming ? json::array({json{{"finish_reason", finish_reason},
+                                        {"index", 0},
+                                        {"delta", json::object()}}})
+                  : json::array({json{{"finish_reason", finish_reason},
+                                        {"index", 0},
+                                        {"message", json{{"content", content},
+                                                         {"role", "assistant"}}}}});
+
+    std::time_t t = std::time(0);
+
+    json res =
+        json{{"choices", choices},
+            {"created", t},
+            {"model",
+                json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+            {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
+            {"usage",
+                json{{"completion_tokens", num_tokens_predicted},
+                     {"prompt_tokens",     num_prompt_tokens},
+                     {"total_tokens",      num_tokens_predicted + num_prompt_tokens}}},
+            {"id", gen_chatcmplid()}};
+
+    if (server_verbose) {
+        res["__verbose"] = result;
+    }
+
+    if (result.contains("completion_probabilities")) {
+        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
+    }
+
+    return res;
+}
+
+// return value is vector as there is one case where we might need to generate two responses
+inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
+    json result = response.result_json;
+
+    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
+        return std::vector<json>({response.result_json});
+    }
+
+    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
+    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+
+    bool stopped_word   = json_value(result, "stopped_word", false);
+    bool stopped_eos    = json_value(result, "stopped_eos", false);
+    bool stopped_limit  = json_value(result, "stopped_limit", false);
+    std::string content = json_value(result, "content", std::string(""));
+
+    std::string finish_reason;
+    if (stopped_word || stopped_eos) {
+        finish_reason = "stop";
+    }
+    if (stopped_limit) {
+        finish_reason = "length";
+    }
+
+    std::time_t t = std::time(0);
+
+    json choices;
+
+    if (!finish_reason.empty()) {
+        choices = json::array({json{{"finish_reason", finish_reason},
+                                    {"index", 0},
+                                    {"delta", json::object()}}});
+    } else {
+        if (first) {
+            if (content.empty()) {
+                choices = json::array({json{{"finish_reason", nullptr},
+                                            {"index", 0},
+                                            {"delta", json{{"role", "assistant"}}}}});
+            } else {
+                // We have to send this as two updates to conform to openai behavior
+                json initial_ret = json{{"choices", json::array({json{
+                                        {"finish_reason", nullptr},
+                                        {"index", 0},
+                                        {"delta", json{
+                                            {"role", "assistant"}
+                                        }}}})},
+                            {"created", t},
+                            {"id", gen_chatcmplid()},
+                            {"model", modelname},
+                            {"object", "chat.completion.chunk"}};
+
+                json second_ret = json{
+                            {"choices", json::array({json{{"finish_reason", nullptr},
+                                                            {"index", 0},
+                                                            {"delta", json{
+                                                            {"content", content}}}
+                                                            }})},
+                            {"created", t},
+                            {"id", gen_chatcmplid()},
+                            {"model", modelname},
+                            {"object", "chat.completion.chunk"}};
+
+                return std::vector<json>({initial_ret, second_ret});
+            }
+        } else {
+            // Some idiosyncrasy in task processing logic makes several trailing calls
+            // with empty content, we ignore these at the calee site.
+            if (content.empty()) {
+                return std::vector<json>({json::object()});
+            }
+
+            choices = json::array({json{
+                {"finish_reason", nullptr},
+                {"index", 0},
+                {"delta",
+                json{
+                    {"content", content},
+                }},
+            }});
+        }
+    }
+
+    json ret = json{{"choices", choices},
+                    {"created", t},
+                    {"id", gen_chatcmplid()},
+                    {"model", modelname},
+                    {"object", "chat.completion.chunk"}};
+
+    return std::vector<json>({ret});
+}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0462fbd24..392836132 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,6 +1,8 @@
 #include "common.h"
 #include "llama.h"
 #include "grammar-parser.h"
+#include "utils.hpp"
+#include "oai.hpp"
 
 #include "../llava/clip.h"
 
@@ -23,17 +25,10 @@
 
 #include <cstddef>
 #include <thread>
-#include <mutex>
 #include <chrono>
 #include <condition_variable>
 #include <atomic>
 
-#ifndef SERVER_VERBOSE
-#define SERVER_VERBOSE 1
-#endif
-
-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
-
 using json = nlohmann::json;
 
 struct server_params
@@ -46,197 +41,7 @@ struct server_params
     int32_t write_timeout = 600;
 };
 
-static bool server_verbose = false;
-
-#if SERVER_VERBOSE != 1
-#define LOG_VERBOSE(MSG, ...)
-#else
-#define LOG_VERBOSE(MSG, ...)                                            \
-    do                                                                   \
-    {                                                                    \
-        if (server_verbose)                                              \
-        {                                                                \
-            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
-        }                                                                \
-    } while (0)
-#endif
-
-#define LOG_ERROR(  MSG, ...) server_log("ERROR",   __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_INFO(   MSG, ...) server_log("INFO",    __func__, __LINE__, MSG, __VA_ARGS__)
-
-json oaicompat_completion_params_parse(const json &body);
-std::string format_chatml(std::vector<json> messages);
-
-
-//
-// base64 utils (TODO: move to common in the future)
-//
-
-static const std::string base64_chars =
-             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-             "abcdefghijklmnopqrstuvwxyz"
-             "0123456789+/";
-
-static inline bool is_base64(uint8_t c)
-{
-    return (isalnum(c) || (c == '+') || (c == '/'));
-}
-
-static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
-{
-    int i = 0;
-    int j = 0;
-    int in_ = 0;
-
-    int in_len = encoded_string.size();
-
-    uint8_t char_array_4[4];
-    uint8_t char_array_3[3];
-
-    std::vector<uint8_t> ret;
-
-    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
-    {
-        char_array_4[i++] = encoded_string[in_]; in_++;
-        if (i == 4)
-        {
-            for (i = 0; i <4; i++)
-            {
-                char_array_4[i] = base64_chars.find(char_array_4[i]);
-            }
-
-            char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
-            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
-
-            for (i = 0; (i < 3); i++)
-            {
-                ret.push_back(char_array_3[i]);
-            }
-            i = 0;
-        }
-    }
-
-    if (i)
-    {
-        for (j = i; j <4; j++)
-        {
-            char_array_4[j] = 0;
-        }
-
-        for (j = 0; j <4; j++)
-        {
-            char_array_4[j] = base64_chars.find(char_array_4[j]);
-        }
-
-        char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
-        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
-
-        for (j = 0; (j < i - 1); j++)
-        {
-            ret.push_back(char_array_3[j]);
-        }
-    }
-
-    return ret;
-}
-
-//
-// parallel
-//
-
-enum server_state {
-    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
-    SERVER_STATE_READY,          // Server is ready and model is loaded
-    SERVER_STATE_ERROR           // An error occurred, load_model failed
-};
-
-enum task_type {
-    TASK_TYPE_COMPLETION,
-    TASK_TYPE_CANCEL,
-};
-
-struct task_server {
-    int id;
-    int target_id;
-    task_type type;
-    json data;
-    bool infill_mode = false;
-    bool embedding_mode = false;
-    int multitask_id = -1;
-};
-
-struct task_result {
-    int id;
-    int multitask_id = -1;
-    bool stop;
-    bool error;
-    json result_json;
-};
-
-struct task_multi {
-    int id;
-    std::set<int> subtasks_remaining{};
-    std::vector<task_result> results{};
-};
-
-// TODO: can become bool if we can't find use of more states
-enum slot_state
-{
-    IDLE,
-    PROCESSING,
-};
-
-enum slot_command
-{
-    NONE,
-    LOAD_PROMPT,
-    RELEASE,
-};
-
-struct slot_params
-{
-    bool stream       = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
-
-    uint32_t seed      = -1; // RNG seed
-    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
-    int32_t  n_predict = -1; // new tokens to predict
-
-    std::vector<std::string> antiprompt;
-
-    json input_prefix;
-    json input_suffix;
-};
-
-struct slot_image
-{
-    int32_t id;
-
-    bool request_encode_image = false;
-    float * image_embedding = nullptr;
-    int32_t image_tokens = 0;
-
-    clip_image_u8 * img_data;
-
-    std::string prefix_prompt; // before of this image
-};
-
-// completion token output with probabilities
-struct completion_token_output
-{
-    struct token_prob
-    {
-        llama_token tok;
-        float prob;
-    };
-
-    std::vector<token_prob> probs;
-    llama_token tok;
-    std::string text_to_send;
-};
+bool server_verbose = false;
 
 static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
 {
@@ -292,28 +97,6 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
     return ret;
 }
 
-static void server_log(const char *level, const char *function, int line,
-                       const char *message, const nlohmann::ordered_json &extra)
-{
-    nlohmann::ordered_json log
-    {
-        {"timestamp", time(nullptr)},
-        {"level",     level},
-        {"function",  function},
-        {"line",      line},
-        {"message",   message},
-    };
-
-    if (!extra.empty())
-    {
-        log.merge_patch(extra);
-    }
-
-    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
-    printf("%.*s\n", (int)str.size(), str.data());
-    fflush(stdout);
-}
-
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
@@ -355,15 +138,6 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector<com
     return out;
 }
 
-template <typename T>
-static T json_value(const json &body, const std::string &key, const T &default_value)
-{
-    // Fallback null to default value
-    return body.contains(key) && !body.at(key).is_null()
-        ? body.value(key, default_value)
-        : default_value;
-}
-
 struct llama_client_slot
 {
     int id;
@@ -491,7 +265,7 @@ struct llama_client_slot
     }
 
     void release() {
-        if (state == IDLE || state == PROCESSING)
+        if (state == PROCESSING)
         {
             t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
             command = RELEASE;
@@ -539,7 +313,6 @@ struct llama_server_context
     bool all_slots_are_idle = false;
     bool add_bos_token      = true;
 
-    int32_t id_gen;
     int32_t n_ctx;  // total context for all clients / slots
 
     // system prompt
@@ -554,13 +327,8 @@ struct llama_server_context
     // slots / clients
     std::vector<llama_client_slot> slots;
 
-    std::vector<task_server> queue_tasks;
-    std::vector<task_result> queue_results;
-    std::vector<task_multi>  queue_multitasks;
-    std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
-    std::condition_variable condition_tasks;
-    std::mutex mutex_results;
-    std::condition_variable condition_results;
+    llama_server_queue queue_tasks;
+    llama_server_response queue_results;
 
     ~llama_server_context()
     {
@@ -619,8 +387,6 @@ struct llama_server_context
     }
 
     void initialize() {
-        id_gen = 0;
-
         // create slots
         all_slots_are_idle = true;
 
@@ -1183,39 +949,13 @@ struct llama_server_context
     void send_error(task_server& task, const std::string &error)
     {
         LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
-        std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = task.id;
         res.multitask_id = task.multitask_id;
         res.stop = false;
         res.error = true;
         res.result_json = { { "content", error } };
-        queue_results.push_back(res);
-        condition_results.notify_all();
-    }
-
-    void add_multi_task(int id, std::vector<int>& sub_ids)
-    {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
-        task_multi multi;
-        multi.id = id;
-        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
-        queue_multitasks.push_back(multi);
-        condition_tasks.notify_one();
-    }
-
-    void update_multi_task(int multitask_id, int subtask_id, task_result& result)
-    {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
-        for (auto& multitask : queue_multitasks)
-        {
-            if (multitask.id == multitask_id)
-            {
-                multitask.subtasks_remaining.erase(subtask_id);
-                multitask.results.push_back(result);
-                condition_tasks.notify_one();
-            }
-        }
+        queue_results.send(res);
     }
 
     json get_model_props()
@@ -1261,7 +1001,6 @@ struct llama_server_context
 
     void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
     {
-        std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
         res.multitask_id = slot.multitask_id;
@@ -1296,13 +1035,11 @@ struct llama_server_context
             res.result_json["model"] = slot.oaicompat_model;
         }
 
-        queue_results.push_back(res);
-        condition_results.notify_all();
+        queue_results.send(res);
     }
 
     void send_final_response(llama_client_slot &slot)
     {
-        std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
         res.multitask_id = slot.multitask_id;
@@ -1351,22 +1088,11 @@ struct llama_server_context
             res.result_json["model"] = slot.oaicompat_model;
         }
 
-        queue_results.push_back(res);
-        condition_results.notify_all();
-
-        // done with results, unlock
-        lock.unlock();
-
-        // parent multitask, if any, needs to be updated
-        if (slot.multitask_id != -1)
-        {
-            update_multi_task(slot.multitask_id, slot.task_id, res);
-        }
+        queue_results.send(res);
     }
 
     void send_embedding(llama_client_slot &slot)
     {
-        std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
         res.multitask_id = slot.multitask_id;
@@ -1393,15 +1119,13 @@ struct llama_server_context
                 {"embedding", embedding },
             };
         }
-        queue_results.push_back(res);
-        condition_results.notify_all();
+        queue_results.send(res);
     }
 
-    int request_completion(json data, bool infill, bool embedding, int multitask_id)
+    void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id)
     {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
         task_server task;
-        task.id = id_gen++;
+        task.id = task_id;
         task.target_id = 0;
         task.data = std::move(data);
         task.infill_mode = infill;
@@ -1412,47 +1136,11 @@ struct llama_server_context
         // when a completion task's prompt array is not a singleton, we split it into multiple requests
         if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
         {
-            lock.unlock(); // entering new func scope
-            return split_multiprompt_task(task);
+            split_multiprompt_task(task_id, task);
         }
 
         // otherwise, it's a single-prompt task, we actually queue it
-        queue_tasks.push_back(task);
-        condition_tasks.notify_one();
-        return task.id;
-    }
-
-    task_result next_result(int task_id)
-    {
-        while (true)
-        {
-            std::unique_lock<std::mutex> lock(mutex_results);
-            condition_results.wait(lock, [&]{
-                return !queue_results.empty();
-            });
-
-            for (int i = 0; i < (int) queue_results.size(); i++)
-            {
-                // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
-                if (queue_results[i].multitask_id == task_id)
-                {
-                    update_multi_task(task_id, queue_results[i].id, queue_results[i]);
-                    queue_results.erase(queue_results.begin() + i);
-                    continue;
-                }
-
-                if (queue_results[i].id == task_id)
-                {
-                    assert(queue_results[i].multitask_id == -1);
-                    task_result res = queue_results[i];
-                    queue_results.erase(queue_results.begin() + i);
-                    return res;
-                }
-            }
-        }
-
-        // never reached
-        //return task_result{-1, false, false, {}};
+        queue_tasks.post(task);
     }
 
     // for multiple images processing
@@ -1525,150 +1213,117 @@ struct llama_server_context
 
     void request_cancel(int task_id)
     {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
         task_server task;
-        task.id = id_gen++;
         task.type = TASK_TYPE_CANCEL;
         task.target_id = task_id;
-        queue_tasks.push_back(task);
-        condition_tasks.notify_one();
+        queue_tasks.post(task);
     }
 
-    int split_multiprompt_task(task_server& multiprompt_task)
+    void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
     {
         int prompt_count = multiprompt_task.data.at("prompt").size();
         assert(prompt_count > 1);
 
-        int multitask_id = id_gen++;
+        // generate all the ID for subtask
         std::vector<int> subtask_ids(prompt_count);
         for (int i = 0; i < prompt_count; i++)
+        {
+            subtask_ids[i] = queue_tasks.get_new_id();
+        }
+
+        // queue up the multitask so we can track its subtask progression
+        queue_tasks.add_multitask(multitask_id, subtask_ids);
+
+        // add subtasks
+        for (int i = 0; i < prompt_count; i++)
         {
             json subtask_data = multiprompt_task.data;
             subtask_data["prompt"] = subtask_data["prompt"][i];
 
             // subtasks inherit everything else (infill mode, embedding mode, etc.)
-            subtask_ids[i] = request_completion(subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
+            request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
         }
-
-        // queue up the multitask so we can track its subtask progression
-        add_multi_task(multitask_id, subtask_ids);
-        return multitask_id;
     }
 
-    void process_tasks()
+    void process_single_task(task_server& task)
     {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        std::vector<task_server> deferred_tasks;
-        while (!queue_tasks.empty())
+        switch (task.type)
         {
-            task_server task = queue_tasks.front();
-            queue_tasks.erase(queue_tasks.begin());
-            switch (task.type)
-            {
-                case TASK_TYPE_COMPLETION: {
-                    llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
-                    if (slot == nullptr)
-                    {
-                        // if no slot is available, we defer this task for processing later
-                        deferred_tasks.push_back(task);
-                        break;
-                    }
-
-                    if (task.data.contains("system_prompt"))
-                    {
-                        if (!all_slots_are_idle) {
-                            send_error(task, "system prompt can only be updated when all slots are idle");
-                            break;
-                        }
-                        process_system_prompt_data(task.data["system_prompt"]);
-
-                        // reset cache_tokens for all slots
-                        for (llama_client_slot &slot : slots)
-                        {
-                            slot.cache_tokens.clear();
-                        }
-                    }
-
-                    slot->reset();
-
-                    slot->infill       = task.infill_mode;
-                    slot->embedding    = task.embedding_mode;
-                    slot->task_id      = task.id;
-                    slot->multitask_id = task.multitask_id;
-
-                    if (!launch_slot_with_data(slot, task.data))
-                    {
-                        // send error result
-                        send_error(task, "internal_error");
-                        break;
-                    }
-                } break;
-                case TASK_TYPE_CANCEL: { // release slot linked with the task id
-                    for (auto & slot : slots)
-                    {
-                        if (slot.task_id == task.target_id)
-                        {
-                            slot.release();
-                            break;
-                        }
-                    }
-                } break;
-            }
-        }
-
-        // add all the deferred tasks back the the queue
-        for (task_server &task : deferred_tasks)
-        {
-            queue_tasks.push_back(task);
-        }
-
-        // remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
-        std::vector<task_result> agg_results;
-        auto queue_iterator = queue_multitasks.begin();
-        while (queue_iterator != queue_multitasks.end())
-        {
-            if (queue_iterator->subtasks_remaining.empty())
-            {
-                // all subtasks done == multitask is done
-                task_result aggregate_result;
-                aggregate_result.id = queue_iterator->id;
-                aggregate_result.stop = true;
-                aggregate_result.error = false;
-
-                // collect json results into one json result
-                std::vector<json> result_jsons;
-                for (auto& subres : queue_iterator->results)
+            case TASK_TYPE_COMPLETION: {
+                llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
+                if (slot == nullptr)
                 {
-                    result_jsons.push_back(subres.result_json);
-                    aggregate_result.error = aggregate_result.error && subres.error;
+                    // if no slot is available, we defer this task for processing later
+                    LOG_VERBOSE("no slot is available", {});
+                    queue_tasks.defer(task);
+                    break;
                 }
-                aggregate_result.result_json = json{ "results", result_jsons };
 
+                if (task.data.contains("system_prompt"))
+                {
+                    if (!all_slots_are_idle) {
+                        send_error(task, "system prompt can only be updated when all slots are idle");
+                        break;
+                    }
+                    process_system_prompt_data(task.data["system_prompt"]);
 
-                agg_results.push_back(aggregate_result);
+                    // reset cache_tokens for all slots
+                    for (llama_client_slot &slot : slots)
+                    {
+                        slot.cache_tokens.clear();
+                    }
+                }
 
-                condition_results.notify_all();
+                slot->reset();
 
-                queue_iterator = queue_multitasks.erase(queue_iterator);
-            }
-            else
-            {
-                ++queue_iterator;
-            }
+                slot->infill       = task.infill_mode;
+                slot->embedding    = task.embedding_mode;
+                slot->task_id      = task.id;
+                slot->multitask_id = task.multitask_id;
+
+                if (!launch_slot_with_data(slot, task.data))
+                {
+                    // send error result
+                    send_error(task, "internal_error");
+                    break;
+                }
+            } break;
+            case TASK_TYPE_CANCEL: { // release slot linked with the task id
+                for (auto & slot : slots)
+                {
+                    if (slot.task_id == task.target_id)
+                    {
+                        slot.release();
+                        break;
+                    }
+                }
+            } break;
+            case TASK_TYPE_NEXT_RESPONSE: {
+                // do nothing
+            } break;
         }
+    }
 
-        // done with tasks, unlock
-        lock.unlock();
+    void on_finish_multitask(task_multi& multitask)
+    {
+        // all subtasks done == multitask is done
+        task_result result;
+        result.id = multitask.id;
+        result.stop = true;
+        result.error = false;
 
-        // copy aggregate results of complete multi-tasks to the results queue
-        std::lock_guard<std::mutex> lock_results(mutex_results);
-        queue_results.insert(queue_results.end(), agg_results.begin(), agg_results.end());
+        // collect json results into one json result
+        std::vector<json> result_jsons;
+        for (auto& subres : multitask.results)
+        {
+            result_jsons.push_back(subres.result_json);
+            result.error = result.error && subres.error;
+        }
+        result.result_json = json{ { "results", result_jsons } };
+        queue_results.send(result);
     }
 
     bool update_slots() {
-        // attend tasks
-        process_tasks();
-
         if (system_need_update)
         {
             LOG_TEE("updating system prompt\n");
@@ -1684,10 +1339,12 @@ struct llama_server_context
                 LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
                 kv_cache_clear();
             }
-            std::unique_lock<std::mutex> lock(mutex_tasks);
-            condition_tasks.wait(lock, [&]{
-                return !queue_tasks.empty();
-            });
+            return true;
+        } else {
+            task_server task;
+            task.type = TASK_TYPE_NEXT_RESPONSE;
+            task.target_id = -1;
+            queue_tasks.post(task);
         }
 
         for (llama_client_slot &slot : slots)
@@ -1732,6 +1389,7 @@ struct llama_server_context
                 slot.t_last_used = ggml_time_us();
 
                 LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                queue_tasks.notify_slot_changed();
 
                 continue;
             }
@@ -1997,6 +1655,10 @@ struct llama_server_context
         }
         return true;
     }
+
+    void run_on_all_tasks_finished() {
+        update_slots();
+    }
 };
 
 static void server_print_usage(const char *argv0, const gpt_params &params,
@@ -2541,239 +2203,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
     }
 }
 
-static std::string random_string()
-{
-    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
-
-    std::random_device rd;
-    std::mt19937 generator(rd());
-
-    std::string result(32, ' ');
-
-    for (int i = 0; i < 32; ++i) {
-        result[i] = str[generator() % str.size()];
-    }
-
-    return result;
-}
-
-static std::string gen_chatcmplid()
-{
-    std::stringstream chatcmplid;
-    chatcmplid << "chatcmpl-" << random_string();
-    return chatcmplid.str();
-}
-
-std::string format_chatml(std::vector<json> messages)
-{
-    std::ostringstream chatml_msgs;
-
-    for (auto it = messages.begin(); it != messages.end(); ++it) {
-        chatml_msgs << "<|im_start|>"
-                    << json_value(*it, "role",    std::string("user")) << '\n';
-        chatml_msgs << json_value(*it, "content", std::string(""))
-                    << "<|im_end|>\n";
-    }
-
-    chatml_msgs << "<|im_start|>assistant" << '\n';
-
-    return chatml_msgs.str();
-}
-
 /* llama.cpp completion api semantics */
-json oaicompat_completion_params_parse(
-    const json &body /* openai api json semantics */)
-{
-    json llama_params;
-
-    llama_params["__oaicompat"] = true;
-
-    // Map OpenAI parameters to llama.cpp parameters
-    //
-    // For parameters that are defined by the OpenAI documentation (e.g.
-    // temperature), we explicitly specify OpenAI's intended default; we
-    // need to do that because sometimes OpenAI disagrees with llama.cpp
-    //
-    // https://platform.openai.com/docs/api-reference/chat/create
-    llama_sampling_params default_sparams;
-    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
-    llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
-    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
-    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
-    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
-    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
-    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
-    llama_params["logit_bias"]        = json_value(body, "logit_bias",json::object());
-    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
-    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
-    llama_params["stream"]            = json_value(body, "stream", false);
-    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
-    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
-    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
-    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
-    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
-    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
-    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
-    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);
-
-    if (body.count("grammar") != 0) {
-        llama_params["grammar"] = json_value(body, "grammar", json::object());
-    }
-
-    // Handle 'stop' field
-    if (body.contains("stop") && body["stop"].is_string()) {
-        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
-    } else {
-        llama_params["stop"] = json_value(body, "stop", json::array());
-    }
-
-    // Ensure there is ChatML-specific end sequence among stop words
-    llama_params["stop"].push_back("<|im_end|>");
-
-    return llama_params;
-}
-
-static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
-{
-    json result = response.result_json;
-
-    bool stopped_word        = result.count("stopped_word") != 0;
-    bool stopped_eos         = json_value(result, "stopped_eos", false);
-    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
-    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
-    std::string content      = json_value(result, "content", std::string(""));
-
-    std::string finish_reason = "length";
-    if (stopped_word || stopped_eos) {
-        finish_reason = "stop";
-    }
-
-    json choices =
-        streaming ? json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"delta", json::object()}}})
-                  : json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"message", json{{"content", content},
-                                                         {"role", "assistant"}}}}});
-
-    std::time_t t = std::time(0);
-
-    json res =
-        json{{"choices", choices},
-            {"created", t},
-            {"model",
-                json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-            {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
-            {"usage",
-                json{{"completion_tokens", num_tokens_predicted},
-                     {"prompt_tokens",     num_prompt_tokens},
-                     {"total_tokens",      num_tokens_predicted + num_prompt_tokens}}},
-            {"id", gen_chatcmplid()}};
-
-    if (server_verbose) {
-        res["__verbose"] = result;
-    }
-
-    if (result.contains("completion_probabilities")) {
-        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
-    }
-
-    return res;
-}
-
-// return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
-    json result = response.result_json;
-
-    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
-        return std::vector<json>({response.result_json});
-    }
-
-    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
-    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-
-    bool stopped_word   = json_value(result, "stopped_word", false);
-    bool stopped_eos    = json_value(result, "stopped_eos", false);
-    bool stopped_limit  = json_value(result, "stopped_limit", false);
-    std::string content = json_value(result, "content", std::string(""));
-
-    std::string finish_reason;
-    if (stopped_word || stopped_eos) {
-        finish_reason = "stop";
-    }
-    if (stopped_limit) {
-        finish_reason = "length";
-    }
-
-    std::time_t t = std::time(0);
-
-    json choices;
-
-    if (!finish_reason.empty()) {
-        choices = json::array({json{{"finish_reason", finish_reason},
-                                    {"index", 0},
-                                    {"delta", json::object()}}});
-    } else {
-        if (first) {
-            if (content.empty()) {
-                choices = json::array({json{{"finish_reason", nullptr},
-                                            {"index", 0},
-                                            {"delta", json{{"role", "assistant"}}}}});
-            } else {
-                // We have to send this as two updates to conform to openai behavior
-                json initial_ret = json{{"choices", json::array({json{
-                                        {"finish_reason", nullptr},
-                                        {"index", 0},
-                                        {"delta", json{
-                                            {"role", "assistant"}
-                                        }}}})},
-                            {"created", t},
-                            {"id", gen_chatcmplid()},
-                            {"model", modelname},
-                            {"object", "chat.completion.chunk"}};
-
-                json second_ret = json{
-                            {"choices", json::array({json{{"finish_reason", nullptr},
-                                                            {"index", 0},
-                                                            {"delta", json{
-                                                            {"content", content}}}
-                                                            }})},
-                            {"created", t},
-                            {"id", gen_chatcmplid()},
-                            {"model", modelname},
-                            {"object", "chat.completion.chunk"}};
-
-                return std::vector<json>({initial_ret, second_ret});
-            }
-        } else {
-            // Some idiosyncrasy in task processing logic makes several trailing calls
-            // with empty content, we ignore these at the calee site.
-            if (content.empty()) {
-                return std::vector<json>({json::object()});
-            }
-
-            choices = json::array({json{
-                {"finish_reason", nullptr},
-                {"index", 0},
-                {"delta",
-                json{
-                    {"content", content},
-                }},
-            }});
-        }
-    }
-
-    json ret = json{{"choices", choices},
-                    {"created", t},
-                    {"id", gen_chatcmplid()},
-                    {"model", modelname},
-                    {"object", "chat.completion.chunk"}};
-
-    return std::vector<json>({ret});
-}
-
 static json format_partial_response(
     llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
 ) {
@@ -3069,10 +2499,12 @@ int main(int argc, char **argv)
                     return;
                 }
                 json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, false, false, -1);
+                const int task_id = llama.queue_tasks.get_new_id();
+                llama.queue_results.add_waiting_task_id(task_id);
+                llama.request_completion(task_id, data, false, false, -1);
                 if (!json_value(data, "stream", false)) {
                     std::string completion_text;
-                    task_result result = llama.next_result(task_id);
+                    task_result result = llama.queue_results.recv(task_id);
                     if (!result.error && result.stop) {
                         res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
                     }
@@ -3080,14 +2512,14 @@ int main(int argc, char **argv)
                     {
                         res.status = 404;
                         res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
-                        return;
                     }
+                    llama.queue_results.remove_waiting_task_id(task_id);
                 } else {
                     const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink)
                     {
                         while (true)
                         {
-                            task_result result = llama.next_result(task_id);
+                            task_result result = llama.queue_results.recv(task_id);
                             if (!result.error) {
                                 const std::string str =
                                     "data: " +
@@ -3098,6 +2530,7 @@ int main(int argc, char **argv)
                                 });
                                 if (!sink.write(str.c_str(), str.size()))
                                 {
+                                    llama.queue_results.remove_waiting_task_id(task_id);
                                     return false;
                                 }
                                 if (result.stop) {
@@ -3113,11 +2546,14 @@ int main(int argc, char **argv)
                                 });
                                 if (!sink.write(str.c_str(), str.size()))
                                 {
+                                    llama.queue_results.remove_waiting_task_id(task_id);
                                     return false;
                                 }
                                 break;
                             }
                         }
+
+                        llama.queue_results.remove_waiting_task_id(task_id);
                         sink.done();
                         return true;
                     };
@@ -3126,6 +2562,7 @@ int main(int argc, char **argv)
                     {
                         // cancel
                         llama.request_cancel(task_id);
+                        llama.queue_results.remove_waiting_task_id(task_id);
                     };
 
                     res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
@@ -3162,11 +2599,13 @@ int main(int argc, char **argv)
                 }
                 json data = oaicompat_completion_params_parse(json::parse(req.body));
 
-                const int task_id = llama.request_completion(data, false, false, -1);
+                const int task_id = llama.queue_tasks.get_new_id();
+                llama.queue_results.add_waiting_task_id(task_id);
+                llama.request_completion(task_id, data, false, false, -1);
 
                 if (!json_value(data, "stream", false)) {
                     std::string completion_text;
-                    task_result result = llama.next_result(task_id);
+                    task_result result = llama.queue_results.recv(task_id);
 
                     if (!result.error && result.stop) {
                         json oaicompat_result = format_final_response_oaicompat(data, result);
@@ -3177,12 +2616,12 @@ int main(int argc, char **argv)
                     } else {
                         res.status = 500;
                         res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
-                        return;
                     }
+                    llama.queue_results.remove_waiting_task_id(task_id);
                 } else {
                     const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
                         while (true) {
-                            task_result llama_result = llama.next_result(task_id);
+                            task_result llama_result = llama.queue_results.recv(task_id);
                             if (!llama_result.error) {
                                 std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
 
@@ -3195,6 +2634,7 @@ int main(int argc, char **argv)
                                             "\n\n";
                                         LOG_VERBOSE("data stream", {{"to_send", str}});
                                         if (!sink.write(str.c_str(), str.size())) {
+                                            llama.queue_results.remove_waiting_task_id(task_id);
                                             return false;
                                         }
                                     }
@@ -3210,18 +2650,21 @@ int main(int argc, char **argv)
                                     "\n\n";
                                 LOG_VERBOSE("data stream", {{"to_send", str}});
                                 if (!sink.write(str.c_str(), str.size())) {
+                                    llama.queue_results.remove_waiting_task_id(task_id);
                                     return false;
                                 }
                                 break;
                             }
                         }
                         sink.done();
+                        llama.queue_results.remove_waiting_task_id(task_id);
                         return true;
                     };
 
                     auto on_complete = [task_id, &llama](bool) {
                         // cancel request
                         llama.request_cancel(task_id);
+                        llama.queue_results.remove_waiting_task_id(task_id);
                     };
 
                     res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
@@ -3235,10 +2678,12 @@ int main(int argc, char **argv)
                     return;
                 }
                 json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, true, false, -1);
+                const int task_id = llama.queue_tasks.get_new_id();
+                llama.queue_results.add_waiting_task_id(task_id);
+                llama.request_completion(task_id, data, true, false, -1);
                 if (!json_value(data, "stream", false)) {
                     std::string completion_text;
-                    task_result result = llama.next_result(task_id);
+                    task_result result = llama.queue_results.recv(task_id);
                     if (!result.error && result.stop)
                     {
                         res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
@@ -3247,13 +2692,13 @@ int main(int argc, char **argv)
                     {
                         res.status = 404;
                         res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
-                        return;
                     }
+                    llama.queue_results.remove_waiting_task_id(task_id);
                 } else {
                     const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) {
                         while (true)
                         {
-                            task_result result = llama.next_result(task_id);
+                            task_result result = llama.queue_results.recv(task_id);
                             if (!result.error) {
                                 const std::string str =
                                 "data: " +
@@ -3264,6 +2709,7 @@ int main(int argc, char **argv)
                                 });
                                 if (!sink.write(str.c_str(), str.size()))
                                 {
+                                    llama.queue_results.remove_waiting_task_id(task_id);
                                     return false;
                                 }
                                 if (result.stop)
@@ -3277,8 +2723,8 @@ int main(int argc, char **argv)
                             }
                         }
 
+                        llama.queue_results.remove_waiting_task_id(task_id);
                         sink.done();
-
                         return true;
                     };
 
@@ -3352,23 +2798,46 @@ int main(int argc, char **argv)
                     image_data = "";
                 }
 
-                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
-                task_result result = llama.next_result(task_id);
+                // create and queue the task
+                const int task_id = llama.queue_tasks.get_new_id();
+                llama.queue_results.add_waiting_task_id(task_id);
+                llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
+
+                // get the result
+                task_result result = llama.queue_results.recv(task_id);
+                llama.queue_results.remove_waiting_task_id(task_id);
+
+                // send the result
                 return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
             });
 
     // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
     //     "Bus error: 10" - this is on macOS, it does not crash on Linux
     //std::thread t2([&]()
-    {
+    /*{
         bool running = true;
         while (running)
         {
             running = llama.update_slots();
         }
-    }
+    }*/
     //);
 
+    llama.queue_tasks.on_new_task(std::bind(
+        &llama_server_context::process_single_task, &llama, std::placeholders::_1));
+    llama.queue_tasks.on_finish_multitask(std::bind(
+        &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
+    llama.queue_tasks.on_all_tasks_finished(std::bind(
+        &llama_server_context::run_on_all_tasks_finished, &llama));
+    llama.queue_results.on_multitask_update(std::bind(
+        &llama_server_queue::update_multitask,
+        &llama.queue_tasks,
+        std::placeholders::_1,
+        std::placeholders::_2,
+        std::placeholders::_3
+    ));
+    llama.queue_tasks.start_loop();
+
     t.join();
 
     llama_backend_free();
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
new file mode 100644
index 000000000..e2b6065f7
--- /dev/null
+++ b/examples/server/utils.hpp
@@ -0,0 +1,507 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <set>
+#include <mutex>
+#include <condition_variable>
+#include <unordered_map>
+
+#include "json.hpp"
+
+#include "../llava/clip.h"
+
+using json = nlohmann::json;
+
+extern bool server_verbose;
+
+#ifndef SERVER_VERBOSE
+#define SERVER_VERBOSE 1
+#endif
+
+#if SERVER_VERBOSE != 1
+#define LOG_VERBOSE(MSG, ...)
+#else
+#define LOG_VERBOSE(MSG, ...)                                            \
+    do                                                                   \
+    {                                                                    \
+        if (server_verbose)                                              \
+        {                                                                \
+            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
+        }                                                                \
+    } while (0)
+#endif
+
+#define LOG_ERROR(  MSG, ...) server_log("ERROR",   __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_INFO(   MSG, ...) server_log("INFO",    __func__, __LINE__, MSG, __VA_ARGS__)
+
+//
+// parallel
+//
+
+enum server_state {
+    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY,          // Server is ready and model is loaded
+    SERVER_STATE_ERROR           // An error occurred, load_model failed
+};
+
+enum task_type {
+    TASK_TYPE_COMPLETION,
+    TASK_TYPE_CANCEL,
+    TASK_TYPE_NEXT_RESPONSE
+};
+
+struct task_server {
+    int id = -1; // to be filled by llama_server_queue
+    int target_id;
+    task_type type;
+    json data;
+    bool infill_mode = false;
+    bool embedding_mode = false;
+    int multitask_id = -1;
+};
+
+struct task_result {
+    int id;
+    int multitask_id = -1;
+    bool stop;
+    bool error;
+    json result_json;
+};
+
+struct task_multi {
+    int id;
+    std::set<int> subtasks_remaining{};
+    std::vector<task_result> results{};
+};
+
+// TODO: can become bool if we can't find use of more states
+enum slot_state
+{
+    IDLE,
+    PROCESSING,
+};
+
+enum slot_command
+{
+    NONE,
+    LOAD_PROMPT,
+    RELEASE,
+};
+
+struct slot_params
+{
+    bool stream       = true;
+    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+
+    uint32_t seed      = -1; // RNG seed
+    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
+    int32_t  n_predict = -1; // new tokens to predict
+
+    std::vector<std::string> antiprompt;
+
+    json input_prefix;
+    json input_suffix;
+};
+
+struct slot_image
+{
+    int32_t id;
+
+    bool request_encode_image = false;
+    float * image_embedding = nullptr;
+    int32_t image_tokens = 0;
+
+    clip_image_u8 * img_data;
+
+    std::string prefix_prompt; // before of this image
+};
+
+// completion token output with probabilities
+struct completion_token_output
+{
+    struct token_prob
+    {
+        llama_token tok;
+        float prob;
+    };
+
+    std::vector<token_prob> probs;
+    llama_token tok;
+    std::string text_to_send;
+};
+
+static inline void server_log(const char *level, const char *function, int line,
+                       const char *message, const nlohmann::ordered_json &extra)
+{
+    nlohmann::ordered_json log
+    {
+        {"timestamp", time(nullptr)},
+        {"level",     level},
+        {"function",  function},
+        {"line",      line},
+        {"message",   message},
+    };
+
+    if (!extra.empty())
+    {
+        log.merge_patch(extra);
+    }
+
+    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
+    printf("%.*s\n", (int)str.size(), str.data());
+    fflush(stdout);
+}
+
+//
+// server utils
+//
+
+template <typename T>
+static T json_value(const json &body, const std::string &key, const T &default_value)
+{
+    // Fallback null to default value
+    return body.contains(key) && !body.at(key).is_null()
+        ? body.value(key, default_value)
+        : default_value;
+}
+
+inline std::string format_chatml(std::vector<json> messages)
+{
+    std::ostringstream chatml_msgs;
+
+    for (auto it = messages.begin(); it != messages.end(); ++it) {
+        chatml_msgs << "<|im_start|>"
+                    << json_value(*it, "role",    std::string("user")) << '\n';
+        chatml_msgs << json_value(*it, "content", std::string(""))
+                    << "<|im_end|>\n";
+    }
+
+    chatml_msgs << "<|im_start|>assistant" << '\n';
+
+    return chatml_msgs.str();
+}
+
+//
+// work queue utils
+//
+
+struct llama_server_queue {
+    int id = 0;
+    std::mutex mutex_tasks;
+    // queues
+    std::vector<task_server> queue_tasks;
+    std::vector<task_server> queue_tasks_deferred;
+    std::vector<task_multi> queue_multitasks;
+    std::condition_variable condition_tasks;
+    // callback functions
+    std::function<void(task_server&)> callback_new_task;
+    std::function<void(task_multi&)> callback_finish_multitask;
+    std::function<void(void)> callback_all_task_finished;
+
+    // Add a new task to the end of the queue
+    int post(task_server task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        if (task.id == -1) {
+            task.id = id++;
+        }
+        queue_tasks.push_back(std::move(task));
+        condition_tasks.notify_one();
+        return task.id;
+    }
+
+    // Add a new task, but defer until one slot is available
+    void defer(task_server task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        queue_tasks_deferred.push_back(std::move(task));
+    }
+
+    // Get the next id for creating anew task
+    int get_new_id() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        return id++;
+    }
+
+    // Register function to process a new task
+    void on_new_task(std::function<void(task_server&)> callback) {
+        callback_new_task = callback;
+    }
+
+    // Register function to process a multitask
+    void on_finish_multitask(std::function<void(task_multi&)> callback) {
+        callback_finish_multitask = callback;
+    }
+
+    // Register the function to be called when the batch of tasks is finished
+    void on_all_tasks_finished(std::function<void(void)> callback) {
+        callback_all_task_finished = callback;
+    }
+
+    // Call when the state of one slot is changed
+    void notify_slot_changed() {
+        // move deferred tasks back to main loop
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        for (auto & task : queue_tasks_deferred) {
+            queue_tasks.push_back(std::move(task));
+        }
+        queue_tasks_deferred.clear();
+    }
+
+    // Start the main loop. This call is blocking
+    void start_loop() {
+        while (true) {
+            // new task arrived
+            LOG_VERBOSE("have new task", {});
+            {
+                while (true)
+                {
+                    std::unique_lock<std::mutex> lock(mutex_tasks);
+                    if (queue_tasks.empty()) {
+                        lock.unlock();
+                        break;
+                    }
+                    task_server task = queue_tasks.front();
+                    queue_tasks.erase(queue_tasks.begin());
+                    lock.unlock();
+                    LOG_VERBOSE("callback_new_task", {});
+                    callback_new_task(task);
+                }
+                LOG_VERBOSE("callback_all_task_finished", {});
+                // process and update all the multitasks
+                auto queue_iterator = queue_multitasks.begin();
+                while (queue_iterator != queue_multitasks.end())
+                {
+                    if (queue_iterator->subtasks_remaining.empty())
+                    {
+                        // all subtasks done == multitask is done
+                        task_multi current_multitask = *queue_iterator;
+                        callback_finish_multitask(current_multitask);
+                        // remove this multitask
+                        queue_iterator = queue_multitasks.erase(queue_iterator);
+                    }
+                    else
+                    {
+                        ++queue_iterator;
+                    }
+                }
+                // all tasks in the current loop is finished
+                callback_all_task_finished();
+            }
+            LOG_VERBOSE("wait for new task", {});
+            // wait for new task
+            {
+                std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (queue_tasks.empty()) {
+                    condition_tasks.wait(lock, [&]{
+                        return !queue_tasks.empty();
+                    });
+                }
+            }
+        }
+    }
+
+    //
+    // functions to manage multitasks
+    //
+
+    // add a multitask by specifying the id of all subtask (subtask is a task_server)
+    void add_multitask(int multitask_id, std::vector<int>& sub_ids)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        task_multi multi;
+        multi.id = multitask_id;
+        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
+        queue_multitasks.push_back(multi);
+    }
+
+    // updatethe remaining subtasks, while appending results to multitask
+    void update_multitask(int multitask_id, int subtask_id, task_result& result)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        for (auto& multitask : queue_multitasks)
+        {
+            if (multitask.id == multitask_id)
+            {
+                multitask.subtasks_remaining.erase(subtask_id);
+                multitask.results.push_back(result);
+            }
+        }
+    }
+};
+
+struct llama_server_response {
+    typedef std::function<void(int, int, task_result&)> callback_multitask_t;
+    callback_multitask_t callback_update_multitask;
+    // for keeping track of all tasks waiting for the result
+    std::set<int> waiting_task_ids;
+    // the main result queue
+    std::vector<task_result> queue_results;
+    std::mutex mutex_results;
+    std::condition_variable condition_results;
+
+    void add_waiting_task_id(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.insert(task_id);
+    }
+
+    void remove_waiting_task_id(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.erase(task_id);
+    }
+
+    // This function blocks the thread until there is a response for this task_id
+    task_result recv(int task_id) {
+        while (true)
+        {
+            std::unique_lock<std::mutex> lock(mutex_results);
+            condition_results.wait(lock, [&]{
+                return !queue_results.empty();
+            });
+            LOG_VERBOSE("condition_results unblock", {});
+
+            for (int i = 0; i < (int) queue_results.size(); i++)
+            {
+                if (queue_results[i].id == task_id)
+                {
+                    assert(queue_results[i].multitask_id == -1);
+                    task_result res = queue_results[i];
+                    queue_results.erase(queue_results.begin() + i);
+                    return res;
+                }
+            }
+        }
+
+        // should never reach here
+    }
+
+    // Register the function to update multitask
+    void on_multitask_update(callback_multitask_t callback) {
+        callback_update_multitask = callback;
+    }
+
+    // Send a new result to a waiting task_id
+    void send(task_result result) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        LOG_VERBOSE("send new result", {});
+        for (auto& task_id : waiting_task_ids) {
+            // LOG_TEE("waiting task id %i \n", task_id);
+            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
+            if (result.multitask_id == task_id)
+            {
+                LOG_VERBOSE("callback_update_multitask", {});
+                callback_update_multitask(task_id, result.id, result);
+                continue;
+            }
+
+            if (result.id == task_id)
+            {
+                LOG_VERBOSE("queue_results.push_back", {});
+                queue_results.push_back(result);
+                condition_results.notify_one();
+                return;
+            }
+        }
+    }
+};
+
+//
+// base64 utils (TODO: move to common in the future)
+//
+
+static const std::string base64_chars =
+             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+             "abcdefghijklmnopqrstuvwxyz"
+             "0123456789+/";
+
+static inline bool is_base64(uint8_t c)
+{
+    return (isalnum(c) || (c == '+') || (c == '/'));
+}
+
+static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
+{
+    int i = 0;
+    int j = 0;
+    int in_ = 0;
+
+    int in_len = encoded_string.size();
+
+    uint8_t char_array_4[4];
+    uint8_t char_array_3[3];
+
+    std::vector<uint8_t> ret;
+
+    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
+    {
+        char_array_4[i++] = encoded_string[in_]; in_++;
+        if (i == 4)
+        {
+            for (i = 0; i <4; i++)
+            {
+                char_array_4[i] = base64_chars.find(char_array_4[i]);
+            }
+
+            char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
+            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
+
+            for (i = 0; (i < 3); i++)
+            {
+                ret.push_back(char_array_3[i]);
+            }
+            i = 0;
+        }
+    }
+
+    if (i)
+    {
+        for (j = i; j <4; j++)
+        {
+            char_array_4[j] = 0;
+        }
+
+        for (j = 0; j <4; j++)
+        {
+            char_array_4[j] = base64_chars.find(char_array_4[j]);
+        }
+
+        char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
+        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
+
+        for (j = 0; (j < i - 1); j++)
+        {
+            ret.push_back(char_array_3[j]);
+        }
+    }
+
+    return ret;
+}
+
+//
+// random string / id
+//
+
+static std::string random_string()
+{
+    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+
+    std::random_device rd;
+    std::mt19937 generator(rd());
+
+    std::string result(32, ' ');
+
+    for (int i = 0; i < 32; ++i) {
+        result[i] = str[generator() % str.size()];
+    }
+
+    return result;
+}
+
+static std::string gen_chatcmplid()
+{
+    std::stringstream chatcmplid;
+    chatcmplid << "chatcmpl-" << random_string();
+    return chatcmplid.str();
+}

From 3b7c914de25c6851396d7f9178249f1ed278120e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 26 Jan 2024 14:48:15 +0200
Subject: [PATCH 134/138] tests : gitignore test-c.o

---
 tests/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/.gitignore b/tests/.gitignore
index 59be43b99..092dce742 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -1,2 +1,3 @@
 *
 !*.*
+test-c.o

From 5f1925a8cef81eb9b372faaae34b0dd76d5361d4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 26 Jan 2024 17:09:44 +0200
Subject: [PATCH 135/138] scripts : move run-with-preset.py from root to
 scripts folder

---
 run_with_preset.py => scripts/run-with-preset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename run_with_preset.py => scripts/run-with-preset.py (98%)

diff --git a/run_with_preset.py b/scripts/run-with-preset.py
similarity index 98%
rename from run_with_preset.py
rename to scripts/run-with-preset.py
index 9b4d7ecbe..a18252730 100755
--- a/run_with_preset.py
+++ b/scripts/run-with-preset.py
@@ -46,7 +46,7 @@ Formatting considerations:
 - To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
 - To define a tensor split, pass a list of floats.
 """
-usage = "run_with_preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
+usage = "run-with-preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
 epilog = ("  --<ARG_NAME> specify additional CLI ars to be passed to the binary (override all preset files). "
           "Unknown args will be ignored.")
 

From 7032f4f6349c17a8352f9f93f7d2122f45469e59 Mon Sep 17 00:00:00 2001
From: snadampal <87143774+snadampal@users.noreply.github.com>
Date: Fri, 26 Jan 2024 11:17:59 -0600
Subject: [PATCH 136/138] ggml : update softmax n_task calculation (#5126)

updated the n_task calculation to use max number of
threads possible. This has improved the prompt eval
performance by around 5% for DOT kernels and by
around 10% for MMLA kernels on AWS Graviton3.
---
 ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index ca98fde8a..ef6fd8caf 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16597,7 +16597,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+                n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
             } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {

From 15b4538ff29b280a395a1406d711497d8eaa2564 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Fri, 26 Jan 2024 18:18:26 +0100
Subject: [PATCH 137/138] ggml-alloc : add 10% margin to the buffer sizes
 (#5149)

---
 ggml-alloc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ggml-alloc.c b/ggml-alloc.c
index 60141a34d..95a93c99d 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
 }
 
 size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
-    return alloc->max_size;
+    // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
+    // to avoid this, we add a 10% margin to the buffer size
+    return alloc->max_size + alloc->max_size/10;
 }
 
 // graph allocator

From 62fead3ea0a30c8d424f4a8373fa14165c7c707f Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Fri, 26 Jan 2024 18:59:43 +0100
Subject: [PATCH 138/138] cuda : fix tensor size calculation for non-split
 buffer (#5145)

---
 ggml-backend.c |  4 +++-
 ggml-cuda.cu   | 19 +++++--------------
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 423512def..3fff5fc87 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -30,7 +30,9 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
 GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
-        return buft->iface.get_alloc_size(buft, tensor);
+        size_t size = buft->iface.get_alloc_size(buft, tensor);
+        assert(size >= ggml_nbytes(tensor));
+        return size;
     }
     return ggml_nbytes(tensor);
 }
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 05e5d18ab..0d599e20a 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -9790,8 +9790,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif
 
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb1  =  dst->nb[1];
+    const size_t nb11 = src1->nb[1];
+    const size_t nb1  =  dst->nb[1];
 
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -10304,15 +10304,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
 
     if (ggml_is_quantized(tensor->type)) {
         // initialize padding to 0 to avoid possible NaN values
-        int64_t row_low = 0;
-        int64_t row_high = ggml_nrows(tensor);
-        int64_t nrows_split = row_high - row_low;
-
-        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t original_size = ggml_nbytes(tensor);
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
         if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
+            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
         }
     }
 }
@@ -10415,12 +10411,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
 }
 
 GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
+    size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
 
     if (ggml_is_quantized(tensor->type)) {