Merge branch 'master' into removing-extraneous-nil-check

2024-01-16 18:14:51 +00:00 · 2024-01-16 18:14:51 +00:00 · 7434324414
commit 7434324414
parent 5137bc0052 bee938da74
9 changed files with 393 additions and 42 deletions
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@ -1799,7 +1799,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> train_tokens;
    std::vector<size_t> train_samples_begin;
    std::vector<size_t> train_samples_size;
-    printf("%s: tokenize training data\n", __func__);
+    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
    tokenize_file(lctx,
            params.common.fn_train_data,
            params.common.sample_start,
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -428,6 +428,7 @@ static std::vector<float> hellaswag_evaluate_tokens(
    for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
        size_t n_tokens = tokens.size() - i_chunk * n_batch;
        n_tokens = std::min(n_tokens, size_t(n_batch));
        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return {};
--- a/examples/pydantic-models-to-grammar-examples.py
+++ b/examples/pydantic-models-to-grammar-examples.py
@ -1,5 +1,5 @@
 # Function calling example using pydantic models.
-
+import datetime
 import json
 from enum import Enum
 from typing import Union, Optional
@ -8,7 +8,8 @@ import requests
 from pydantic import BaseModel, Field
 import importlib
-from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation
+from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
 # Function to get completion on the llama.cpp server with grammar.
 def create_completion(prompt, grammar):
@ -134,3 +135,121 @@ text = create_completion(prompt=prompt, grammar=gbnf_grammar)
 json_data = json.loads(text)
 print(Book(**json_data))
 # An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
 def get_current_datetime(output_format: Optional[str] = None):
    """
    Get the current date and time in the given format.
    Args:
         output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
    """
    # NOTE(review): the docstring above is presumably parsed when this function is turned
    # into a tool model, so it is kept verbatim.
    timestamp_format = '%Y-%m-%d %H:%M:%S' if output_format is None else output_format
    current_moment = datetime.datetime.now()
    return current_moment.strftime(timestamp_format)
 # Enum for the calculator tool.
 class MathOperation(Enum):
    # The four arithmetic operations understood by the Calculator tool.
    # Each member's value is the lowercase string used for that operation in the
    # JSON function calls (e.g. 'operation': 'multiply').
    ADD = "add"
    SUBTRACT = "subtract"
    MULTIPLY = "multiply"
    DIVIDE = "divide"
 # Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
 class Calculator(BaseModel):
    """
    Perform a math operation on two numbers.
    """
    number_one: Union[int, float] = Field(..., description="First number.")
    operation: MathOperation = Field(..., description="Math operation to perform.")
    number_two: Union[int, float] = Field(..., description="Second number.")
    def run(self):
        # Apply the selected operation to the two operands and return the result.
        # Candidates are checked in a fixed order; only the matching one is evaluated,
        # so a division is attempted only when DIVIDE was requested.
        candidates = (
            (MathOperation.ADD, lambda lhs, rhs: lhs + rhs),
            (MathOperation.SUBTRACT, lambda lhs, rhs: lhs - rhs),
            (MathOperation.MULTIPLY, lambda lhs, rhs: lhs * rhs),
            (MathOperation.DIVIDE, lambda lhs, rhs: lhs / rhs),
        )
        for candidate, compute in candidates:
            if self.operation == candidate:
                return compute(self.number_one, self.number_two)
        # Nothing matched: the operation is not one of the four known members.
        raise ValueError("Unknown operation.")
 # Example function to get the weather
 def get_current_weather(location, unit=None):
    """Get the current weather in a given location

    Args:
        location: free-form place description; matched by substring against the
            known demo cities ("London", "New York", "North Pole").
        unit: temperature unit. May be an enum member (its ``.value`` is reported),
            a plain string, or None when the caller did not supply one.

    Returns:
        A JSON string. For a known city it holds "location", "temperature" and
        "unit"; otherwise it holds the given location and "temperature": "unknown".
    """
    # The OpenAI-style tool definition only requires "location", so `unit` can be
    # missing. Accept None and plain strings instead of failing on `unit.value`.
    unit_name = getattr(unit, "value", unit)
    # Hard-coded demo readings, checked in this order.
    known_readings = (("London", "42"), ("New York", "24"), ("North Pole", "-42"))
    for city, temperature in known_readings:
        if city in location:
            return json.dumps({"location": city, "temperature": temperature, "unit": unit_name})
    return json.dumps({"location": location, "temperature": "unknown"})
 # Here is a function definition in OpenAI style
 current_weather_tool = {
    "type": "function",
    "function": {
        # Tool name; the dispatch loop at the end of this example matches on this string.
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        # JSON-schema style description of the tool's arguments.
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                # Restricted to the two listed unit names.
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            # Only "location" is mandatory; "unit" may be left out by the caller.
            "required": ["location"],
        },
    },
 }
 # Convert the OpenAI-style function definition (a plain dict) into a pydantic model.
 current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
 # Attach the real Python implementation so the model instance can be executed via .run().
 current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
 # Convert a normal Python function into a pydantic model.
 current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
 # All tools offered to the model: two hand-written pydantic models and the two generated ones.
 tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
 # Build the grammar that constrains the model output, plus the matching documentation text.
 # list_of_outputs=True asks for a list of calls; the expected output further below is a
 # list of {'function': ..., 'params': ...} objects.
 gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
    pydantic_model_list=tool_list, outer_object_name="function",
    outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
 # The system prompt embeds the generated tool documentation.
 system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
 # User request that needs three different tools.
 text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
 # ChatML-style prompt (<|im_start|> / <|im_end|> markers).
 prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
 # `text` is reused: from here on it holds the completion returned by the server.
 text = create_completion(prompt=prompt, grammar=gbnf_grammar)
 # The completion is parsed as JSON and printed.
 json_data = json.loads(text)
 print(json_data)
 # Should output something like this:
 # [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
 # Execute every call the model produced: look up the tool by name, build it from the
 # supplied params and print the result of run(). Calls naming any other tool are skipped.
 _tool_dispatch = (
    ("Calculator", Calculator),
    ("get_current_datetime", current_datetime_model),
    ("get_current_weather", current_weather_tool_model),
 )
 for call in json_data:
    for _tool_name, _tool_model in _tool_dispatch:
        if call["function"] == _tool_name:
            print(_tool_model(**call["params"]).run())
            break
 # Should output something like this:
 # 2024-01-14 13:36:06
 # {"location": "London", "temperature": "42", "unit": "celsius"}
 # 1764
--- a/flake.lock
+++ b/flake.lock
@ -5,11 +5,11 @@
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
-        "lastModified": 1701473968,
+        "lastModified": 1704982712,
-        "narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=",
+        "narHash": "sha256-2Ptt+9h8dczgle2Oo6z5ni5rt/uLMG47UFTR1ry/wgg=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
-        "rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5",
+        "rev": "07f6395285469419cf9d078f59b5b49993198c00",
        "type": "github"
      },
      "original": {
@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1703637592,
+        "lastModified": 1705133751,
-        "narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=",
+        "narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8",
+        "rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
        "type": "github"
      },
      "original": {
@ -37,11 +37,11 @@
    "nixpkgs-lib": {
      "locked": {
        "dir": "lib",
-        "lastModified": 1701253981,
+        "lastModified": 1703961334,
-        "narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=",
+        "narHash": "sha256-M1mV/Cq+pgjk0rt6VxoyyD+O8cOUiai8t9Q6Yyq4noY=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58",
+        "rev": "b0d36bd0a420ecee3bc916c91886caca87c894e9",
        "type": "github"
      },
      "original": {
--- a/flake.nix
+++ b/flake.nix
@ -6,28 +6,41 @@
    flake-parts.url = "github:hercules-ci/flake-parts";
  };
-  # Optional binary cache
+  # There's an optional binary cache available. The details are below, but they're commented out.
-  nixConfig = {
+  #
-    extra-substituters = [
+  # Why? The terrible experience of being prompted to accept them on every single Nix command run.
-      # Populated by the CI in ggerganov/llama.cpp
+  # Plus, there are warnings shown about not being a trusted user on a default Nix install
-      "https://llama-cpp.cachix.org"
+  # if you *do* say yes to the prompts.
-
+  #
-      # A development cache for nixpkgs imported with `config.cudaSupport = true`.
+  # This experience makes having `nixConfig` in a flake a persistent UX problem.
-      # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
+  #
-      # This lets one skip building e.g. the CUDA-enabled openmpi.
+  # To make use of the binary cache, please add the relevant settings to your `nix.conf`.
-      # TODO: Replace once nix-community obtains an official one.
+  # It's located at `/etc/nix/nix.conf` on non-NixOS systems. On NixOS, adjust the `nix.settings`
-      "https://cuda-maintainers.cachix.org"
+  # option in your NixOS configuration to add `extra-substituters` and `extra-trusted-public-keys`,
-    ];
+  # as shown below.
-
+  #
-    # Verify these are the same keys as published on
+  # ```
-    # - https://app.cachix.org/cache/llama-cpp
+  # nixConfig = {
-    # - https://app.cachix.org/cache/cuda-maintainers
+  #   extra-substituters = [
-    extra-trusted-public-keys = [
+  #     # Populated by the CI in ggerganov/llama.cpp
-      "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
+  #     "https://llama-cpp.cachix.org"
-      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
+  #
-    ];
+  #     # A development cache for nixpkgs imported with `config.cudaSupport = true`.
-  };
+  #     # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
-
+  #     # This lets one skip building e.g. the CUDA-enabled openmpi.
  #     # TODO: Replace once nix-community obtains an official one.
  #     "https://cuda-maintainers.cachix.org"
  #   ];
  #
  #   # Verify these are the same keys as published on
  #   # - https://app.cachix.org/cache/llama-cpp
  #   # - https://app.cachix.org/cache/cuda-maintainers
  #   extra-trusted-public-keys = [
  #     "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
  #     "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
  #   ];
  # };
  # ```
  # For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
  #
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -515,6 +515,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
    quantize_row_q4_0_reference(x, y, k);
 }
 void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
    const int qk = QK4_1;
@ -3039,6 +3040,197 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int
    return nrow * row_size;
 }
 // Quantize one row of n_per_row floats from x into Q4_0 blocks at y.
 // With quant_weights == NULL this defers to quantize_row_q4_0_reference. Otherwise each
 // block is quantized by make_qx_quants using per-element weights
 //   quant_weights[i] * sqrtf(mean(x^2 over the row) + x[i]^2).
 // quant_weights is indexed by position within the row, so it needs n_per_row entries.
 static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK4_0 == 32, "QK4_0 must be 32");

    // No importance weights: use the plain reference quantizer.
    if (!quant_weights) {
        quantize_row_q4_0_reference(x, y, n_per_row);
        return;
    }

    // Mean square of the whole row; it is blended into every per-element weight below.
    float row_sum_sq = 0;
    for (int i = 0; i < n_per_row; ++i) {
        row_sum_sq += x[i]*x[i];
    }
    const float row_mean_sq = row_sum_sq/n_per_row;

    float  elem_weight[QK4_0];
    int8_t quants[QK4_0];

    const int n_blocks = n_per_row/QK4_0;
    for (int iblock = 0; iblock < n_blocks; ++iblock) {
        const float * block_x = x + QK4_0 * iblock;
        const float * block_w = quant_weights + QK4_0 * iblock;
        block_q4_0  * out     = y + iblock;

        for (int i = 0; i < QK4_0; ++i) {
            elem_weight[i] = block_w[i] * sqrtf(row_mean_sq + block_x[i]*block_x[i]);
        }

        // make_qx_quants fills `quants` and returns the value stored as the block scale.
        const float scale = make_qx_quants(QK4_0, 8, block_x, quants, 1, elem_weight);
        out->d = GGML_FP32_TO_FP16(scale);

        // Two 4-bit values per byte: element i in the low nibble, element i+16 in the high nibble.
        for (int i = 0; i < QK4_0/2; ++i) {
            out->qs[i] = quants[i] | (quants[i + QK4_0/2] << 4);
        }
    }
 }
 // Quantize nrow rows of n_per_row floats each from src into Q4_0 data at dst.
 // Without quant_weights this forwards to ggml_quantize_q4_0 (which also receives hist).
 // With quant_weights every row is quantized by quantize_row_q4_0_impl using the same
 // weights; hist is not touched on that path.
 // Returns the number of bytes written on the weighted path (nrow * row size).
 size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    if (!quant_weights) {
        return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
    }
    // Keep the row size in size_t: storing it in an int truncates the value returned by
    // ggml_row_size, and nrow * row_size in int arithmetic overflows for large tensors.
    const size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
        src += n_per_row;
        qrow += row_size;
    }
    return (size_t)nrow * row_size;
 }
 // Quantize one row of n_per_row floats from x into Q4_1 blocks at y.
 // With quant_weights == NULL this defers to quantize_row_q4_1_reference. Otherwise each
 // block is quantized by make_qkx3_quants using per-element weights
 //   quant_weights[i] * sqrtf(mean(x^2 over the row) + x[i]^2).
 // quant_weights is indexed by position within the row, so it needs n_per_row entries.
 static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK4_1 == 32, "QK4_1 must be 32");

    // No importance weights: use the plain reference quantizer.
    if (!quant_weights) {
        quantize_row_q4_1_reference(x, y, n_per_row);
        return;
    }

    // Mean square of the whole row; it is blended into every per-element weight below.
    float row_sum_sq = 0;
    for (int i = 0; i < n_per_row; ++i) {
        row_sum_sq += x[i]*x[i];
    }
    const float row_mean_sq = row_sum_sq/n_per_row;

    float   elem_weight[QK4_1];
    uint8_t quants[QK4_1];
    uint8_t scratch[QK4_1];   // auxiliary buffer handed to make_qkx3_quants

    const int n_blocks = n_per_row/QK4_1;
    for (int iblock = 0; iblock < n_blocks; ++iblock) {
        const float * block_x = x + QK4_1 * iblock;
        const float * block_w = quant_weights + QK4_1 * iblock;
        block_q4_1  * out     = y + iblock;

        for (int i = 0; i < QK4_1; ++i) {
            elem_weight[i] = block_w[i] * sqrtf(row_mean_sq + block_x[i]*block_x[i]);
        }

        // make_qkx3_quants fills `quants`, returns the value stored as the scale, and writes
        // block_min; the negated block_min is what gets stored as the block's m.
        float block_min;
        const float scale = make_qkx3_quants(QK4_1, 15, block_x, elem_weight, quants, &block_min, scratch, -0.9f, 0.05f, 36, false);
        out->d = GGML_FP32_TO_FP16(scale);
        out->m = GGML_FP32_TO_FP16(-block_min);

        // Two 4-bit values per byte: element i in the low nibble, element i+16 in the high nibble.
        for (int i = 0; i < QK4_1/2; ++i) {
            out->qs[i] = quants[i] | (quants[i + QK4_1/2] << 4);
        }
    }
 }
 // Quantize nrow rows of n_per_row floats each from src into Q4_1 data at dst.
 // Without quant_weights this forwards to ggml_quantize_q4_1 (which also receives hist).
 // With quant_weights every row is quantized by quantize_row_q4_1_impl using the same
 // weights; hist is not touched on that path.
 // Returns the number of bytes written on the weighted path (nrow * row size).
 size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    if (!quant_weights) {
        return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
    }
    // Keep the row size in size_t: storing it in an int truncates the value returned by
    // ggml_row_size, and nrow * row_size in int arithmetic overflows for large tensors.
    const size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
        src += n_per_row;
        qrow += row_size;
    }
    return (size_t)nrow * row_size;
 }
 // Quantize one row of n_per_row floats from x into Q5_0 blocks at y.
 // With quant_weights == NULL this defers to quantize_row_q5_0_reference. Otherwise each
 // block is quantized by make_qx_quants using per-element weights
 //   quant_weights[i] * sqrtf(mean(x^2 over the row) + x[i]^2).
 // quant_weights is indexed by position within the row, so it needs n_per_row entries.
 static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK5_0 == 32, "QK5_0 must be 32");

    // No importance weights: use the plain reference quantizer.
    if (!quant_weights) {
        quantize_row_q5_0_reference(x, y, n_per_row);
        return;
    }

    // Mean square of the whole row; it is blended into every per-element weight below.
    float row_sum_sq = 0;
    for (int i = 0; i < n_per_row; ++i) {
        row_sum_sq += x[i]*x[i];
    }
    const float row_mean_sq = row_sum_sq/n_per_row;

    float  elem_weight[QK5_0];
    int8_t quants[QK5_0];

    const int n_blocks = n_per_row/QK5_0;
    for (int iblock = 0; iblock < n_blocks; ++iblock) {
        const float * block_x = x + QK5_0 * iblock;
        const float * block_w = quant_weights + QK5_0 * iblock;
        block_q5_0  * out     = y + iblock;

        for (int i = 0; i < QK5_0; ++i) {
            elem_weight[i] = block_w[i] * sqrtf(row_mean_sq + block_x[i]*block_x[i]);
        }

        // make_qx_quants fills `quants` and returns the value stored as the block scale.
        const float scale = make_qx_quants(QK5_0, 16, block_x, quants, 1, elem_weight);
        out->d = GGML_FP32_TO_FP16(scale);

        // Each 5-bit value is split: the low 4 bits go into a nibble of qs, the 5th bit
        // goes into the 32-bit mask (bit i for element i, bit i+16 for element i+16).
        uint32_t high_bits = 0;
        for (int i = 0; i < 16; ++i) {
            const uint8_t lo_elem = quants[i];
            const uint8_t hi_elem = quants[i+16];

            out->qs[i] = (lo_elem & 0x0F) | ((hi_elem & 0x0F) << 4);

            high_bits |= ((lo_elem & 0x10u) >> 4) << (i + 0);
            high_bits |= ((hi_elem & 0x10u) >> 4) << (i + QK5_0/2);
        }
        memcpy(&out->qh, &high_bits, sizeof(high_bits));
    }
 }
 // Quantize nrow rows of n_per_row floats each from src into Q5_0 data at dst.
 // Without quant_weights this forwards to ggml_quantize_q5_0 (which also receives hist).
 // With quant_weights every row is quantized by quantize_row_q5_0_impl using the same
 // weights; hist is not touched on that path.
 // Returns the number of bytes written on the weighted path (nrow * row size).
 size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    if (!quant_weights) {
        return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
    }
    // Keep the row size in size_t: storing it in an int truncates the value returned by
    // ggml_row_size, and nrow * row_size in int arithmetic overflows for large tensors.
    const size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
        src += n_per_row;
        qrow += row_size;
    }
    return (size_t)nrow * row_size;
 }
 // Quantize one row of n_per_row floats from x into Q5_1 blocks at y.
 // With quant_weights == NULL this defers to quantize_row_q5_1_reference. Otherwise each
 // block is quantized by make_qkx3_quants using per-element weights
 //   quant_weights[i] * sqrtf(mean(x^2 over the row) + x[i]^2).
 // quant_weights is indexed by position within the row, so it needs n_per_row entries.
 static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK5_1 == 32, "QK5_1 must be 32");
    if (!quant_weights) {
        quantize_row_q5_1_reference(x, y, n_per_row);
        return;
    }
    float weight[QK5_1];
    uint8_t L[QK5_1], Laux[QK5_1];
    // Mean square of the whole row; it is blended into every per-element weight below.
    float sum_x2 = 0;
    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
    float sigma2 = sum_x2/n_per_row;
    const int nb = n_per_row/QK5_1;
    for (int ib = 0; ib < nb; ++ib) {
        const float * xb = x + QK5_1 * ib;
        const float * qw = quant_weights + QK5_1 * ib;
        for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
        // make_qkx3_quants fills L, returns the value stored as the scale, and writes min;
        // the negated min is what gets stored as the block's m.
        float min;
        float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
        y[ib].d = GGML_FP32_TO_FP16(d);
        y[ib].m = GGML_FP32_TO_FP16(-min);
        uint32_t qh = 0;
        for (int j = 0; j < 16; ++j) {
            const uint8_t xi0 = L[j];
            const uint8_t xi1 = L[j+16];
            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
            // get the 5-th bit and store it in qh at the right position
            // (QK5_1, not QK5_0: this is the block size asserted at the top of this function)
            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
        }
        memcpy(&y[ib].qh, &qh, sizeof(qh));
    }
 }
 // Quantize nrow rows of n_per_row floats each from src into Q5_1 data at dst.
 // Without quant_weights this forwards to ggml_quantize_q5_1 (which also receives hist).
 // With quant_weights every row is quantized by quantize_row_q5_1_impl using the same
 // weights; hist is not touched on that path.
 // Returns the number of bytes written on the weighted path (nrow * row size).
 size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    if (!quant_weights) {
        return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
    }
    // Keep the row size in size_t: storing it in an int truncates the value returned by
    // ggml_row_size, and nrow * row_size in int arithmetic overflows for large tensors.
    const size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
        src += n_per_row;
        qrow += row_size;
    }
    return (size_t)nrow * row_size;
 }
 // ====================== "True" 2-bit (de)-quantization
 static const  uint64_t iq2xxs_grid[256] = {
--- a/ggml-quants.h
+++ b/ggml-quants.h
@ -253,3 +253,7 @@ size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row,
 size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q6_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 // Row-wise quantization entry points for the Q4_0/Q4_1/Q5_0/Q5_1 formats.
 // src holds nrows rows of n_per_row floats; the quantized rows are written to dst.
 // When imatrix is NULL the call is forwarded to the matching ggml_quantize_* routine
 // (which receives hist); otherwise imatrix supplies per-element weights, shared by all rows.
 size_t quantize_q4_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q4_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
--- a/ggml.c
+++ b/ggml.c
@ -18674,26 +18674,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
        case GGML_TYPE_Q4_0:
            {
                GGML_ASSERT(start % QK4_0 == 0);
-                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
+                GGML_ASSERT(start % n_per_row == 0);
-                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
+                size_t start_row = start / n_per_row;
                size_t row_size = ggml_row_size(type, n_per_row);
                result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                GGML_ASSERT(result == row_size * nrows);
            } break;
        case GGML_TYPE_Q4_1:
            {
                GGML_ASSERT(start % QK4_1 == 0);
-                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
+                GGML_ASSERT(start % n_per_row == 0);
-                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
+                size_t start_row = start / n_per_row;
                size_t row_size = ggml_row_size(type, n_per_row);
                result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                GGML_ASSERT(result == row_size * nrows);
            } break;
        case GGML_TYPE_Q5_0:
            {
                GGML_ASSERT(start % QK5_0 == 0);
-                block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
+                GGML_ASSERT(start % n_per_row == 0);
-                result = ggml_quantize_q5_0(src + start, block, n, n, hist);
+                size_t start_row = start / n_per_row;
                size_t row_size = ggml_row_size(type, n_per_row);
                result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                GGML_ASSERT(result == row_size * nrows);
            } break;
        case GGML_TYPE_Q5_1:
            {
                GGML_ASSERT(start % QK5_1 == 0);
-                block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
+                GGML_ASSERT(start % n_per_row == 0);
-                result = ggml_quantize_q5_1(src + start, block, n, n, hist);
+                size_t start_row = start / n_per_row;
                size_t row_size = ggml_row_size(type, n_per_row);
                result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                GGML_ASSERT(result == row_size * nrows);
            } break;
        case GGML_TYPE_Q8_0:
            {
--- a/llama.cpp
+++ b/llama.cpp
@ -8374,6 +8374,8 @@ struct quantize_state_internal {
    int n_k_quantized     = 0;
    int n_fallback        = 0;
    bool has_imatrix      = false;
    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
        : model(model)
        , params(params)
@ -8546,6 +8548,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
                && qs.has_imatrix && i_layer < n_layer/8) {
            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
        }
        ++qs.i_feed_forward_w2;
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (arch != LLM_ARCH_FALCON) {
@ -8669,6 +8678,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
        if (imatrix_data) {
            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
            qs.has_imatrix = true;
        }
    }