diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index a6620fd73..11fcbf443 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1799,7 +1799,7 @@ int main(int argc, char ** argv) { std::vector train_tokens; std::vector train_samples_begin; std::vector train_samples_size; - printf("%s: tokenize training data\n", __func__); + printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data); tokenize_file(lctx, params.common.fn_train_data, params.common.sample_start, diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 9a77beca6..b4fedf803 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -428,6 +428,7 @@ static std::vector hellaswag_evaluate_tokens( for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) { size_t n_tokens = tokens.size() - i_chunk * n_batch; n_tokens = std::min(n_tokens, size_t(n_batch)); + llama_kv_cache_seq_rm(ctx, 0, n_past, -1); if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) { fprintf(stderr, "%s : failed to eval\n", __func__); return {}; diff --git a/examples/pydantic-models-to-grammar-examples.py b/examples/pydantic-models-to-grammar-examples.py index a8a4919cf..cbf376652 100644 --- a/examples/pydantic-models-to-grammar-examples.py +++ b/examples/pydantic-models-to-grammar-examples.py @@ -1,5 +1,5 @@ # Function calling example using pydantic models. - +import datetime import json from enum import Enum from typing import Union, Optional @@ -8,7 +8,8 @@ import requests from pydantic import BaseModel, Field import importlib -from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation +from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function + # Function to get completion on the llama.cpp server with grammar. def create_completion(prompt, grammar): @@ -134,3 +135,121 @@ text = create_completion(prompt=prompt, grammar=gbnf_grammar) json_data = json.loads(text) print(Book(**json_data)) +# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition. + +def get_current_datetime(output_format: Optional[str] = None): + """ + Get the current date and time in the given format. + Args: + output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S' + """ + if output_format is None: + output_format = '%Y-%m-%d %H:%M:%S' + return datetime.datetime.now().strftime(output_format) + + +# Enum for the calculator tool. +class MathOperation(Enum): + ADD = "add" + SUBTRACT = "subtract" + MULTIPLY = "multiply" + DIVIDE = "divide" + + + +# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt. +class Calculator(BaseModel): + """ + Perform a math operation on two numbers. + """ + number_one: Union[int, float] = Field(..., description="First number.") + operation: MathOperation = Field(..., description="Math operation to perform.") + number_two: Union[int, float] = Field(..., description="Second number.") + + def run(self): + if self.operation == MathOperation.ADD: + return self.number_one + self.number_two + elif self.operation == MathOperation.SUBTRACT: + return self.number_one - self.number_two + elif self.operation == MathOperation.MULTIPLY: + return self.number_one * self.number_two + elif self.operation == MathOperation.DIVIDE: + return self.number_one / self.number_two + else: + raise ValueError("Unknown operation.") + + +# Example function to get the weather +def get_current_weather(location, unit): + """Get the current weather in a given location""" + if "London" in location: + return json.dumps({"location": "London", "temperature": "42", "unit": unit.value}) + elif "New York" in location: + return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value}) + elif "North Pole" in location: + return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value}) + else: + return json.dumps({"location": location, "temperature": "unknown"}) + + +# Here is a function definition in OpenAI style +current_weather_tool = { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location"], + }, + }, +} + +# Convert OpenAI function definition into pydantic model +current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool) +# Add the actual function to a pydantic model +current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather) + +# Convert normal Python function to a pydantic model +current_datetime_model = create_dynamic_model_from_function(get_current_datetime) + +tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model] + + +gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=tool_list, outer_object_name="function", + outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True) + +system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation + + +text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42""" +prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" + +text = create_completion(prompt=prompt, grammar=gbnf_grammar) + +json_data = json.loads(text) + +print(json_data) +# Should output something like this: +# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}] + + +for call in json_data: + if call["function"] == "Calculator": + print(Calculator(**call["params"]).run()) + elif call["function"] == "get_current_datetime": + print(current_datetime_model(**call["params"]).run()) + elif call["function"] == "get_current_weather": + print(current_weather_tool_model(**call["params"]).run()) +# Should output something like this: +# 2024-01-14 13:36:06 +# {"location": "London", "temperature": "42", "unit": "celsius"} +# 1764 diff --git a/flake.lock b/flake.lock index 15a0a1a8e..cd532ef4f 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1701473968, - "narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=", + "lastModified": 1704982712, + "narHash": "sha256-2Ptt+9h8dczgle2Oo6z5ni5rt/uLMG47UFTR1ry/wgg=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5", + "rev": "07f6395285469419cf9d078f59b5b49993198c00", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1703637592, - "narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=", + "lastModified": 1705133751, + "narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8", + "rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d", "type": "github" }, "original": { @@ -37,11 +37,11 @@ "nixpkgs-lib": { "locked": { "dir": "lib", - "lastModified": 1701253981, - "narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=", + "lastModified": 1703961334, + "narHash": "sha256-M1mV/Cq+pgjk0rt6VxoyyD+O8cOUiai8t9Q6Yyq4noY=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58", + "rev": "b0d36bd0a420ecee3bc916c91886caca87c894e9", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 488ed6c59..ec62c773a 100644 --- a/flake.nix +++ b/flake.nix @@ -6,28 +6,41 @@ flake-parts.url = "github:hercules-ci/flake-parts"; }; - # Optional binary cache - nixConfig = { - extra-substituters = [ - # Populated by the CI in ggerganov/llama.cpp - "https://llama-cpp.cachix.org" - - # A development cache for nixpkgs imported with `config.cudaSupport = true`. - # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci. - # This lets one skip building e.g. the CUDA-enabled openmpi. - # TODO: Replace once nix-community obtains an official one. - "https://cuda-maintainers.cachix.org" - ]; - - # Verify these are the same keys as published on - # - https://app.cachix.org/cache/llama-cpp - # - https://app.cachix.org/cache/cuda-maintainers - extra-trusted-public-keys = [ - "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc=" - "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=" - ]; - }; - + # There's an optional binary cache available. The details are below, but they're commented out. + # + # Why? The terrible experience of being prompted to accept them on every single Nix command run. + # Plus, there are warnings shown about not being a trusted user on a default Nix install + # if you *do* say yes to the prompts. + # + # This experience makes having `nixConfig` in a flake a persistent UX problem. + # + # To make use of the binary cache, please add the relevant settings to your `nix.conf`. + # It's located at `/etc/nix/nix.conf` on non-NixOS systems. On NixOS, adjust the `nix.settings` + # option in your NixOS configuration to add `extra-substituters` and `extra-trusted-public-keys`, + # as shown below. + # + # ``` + # nixConfig = { + # extra-substituters = [ + # # Populated by the CI in ggerganov/llama.cpp + # "https://llama-cpp.cachix.org" + # + # # A development cache for nixpkgs imported with `config.cudaSupport = true`. + # # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci. + # # This lets one skip building e.g. the CUDA-enabled openmpi. + # # TODO: Replace once nix-community obtains an official one. + # "https://cuda-maintainers.cachix.org" + # ]; + # + # # Verify these are the same keys as published on + # # - https://app.cachix.org/cache/llama-cpp + # # - https://app.cachix.org/cache/cuda-maintainers + # extra-trusted-public-keys = [ + # "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc=" + # "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=" + # ]; + # }; + # ``` # For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl: # diff --git a/ggml-quants.c b/ggml-quants.c index 0750fe1bb..31b053e33 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -515,6 +515,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { quantize_row_q4_0_reference(x, y, k); } + void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { const int qk = QK4_1; @@ -3039,6 +3040,197 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int return nrow * row_size; } +static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) { + static_assert(QK4_0 == 32, "QK4_0 must be 32"); + + if (!quant_weights) { + quantize_row_q4_0_reference(x, y, n_per_row); + return; + } + + float weight[QK4_0]; + int8_t L[QK4_0]; + + float sum_x2 = 0; + for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j]; + float sigma2 = sum_x2/n_per_row; + + const int nb = n_per_row/QK4_0; + for (int ib = 0; ib < nb; ++ib) { + const float * xb = x + QK4_0 * ib; + const float * qw = quant_weights + QK4_0 * ib; + for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight); + y[ib].d = GGML_FP32_TO_FP16(d); + for (int j = 0; j < 16; ++j) { + y[ib].qs[j] = L[j] | (L[j+16] << 4); + } + } +} + +size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + if (!quant_weights) { + return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist); + } + int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row); + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + return nrow * row_size; +} + +static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) { + static_assert(QK4_1 == 32, "QK4_1 must be 32"); + + if (!quant_weights) { + quantize_row_q4_1_reference(x, y, n_per_row); + return; + } + + float weight[QK4_1]; + uint8_t L[QK4_1], Laux[QK4_1]; + + float sum_x2 = 0; + for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j]; + float sigma2 = sum_x2/n_per_row; + + const int nb = n_per_row/QK4_1; + for (int ib = 0; ib < nb; ++ib) { + const float * xb = x + QK4_1 * ib; + const float * qw = quant_weights + QK4_1 * ib; + for (int j = 0; j < QK4_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + float min; + float d = make_qkx3_quants(QK4_1, 15, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false); + y[ib].d = GGML_FP32_TO_FP16(d); + y[ib].m = GGML_FP32_TO_FP16(-min); + for (int j = 0; j < 16; ++j) { + y[ib].qs[j] = L[j] | (L[j+16] << 4); + } + } +} + +size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + if (!quant_weights) { + return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist); + } + int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row); + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + return nrow * row_size; +} + +static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) { + static_assert(QK5_0 == 32, "QK5_0 must be 32"); + + if (!quant_weights) { + quantize_row_q5_0_reference(x, y, n_per_row); + return; + } + + float weight[QK5_0]; + int8_t L[QK5_0]; + + float sum_x2 = 0; + for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j]; + float sigma2 = sum_x2/n_per_row; + + const int nb = n_per_row/QK5_0; + for (int ib = 0; ib < nb; ++ib) { + const float * xb = x + QK5_0 * ib; + const float * qw = quant_weights + QK5_0 * ib; + for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight); + y[ib].d = GGML_FP32_TO_FP16(d); + + uint32_t qh = 0; + + for (int j = 0; j < 16; ++j) { + const uint8_t xi0 = L[j]; + const uint8_t xi1 = L[j+16]; + y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); + } + + memcpy(&y[ib].qh, &qh, sizeof(qh)); + } +} + +size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + if (!quant_weights) { + return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist); + } + int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row); + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + return nrow * row_size; +} + +static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) { + static_assert(QK5_1 == 32, "QK5_1 must be 32"); + + if (!quant_weights) { + quantize_row_q5_1_reference(x, y, n_per_row); + return; + } + + float weight[QK5_1]; + uint8_t L[QK5_1], Laux[QK5_1]; + + float sum_x2 = 0; + for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j]; + float sigma2 = sum_x2/n_per_row; + + const int nb = n_per_row/QK5_1; + for (int ib = 0; ib < nb; ++ib) { + const float * xb = x + QK5_1 * ib; + const float * qw = quant_weights + QK5_1 * ib; + for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + float min; + float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false); + y[ib].d = GGML_FP32_TO_FP16(d); + y[ib].m = GGML_FP32_TO_FP16(-min); + + uint32_t qh = 0; + for (int j = 0; j < 16; ++j) { + const uint8_t xi0 = L[j]; + const uint8_t xi1 = L[j+16]; + y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); + } + memcpy(&y[ib].qh, &qh, sizeof(qh)); + } +} + +size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + if (!quant_weights) { + return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist); + } + int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row); + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + return nrow * row_size; +} + // ====================== "True" 2-bit (de)-quantization static const uint64_t iq2xxs_grid[256] = { diff --git a/ggml-quants.h b/ggml-quants.h index 99467936a..d7fefdb54 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -253,3 +253,7 @@ size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); diff --git a/ggml.c b/ggml.c index 5779f32d2..d7e01b81f 100644 --- a/ggml.c +++ b/ggml.c @@ -18674,26 +18674,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i case GGML_TYPE_Q4_0: { GGML_ASSERT(start % QK4_0 == 0); - block_q4_0 * block = (block_q4_0*)dst + start / QK4_0; - result = ggml_quantize_q4_0(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_Q4_1: { GGML_ASSERT(start % QK4_1 == 0); - block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; - result = ggml_quantize_q4_1(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_Q5_0: { GGML_ASSERT(start % QK5_0 == 0); - block_q5_0 * block = (block_q5_0*)dst + start / QK5_0; - result = ggml_quantize_q5_0(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_Q5_1: { GGML_ASSERT(start % QK5_1 == 0); - block_q5_1 * block = (block_q5_1*)dst + start / QK5_1; - result = ggml_quantize_q5_1(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_Q8_0: { diff --git a/llama.cpp b/llama.cpp index 46c4d11c8..765d20ddb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8374,6 +8374,8 @@ struct quantize_state_internal { int n_k_quantized = 0; int n_fallback = 0; + bool has_imatrix = false; + quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) : model(model) , params(params) @@ -8546,6 +8548,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { new_type = GGML_TYPE_Q5_K; } + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) + && qs.has_imatrix && i_layer < n_layer/8) { + // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0. + // We only do it when an imatrix is provided because a) we want to make sure that one can always get the + // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. + new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; + } ++qs.i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { @@ -8669,6 +8678,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s imatrix_data = static_cast>*>(params->imatrix); if (imatrix_data) { LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size())); + qs.has_imatrix = true; } }