Merge branch 'master' into removing-extraneous-nil-check
This commit is contained in:
commit
7434324414
9 changed files with 393 additions and 42 deletions
|
@ -1799,7 +1799,7 @@ int main(int argc, char ** argv) {
|
||||||
std::vector<llama_token> train_tokens;
|
std::vector<llama_token> train_tokens;
|
||||||
std::vector<size_t> train_samples_begin;
|
std::vector<size_t> train_samples_begin;
|
||||||
std::vector<size_t> train_samples_size;
|
std::vector<size_t> train_samples_size;
|
||||||
printf("%s: tokenize training data\n", __func__);
|
printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
|
||||||
tokenize_file(lctx,
|
tokenize_file(lctx,
|
||||||
params.common.fn_train_data,
|
params.common.fn_train_data,
|
||||||
params.common.sample_start,
|
params.common.sample_start,
|
||||||
|
|
|
@ -428,6 +428,7 @@ static std::vector<float> hellaswag_evaluate_tokens(
|
||||||
for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
|
for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
|
||||||
size_t n_tokens = tokens.size() - i_chunk * n_batch;
|
size_t n_tokens = tokens.size() - i_chunk * n_batch;
|
||||||
n_tokens = std::min(n_tokens, size_t(n_batch));
|
n_tokens = std::min(n_tokens, size_t(n_batch));
|
||||||
|
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||||
return {};
|
return {};
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Function calling example using pydantic models.
|
# Function calling example using pydantic models.
|
||||||
|
import datetime
|
||||||
import json
|
import json
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Union, Optional
|
from typing import Union, Optional
|
||||||
|
@ -8,7 +8,8 @@ import requests
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
import importlib
|
import importlib
|
||||||
from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation
|
from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
|
||||||
|
|
||||||
|
|
||||||
# Function to get completion on the llama.cpp server with grammar.
|
# Function to get completion on the llama.cpp server with grammar.
|
||||||
def create_completion(prompt, grammar):
|
def create_completion(prompt, grammar):
|
||||||
|
@ -134,3 +135,121 @@ text = create_completion(prompt=prompt, grammar=gbnf_grammar)
|
||||||
json_data = json.loads(text)
|
json_data = json.loads(text)
|
||||||
|
|
||||||
print(Book(**json_data))
|
print(Book(**json_data))
|
||||||
|
# An example of parallel function calling with a Python function, a pydantic function model, and an OpenAI-like function definition.
|
||||||
|
|
||||||
|
def get_current_datetime(output_format: Optional[str] = None):
    """
    Get the current date and time in the given format.
    Args:
         output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
    """
    # Only a missing (None) format falls back to the default; any other value,
    # including an empty string, is handed to strftime unchanged.
    chosen_format = '%Y-%m-%d %H:%M:%S' if output_format is None else output_format
    current_moment = datetime.datetime.now()
    return current_moment.strftime(chosen_format)
|
||||||
|
|
||||||
|
|
||||||
|
# Operations understood by the calculator tool below.
class MathOperation(Enum):
    ADD = "add"
    SUBTRACT = "subtract"
    MULTIPLY = "multiply"
    DIVIDE = "divide"


# Pydantic calculator tool for the agent: add, subtract, multiply or divide two numbers.
# The class docstring and the field descriptions are used in the system prompt, so they are kept verbatim.
class Calculator(BaseModel):
    """
    Perform a math operation on two numbers.
    """
    number_one: Union[int, float] = Field(..., description="First number.")
    operation: MathOperation = Field(..., description="Math operation to perform.")
    number_two: Union[int, float] = Field(..., description="Second number.")

    def run(self):
        # Apply `operation` to the two operands and return the result.
        # The table is a local (not a class attribute) so pydantic does not treat it as a field;
        # the lambdas defer evaluation so only the selected operation is actually computed.
        left, right = self.number_one, self.number_two
        handlers = {
            MathOperation.ADD: lambda: left + right,
            MathOperation.SUBTRACT: lambda: left - right,
            MathOperation.MULTIPLY: lambda: left * right,
            MathOperation.DIVIDE: lambda: left / right,
        }
        handler = handlers.get(self.operation)
        if handler is None:
            raise ValueError("Unknown operation.")
        return handler()
|
||||||
|
|
||||||
|
|
||||||
|
# Example function to get the weather
|
||||||
|
def get_current_weather(location, unit=None):
    """Get the current weather in a given location"""
    # Returns a JSON string with canned weather data for a few known cities.
    #
    # location: free-form text; matched by substring against the known city names.
    # unit:     optional. May be an Enum member (its `.value` is reported), a plain
    #           string, or None. The tool schema only requires `location`, so the
    #           original unconditional `unit.value` raised AttributeError whenever
    #           the unit was omitted or passed as a string.
    unit_name = getattr(unit, "value", unit)

    # Checked in this order; the first city name found in `location` wins.
    canned_temperatures = (
        ("London", "42"),
        ("New York", "24"),
        ("North Pole", "-42"),
    )
    for city, temperature in canned_temperatures:
        if city in location:
            return json.dumps({"location": city, "temperature": temperature, "unit": unit_name})

    # Unknown place: echo the location back without a unit.
    return json.dumps({"location": location, "temperature": "unknown"})
|
||||||
|
|
||||||
|
|
||||||
|
# Here is a function definition in OpenAI style
|
||||||
|
current_weather_tool = {
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "get_current_weather",
|
||||||
|
"description": "Get the current weather in a given location",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"location": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The city and state, e.g. San Francisco, CA",
|
||||||
|
},
|
||||||
|
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||||
|
},
|
||||||
|
"required": ["location"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
# Convert OpenAI function definition into pydantic model
|
||||||
|
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
|
||||||
|
# Add the actual function to a pydantic model
|
||||||
|
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
|
||||||
|
|
||||||
|
# Convert normal Python function to a pydantic model
|
||||||
|
current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
|
||||||
|
|
||||||
|
tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
|
||||||
|
|
||||||
|
|
||||||
|
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
|
||||||
|
pydantic_model_list=tool_list, outer_object_name="function",
|
||||||
|
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
|
||||||
|
|
||||||
|
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
|
||||||
|
|
||||||
|
|
||||||
|
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
|
||||||
|
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
|
||||||
|
|
||||||
|
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
|
||||||
|
|
||||||
|
json_data = json.loads(text)
|
||||||
|
|
||||||
|
print(json_data)
|
||||||
|
# Should output something like this:
|
||||||
|
# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
|
||||||
|
|
||||||
|
|
||||||
|
for call in json_data:
|
||||||
|
if call["function"] == "Calculator":
|
||||||
|
print(Calculator(**call["params"]).run())
|
||||||
|
elif call["function"] == "get_current_datetime":
|
||||||
|
print(current_datetime_model(**call["params"]).run())
|
||||||
|
elif call["function"] == "get_current_weather":
|
||||||
|
print(current_weather_tool_model(**call["params"]).run())
|
||||||
|
# Should output something like this:
|
||||||
|
# 2024-01-14 13:36:06
|
||||||
|
# {"location": "London", "temperature": "42", "unit": "celsius"}
|
||||||
|
# 1764
|
||||||
|
|
18
flake.lock
generated
18
flake.lock
generated
|
@ -5,11 +5,11 @@
|
||||||
"nixpkgs-lib": "nixpkgs-lib"
|
"nixpkgs-lib": "nixpkgs-lib"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1701473968,
|
"lastModified": 1704982712,
|
||||||
"narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=",
|
"narHash": "sha256-2Ptt+9h8dczgle2Oo6z5ni5rt/uLMG47UFTR1ry/wgg=",
|
||||||
"owner": "hercules-ci",
|
"owner": "hercules-ci",
|
||||||
"repo": "flake-parts",
|
"repo": "flake-parts",
|
||||||
"rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5",
|
"rev": "07f6395285469419cf9d078f59b5b49993198c00",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -20,11 +20,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1703637592,
|
"lastModified": 1705133751,
|
||||||
"narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=",
|
"narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8",
|
"rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -37,11 +37,11 @@
|
||||||
"nixpkgs-lib": {
|
"nixpkgs-lib": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"dir": "lib",
|
"dir": "lib",
|
||||||
"lastModified": 1701253981,
|
"lastModified": 1703961334,
|
||||||
"narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=",
|
"narHash": "sha256-M1mV/Cq+pgjk0rt6VxoyyD+O8cOUiai8t9Q6Yyq4noY=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58",
|
"rev": "b0d36bd0a420ecee3bc916c91886caca87c894e9",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
57
flake.nix
57
flake.nix
|
@ -6,28 +6,41 @@
|
||||||
flake-parts.url = "github:hercules-ci/flake-parts";
|
flake-parts.url = "github:hercules-ci/flake-parts";
|
||||||
};
|
};
|
||||||
|
|
||||||
# Optional binary cache
|
# There's an optional binary cache available. The details are below, but they're commented out.
|
||||||
nixConfig = {
|
#
|
||||||
extra-substituters = [
|
# Why? The terrible experience of being prompted to accept them on every single Nix command run.
|
||||||
# Populated by the CI in ggerganov/llama.cpp
|
# Plus, there are warnings shown about not being a trusted user on a default Nix install
|
||||||
"https://llama-cpp.cachix.org"
|
# if you *do* say yes to the prompts.
|
||||||
|
#
|
||||||
# A development cache for nixpkgs imported with `config.cudaSupport = true`.
|
# This experience makes having `nixConfig` in a flake a persistent UX problem.
|
||||||
# Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
|
#
|
||||||
# This lets one skip building e.g. the CUDA-enabled openmpi.
|
# To make use of the binary cache, please add the relevant settings to your `nix.conf`.
|
||||||
# TODO: Replace once nix-community obtains an official one.
|
# It's located at `/etc/nix/nix.conf` on non-NixOS systems. On NixOS, adjust the `nix.settings`
|
||||||
"https://cuda-maintainers.cachix.org"
|
# option in your NixOS configuration to add `extra-substituters` and `extra-trusted-public-keys`,
|
||||||
];
|
# as shown below.
|
||||||
|
#
|
||||||
# Verify these are the same keys as published on
|
# ```
|
||||||
# - https://app.cachix.org/cache/llama-cpp
|
# nixConfig = {
|
||||||
# - https://app.cachix.org/cache/cuda-maintainers
|
# extra-substituters = [
|
||||||
extra-trusted-public-keys = [
|
# # Populated by the CI in ggerganov/llama.cpp
|
||||||
"llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
|
# "https://llama-cpp.cachix.org"
|
||||||
"cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
#
|
||||||
];
|
# # A development cache for nixpkgs imported with `config.cudaSupport = true`.
|
||||||
};
|
# # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
|
||||||
|
# # This lets one skip building e.g. the CUDA-enabled openmpi.
|
||||||
|
# # TODO: Replace once nix-community obtains an official one.
|
||||||
|
# "https://cuda-maintainers.cachix.org"
|
||||||
|
# ];
|
||||||
|
#
|
||||||
|
# # Verify these are the same keys as published on
|
||||||
|
# # - https://app.cachix.org/cache/llama-cpp
|
||||||
|
# # - https://app.cachix.org/cache/cuda-maintainers
|
||||||
|
# extra-trusted-public-keys = [
|
||||||
|
# "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
|
||||||
|
# "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
||||||
|
# ];
|
||||||
|
# };
|
||||||
|
# ```
|
||||||
|
|
||||||
# For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
|
# For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
|
||||||
#
|
#
|
||||||
|
|
192
ggml-quants.c
192
ggml-quants.c
|
@ -515,6 +515,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
||||||
quantize_row_q4_0_reference(x, y, k);
|
quantize_row_q4_0_reference(x, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
|
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
|
||||||
const int qk = QK4_1;
|
const int qk = QK4_1;
|
||||||
|
|
||||||
|
@ -3039,6 +3040,197 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int
|
||||||
return nrow * row_size;
|
return nrow * row_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Quantize one row of floats into block_q4_0 blocks, optionally using per-element
// importance weights.
//
//   x             - input row; elements [0, n_per_row) are read
//   y             - output blocks; n_per_row/QK4_0 blocks are written
//   n_per_row     - number of floats in the row
//   quant_weights - per-element weights indexed like x, or NULL to use the
//                   reference quantizer instead
//
// NOTE(review): elements past the last full block (n_per_row % QK4_0) are not
// quantized here; callers presumably guarantee a multiple of QK4_0 - confirm.
static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK4_0 == 32, "QK4_0 must be 32");

    // No importance data: defer entirely to the reference implementation.
    if (!quant_weights) {
        quantize_row_q4_0_reference(x, y, n_per_row);
        return;
    }

    float  block_weight[QK4_0];
    int8_t quants[QK4_0];

    // Mean squared value over the whole row.
    float sum_sq = 0;
    for (int i = 0; i < n_per_row; ++i) {
        sum_sq += x[i]*x[i];
    }
    float mean_sq = sum_sq/n_per_row;

    const int n_blocks = n_per_row/QK4_0;
    for (int ib = 0; ib < n_blocks; ++ib) {
        const float * xb = x + QK4_0 * ib;
        const float * qw = quant_weights + QK4_0 * ib;

        // Scale each importance weight by sqrt(mean_sq + x^2) of its element.
        for (int j = 0; j < QK4_0; ++j) {
            block_weight[j] = qw[j] * sqrtf(mean_sq + xb[j]*xb[j]);
        }

        float scale = make_qx_quants(QK4_0, 8, xb, quants, 1, block_weight);
        y[ib].d = GGML_FP32_TO_FP16(scale);

        // Element j goes into the low nibble, element j+16 into the high nibble.
        for (int j = 0; j < 16; ++j) {
            y[ib].qs[j] = quants[j] | (quants[j+16] << 4);
        }
    }
}
|
||||||
|
|
||||||
|
size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||||
|
if (!quant_weights) {
|
||||||
|
return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
||||||
|
}
|
||||||
|
int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
||||||
|
char * qrow = (char *)dst;
|
||||||
|
for (int row = 0; row < nrow; ++row) {
|
||||||
|
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
|
||||||
|
src += n_per_row;
|
||||||
|
qrow += row_size;
|
||||||
|
}
|
||||||
|
return nrow * row_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Quantize one row of floats into block_q4_1 blocks (scale + minimum per block),
// optionally using per-element importance weights.
//
//   x             - input row; elements [0, n_per_row) are read
//   y             - output blocks; n_per_row/QK4_1 blocks are written
//   n_per_row     - number of floats in the row
//   quant_weights - per-element weights indexed like x, or NULL to use the
//                   reference quantizer instead
static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK4_1 == 32, "QK4_1 must be 32");

    // No importance data: defer entirely to the reference implementation.
    if (!quant_weights) {
        quantize_row_q4_1_reference(x, y, n_per_row);
        return;
    }

    float   block_weight[QK4_1];
    uint8_t quants[QK4_1];
    uint8_t scratch[QK4_1];   // work buffer handed to make_qkx3_quants

    // Mean squared value over the whole row.
    float sum_sq = 0;
    for (int i = 0; i < n_per_row; ++i) {
        sum_sq += x[i]*x[i];
    }
    float mean_sq = sum_sq/n_per_row;

    const int n_blocks = n_per_row/QK4_1;
    for (int ib = 0; ib < n_blocks; ++ib) {
        const float * xb = x + QK4_1 * ib;
        const float * qw = quant_weights + QK4_1 * ib;

        // Scale each importance weight by sqrt(mean_sq + x^2) of its element.
        for (int j = 0; j < QK4_1; ++j) {
            block_weight[j] = qw[j] * sqrtf(mean_sq + xb[j]*xb[j]);
        }

        float block_min;
        float scale = make_qkx3_quants(QK4_1, 15, xb, block_weight, quants, &block_min, scratch, -0.9f, 0.05f, 36, false);
        y[ib].d = GGML_FP32_TO_FP16(scale);
        y[ib].m = GGML_FP32_TO_FP16(-block_min);   // stored with the sign flipped

        // Element j goes into the low nibble, element j+16 into the high nibble.
        for (int j = 0; j < 16; ++j) {
            y[ib].qs[j] = quants[j] | (quants[j+16] << 4);
        }
    }
}
|
||||||
|
|
||||||
|
size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||||
|
if (!quant_weights) {
|
||||||
|
return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
||||||
|
}
|
||||||
|
int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
||||||
|
char * qrow = (char *)dst;
|
||||||
|
for (int row = 0; row < nrow; ++row) {
|
||||||
|
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
|
||||||
|
src += n_per_row;
|
||||||
|
qrow += row_size;
|
||||||
|
}
|
||||||
|
return nrow * row_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Quantize one row of floats into block_q5_0 blocks, optionally using per-element
// importance weights.
//
//   x             - input row; elements [0, n_per_row) are read
//   y             - output blocks; n_per_row/QK5_0 blocks are written
//   n_per_row     - number of floats in the row
//   quant_weights - per-element weights indexed like x, or NULL to use the
//                   reference quantizer instead
static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK5_0 == 32, "QK5_0 must be 32");

    // No importance data: defer entirely to the reference implementation.
    if (!quant_weights) {
        quantize_row_q5_0_reference(x, y, n_per_row);
        return;
    }

    float  block_weight[QK5_0];
    int8_t quants[QK5_0];

    // Mean squared value over the whole row.
    float sum_sq = 0;
    for (int i = 0; i < n_per_row; ++i) {
        sum_sq += x[i]*x[i];
    }
    float mean_sq = sum_sq/n_per_row;

    const int n_blocks = n_per_row/QK5_0;
    for (int ib = 0; ib < n_blocks; ++ib) {
        const float * xb = x + QK5_0 * ib;
        const float * qw = quant_weights + QK5_0 * ib;

        // Scale each importance weight by sqrt(mean_sq + x^2) of its element.
        for (int j = 0; j < QK5_0; ++j) {
            block_weight[j] = qw[j] * sqrtf(mean_sq + xb[j]*xb[j]);
        }

        float scale = make_qx_quants(QK5_0, 16, xb, quants, 1, block_weight);
        y[ib].d = GGML_FP32_TO_FP16(scale);

        uint32_t high_bits = 0;

        for (int j = 0; j < 16; ++j) {
            const uint8_t first  = quants[j];
            const uint8_t second = quants[j+16];

            // Low four bits of each value share one byte: j low nibble, j+16 high nibble.
            y[ib].qs[j] = (first & 0x0F) | ((second & 0x0F) << 4);

            // Bit 4 of each value is collected in high_bits: bit j for the first
            // half of the block, bit j + QK5_0/2 for the second half.
            high_bits |= ((first  & 0x10u) >> 4) << (j + 0);
            high_bits |= ((second & 0x10u) >> 4) << (j + QK5_0/2);
        }

        // memcpy rather than a direct store, as in the original code.
        memcpy(&y[ib].qh, &high_bits, sizeof(high_bits));
    }
}
|
||||||
|
|
||||||
|
size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||||
|
if (!quant_weights) {
|
||||||
|
return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
||||||
|
}
|
||||||
|
int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
||||||
|
char * qrow = (char *)dst;
|
||||||
|
for (int row = 0; row < nrow; ++row) {
|
||||||
|
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
|
||||||
|
src += n_per_row;
|
||||||
|
qrow += row_size;
|
||||||
|
}
|
||||||
|
return nrow * row_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Quantize one row of floats into block_q5_1 blocks (scale + minimum per block),
// optionally using per-element importance weights.
//
//   x             - input row; elements [0, n_per_row) are read
//   y             - output blocks; n_per_row/QK5_1 blocks are written
//   n_per_row     - number of floats in the row
//   quant_weights - per-element weights indexed like x, or NULL to use the
//                   reference quantizer instead
static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK5_1 == 32, "QK5_1 must be 32");

    // No importance data: defer entirely to the reference implementation.
    if (!quant_weights) {
        quantize_row_q5_1_reference(x, y, n_per_row);
        return;
    }

    float weight[QK5_1];
    uint8_t L[QK5_1], Laux[QK5_1];

    // Mean squared value over the whole row.
    float sum_x2 = 0;
    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
    float sigma2 = sum_x2/n_per_row;

    const int nb = n_per_row/QK5_1;
    for (int ib = 0; ib < nb; ++ib) {
        const float * xb = x + QK5_1 * ib;
        const float * qw = quant_weights + QK5_1 * ib;
        // Scale each importance weight by sqrt(sigma2 + x^2) of its element.
        for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
        float min;
        float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
        y[ib].d = GGML_FP32_TO_FP16(d);
        y[ib].m = GGML_FP32_TO_FP16(-min);   // stored with the sign flipped

        uint32_t qh = 0;
        for (int j = 0; j < 16; ++j) {
            const uint8_t xi0 = L[j];
            const uint8_t xi1 = L[j+16];
            // Low four bits of each value share one byte: j low nibble, j+16 high nibble.
            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
            // get the 5-th bit and store it in qh at the right position
            // (second half offset uses this format's own block size, QK5_1,
            // instead of borrowing QK5_0 from the sibling format)
            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
        }
        memcpy(&y[ib].qh, &qh, sizeof(qh));
    }
}
|
||||||
|
|
||||||
|
size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||||
|
if (!quant_weights) {
|
||||||
|
return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
||||||
|
}
|
||||||
|
int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
||||||
|
char * qrow = (char *)dst;
|
||||||
|
for (int row = 0; row < nrow; ++row) {
|
||||||
|
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
|
||||||
|
src += n_per_row;
|
||||||
|
qrow += row_size;
|
||||||
|
}
|
||||||
|
return nrow * row_size;
|
||||||
|
}
|
||||||
|
|
||||||
// ====================== "True" 2-bit (de)-quantization
|
// ====================== "True" 2-bit (de)-quantization
|
||||||
|
|
||||||
static const uint64_t iq2xxs_grid[256] = {
|
static const uint64_t iq2xxs_grid[256] = {
|
||||||
|
|
|
@ -253,3 +253,7 @@ size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row,
|
||||||
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
|
28
ggml.c
28
ggml.c
|
@ -18674,26 +18674,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK4_0 == 0);
|
GGML_ASSERT(start % QK4_0 == 0);
|
||||||
block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q4_0(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK4_1 == 0);
|
GGML_ASSERT(start % QK4_1 == 0);
|
||||||
block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q4_1(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK5_0 == 0);
|
GGML_ASSERT(start % QK5_0 == 0);
|
||||||
block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q5_0(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK5_1 == 0);
|
GGML_ASSERT(start % QK5_1 == 0);
|
||||||
block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q5_1(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
{
|
{
|
||||||
|
|
10
llama.cpp
10
llama.cpp
|
@ -8374,6 +8374,8 @@ struct quantize_state_internal {
|
||||||
int n_k_quantized = 0;
|
int n_k_quantized = 0;
|
||||||
int n_fallback = 0;
|
int n_fallback = 0;
|
||||||
|
|
||||||
|
bool has_imatrix = false;
|
||||||
|
|
||||||
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
|
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
|
||||||
: model(model)
|
: model(model)
|
||||||
, params(params)
|
, params(params)
|
||||||
|
@ -8546,6 +8548,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
|
||||||
new_type = GGML_TYPE_Q5_K;
|
new_type = GGML_TYPE_Q5_K;
|
||||||
}
|
}
|
||||||
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
|
||||||
|
&& qs.has_imatrix && i_layer < n_layer/8) {
|
||||||
|
// Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
|
||||||
|
// We only do it when an imatrix is provided because a) we want to make sure that one can always get the
|
||||||
|
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
|
||||||
|
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
|
||||||
|
}
|
||||||
++qs.i_feed_forward_w2;
|
++qs.i_feed_forward_w2;
|
||||||
} else if (name.find("attn_output.weight") != std::string::npos) {
|
} else if (name.find("attn_output.weight") != std::string::npos) {
|
||||||
if (arch != LLM_ARCH_FALCON) {
|
if (arch != LLM_ARCH_FALCON) {
|
||||||
|
@ -8669,6 +8678,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
|
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
|
||||||
if (imatrix_data) {
|
if (imatrix_data) {
|
||||||
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
|
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
|
||||||
|
qs.has_imatrix = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue