Commit 8ea2402195 by themanyone, 2024-07-22 13:56:52 -08:00
37 changed files with 1664 additions and 1401 deletions


@ -860,7 +860,7 @@ jobs:
mkdir build
cd build
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
- name: Determine tag name
id: tag


@ -3,7 +3,7 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)


@ -48,7 +48,7 @@ class Model:
dir_model: Path
ftype: gguf.LlamaFileType
fname_out: Path | None
fname_out: Path
is_big_endian: bool
endianess: gguf.GGUFEndian
use_temp_file: bool
@ -62,11 +62,12 @@ class Model:
gguf_writer: gguf.GGUFWriter
model_name: str | None
metadata_override: Path | None
dir_model_card: Path
# subclasses should define this!
model_arch: gguf.MODEL_ARCH
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path | None, is_big_endian: bool = False,
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
use_temp_file: bool = False, eager: bool = False,
metadata_override: Path | None = None, model_name: str | None = None,
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
@ -90,6 +91,7 @@ class Model:
self.tensor_names = None
self.metadata_override = metadata_override
self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
if self.ftype == gguf.LlamaFileType.GUESSED:
@ -237,6 +239,10 @@ class Model:
self.gguf_writer.add_expert_used_count(n_experts_used)
logger.info(f"gguf: experts used count = {n_experts_used}")
if (head_dim := self.hparams.get("head_dim")) is not None:
self.gguf_writer.add_key_length(head_dim)
self.gguf_writer.add_value_length(head_dim)
self.gguf_writer.add_file_type(self.ftype)
logger.info(f"gguf: file type = {self.ftype}")
@ -345,7 +351,7 @@ class Model:
total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model, self.model_name, total_params)
self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
# Fallback to model directory name if metadata name is still missing
if self.metadata.name is None:
@ -359,27 +365,22 @@ class Model:
output_type: str = self.ftype.name.partition("_")[2]
# Filename Output
# Note: `not is_dir()` is used because `.is_file()` will not detect
# file template strings as it doesn't actually exist as a file
if self.fname_out is not None and not self.fname_out.is_dir():
# Output path is a custom defined templated filename
# Process templated file name with the output ftype, useful with the "auto" ftype
self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
else:
if self.fname_out.is_dir():
# Generate default filename based on model specification and available metadata
if not vocab_only:
fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
else:
fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
# Check if preferred output directory path was provided
if self.fname_out is not None and self.fname_out.is_dir():
# output path is a directory
self.fname_out = self.fname_out / f"{fname_default}.gguf"
else:
# output in the same directory as the model by default
self.fname_out = self.dir_model / f"{fname_default}.gguf"
# Use the default filename
self.fname_out = self.fname_out / f"{fname_default}.gguf"
else:
# Output path is a custom defined templated filename
# Note: `not is_dir()` is used because `.is_file()` will not detect
# file template strings as it doesn't actually exist as a file
# Process templated file name with the output ftype, useful with the "auto" ftype
self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
self.set_type()
@ -593,6 +594,15 @@ class Model:
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
# ref: https://huggingface.co/core42/jais-13b
res = "jais"
if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
# ref: https://huggingface.co/WisdomShell/CodeShell-7B
res = "codeshell"
if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
# ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
res = "tekken"
if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
# ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
res = "smollm"
if res is None:
logger.warning("\n")
@ -733,7 +743,7 @@ class Model:
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
if (token_id >= vocab_size):
if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
@ -750,7 +760,8 @@ class Model:
token_id = int(token_id)
token: str = token_data["content"]
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert tokens[token_id] == token.encode("utf-8")
if tokens[token_id] != token.encode("utf-8"):
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
if token_data.get("special") or self.does_token_look_special(token):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
else:
@ -1309,6 +1320,7 @@ class RefactModel(Model):
special_vocab._set_special_token("prefix", 1)
special_vocab._set_special_token("suffix", 3)
special_vocab._set_special_token("middle", 2)
special_vocab.chat_template = None # do not add it twice
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
@ -1479,7 +1491,12 @@ class LlamaModel(Model):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
if "head_dim" in hparams:
rope_dim = hparams["head_dim"]
else:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
@ -1994,7 +2011,7 @@ class Phi3MiniModel(Model):
for key in added_tokens_json:
token_id = added_tokens_json[key]
if (token_id >= vocab_size):
if token_id >= vocab_size:
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
@ -2011,7 +2028,8 @@ class Phi3MiniModel(Model):
token_id = int(token_id)
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert tokens[token_id] == token
if tokens[token_id] != token:
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@ -2027,7 +2045,8 @@ class Phi3MiniModel(Model):
token_id = int(foken_data["id"])
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert tokens[token_id] == token
if tokens[token_id] != token:
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@ -2068,7 +2087,7 @@ class Phi3MiniModel(Model):
# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
if (rope_scaling is None):
if rope_scaling is None:
return
scale = max_pos_embds / orig_max_pos_embds
@ -2266,7 +2285,8 @@ class InternLM2Model(Model):
chat_eos_token_id = token_id
token = token.encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert(tokens[token_id] == token)
if tokens[token_id] != token:
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@ -2285,7 +2305,8 @@ class InternLM2Model(Model):
chat_eos_token_id = token_id
token = token.encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert(tokens[token_id] == token)
if tokens[token_id] != token:
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@ -2471,6 +2492,7 @@ class GemmaModel(Model):
special_vocab._set_special_token("middle", 68)
special_vocab._set_special_token("fsep", 70)
special_vocab._set_special_token("eot", 107)
special_vocab.chat_template = None # do not add it twice
special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False)
@ -2712,7 +2734,7 @@ class JinaBertV2Model(BertModel):
yield name, data
def set_vocab(self, *args, **kwargs):
def set_vocab(self):
tokenizer_class = 'BertTokenizer'
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_class = json.load(f)['tokenizer_class']
@ -2860,7 +2882,7 @@ class ArcticModel(Model):
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
for token_id, token_json in added_tokens_decoder.items():
token_id = int(token_id)
if (token_id >= vocab_size):
if token_id >= vocab_size:
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
@ -3109,7 +3131,7 @@ class T5Model(Model):
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
if (token_id >= vocab_size):
if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
@ -3420,7 +3442,6 @@ class ChatGLMModel(Model):
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
n_head_kv = self.hparams.get("multi_query_group_num", n_head)
@ -3625,10 +3646,10 @@ def main() -> None:
logger.error("Error: Cannot use temp file when splitting")
sys.exit(1)
fname_out = None
if args.outfile is not None:
fname_out = args.outfile
else:
fname_out = dir_model
logger.info(f"Loading model: {dir_model.name}")
@ -3659,7 +3680,6 @@ def main() -> None:
else:
logger.info("Exporting model...")
model_instance.write()
assert model_instance.fname_out is not None
out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
logger.info(f"Model successfully exported to {out_path}")


@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2:
token = sys.argv[1]
@ -91,6 +91,9 @@ models = [
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
]
@ -99,8 +102,8 @@ def download_file_with_auth(url, token, save_path):
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, 'wb') as f:
f.write(response.content)
with open(save_path, 'wb') as downloaded_file:
downloaded_file.write(response.content)
logger.info(f"File {save_path} downloaded successfully")
@ -159,7 +162,7 @@ for model in models:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(chktxt)
chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
@ -191,7 +194,7 @@ src_func = f"""
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer
chktxt = {repr(chktxt)}
chktxt = {repr(CHK_TXT)}
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
@ -287,7 +290,7 @@ tests = [
"333333333",
"Cửa Việt", # llama-bpe fails on this
" discards",
chktxt,
CHK_TXT,
]
# write the tests to ./models/ggml-vocab-{name}.gguf.inp


@ -132,6 +132,10 @@ class Tensor:
class GGMLModel:
file_format: GGMLFormat
format_version: int
def __init__(self):
self.hyperparameters = None
self.vocab = None
@ -290,7 +294,7 @@ class GGMLToGGUF:
if self.vocab_override is not None:
vo = self.vocab_override
logger.info('* Adding vocab item(s)')
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes)
scores.append(score)
toktypes.append(ttype)


@ -290,7 +290,7 @@ if __name__ == '__main__':
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_lora / 'ggml-lora-{ftype}.gguf'
fname_out = dir_lora
if os.path.exists(input_model):
# lazy import load_file only if lora is in safetensors format.
@ -304,12 +304,6 @@ if __name__ == '__main__':
# load base model
logger.info(f"Loading base model: {dir_base_model.name}")
hparams = Model.load_hparams(dir_base_model)
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
alpha: float = lparams["lora_alpha"]
with torch.inference_mode():
try:
model_class = Model.from_model_architecture(hparams["architectures"][0])
@ -320,12 +314,21 @@ if __name__ == '__main__':
class LoraModel(model_class):
model_arch = model_class.model_arch
lora_alpha: float
def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
super().__init__(*args, **kwargs)
self.dir_model_card = dir_lora_model
self.lora_alpha = float(lora_alpha)
def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
def set_gguf_parameters(self):
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha))
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
super().set_gguf_parameters()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
@ -368,6 +371,11 @@ if __name__ == '__main__':
yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b)
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
alpha: float = lparams["lora_alpha"]
model_instance = LoraModel(
dir_base_model,
ftype,
@ -376,6 +384,8 @@ if __name__ == '__main__':
use_temp_file=False,
eager=args.no_lazy,
dry_run=args.dry_run,
dir_lora_model=dir_lora,
lora_alpha=alpha,
)
logger.info("Exporting model...")


@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
if (!ctx) {
fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
return false;
}
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));


@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
return env->NewStringUTF("");
return nullptr;
}
auto new_token_chars = llama_token_to_piece(context, new_token_id);


@ -26,11 +26,12 @@ actor LlamaContext {
private var context: OpaquePointer
private var batch: llama_batch
private var tokens_list: [llama_token]
var is_done: Bool = false
/// This variable is used to store temporarily invalid cchars
private var temporary_invalid_cchars: [CChar]
var n_len: Int32 = 64
var n_len: Int32 = 1024
var n_cur: Int32 = 0
var n_decode: Int32 = 0
@ -160,6 +161,7 @@ actor LlamaContext {
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n")
is_done = true
let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
return new_token_str


@ -132,7 +132,7 @@ class LlamaState: ObservableObject {
messageLog += "\(text)"
Task.detached {
while await llamaContext.n_cur < llamaContext.n_len {
while await !llamaContext.is_done {
let result = await llamaContext.completion_loop()
await MainActor.run {
self.messageLog += "\(result)"

examples/pydantic_models_to_grammar_examples.py (Normal file → Executable file)

@ -1,8 +1,15 @@
# Function calling example using pydantic models.
#!/usr/bin/env python3
"""Function calling example using pydantic models."""
from __future__ import annotations
import argparse
import datetime
import json
import logging
import textwrap
import sys
from enum import Enum
from typing import Optional, Union
@ -12,30 +19,54 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert
create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
# Function to get completion on the llama.cpp server with grammar.
def create_completion(prompt, grammar):
def create_completion(host, prompt, gbnf_grammar):
"""Calls the /completion API on llama-server.
See
https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
"""
print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
headers = {"Content-Type": "application/json"}
data = {"prompt": prompt, "grammar": grammar}
response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
data = response.json()
data = {"prompt": prompt, "grammar": gbnf_grammar}
result = requests.post(f"http://{host}/completion", headers=headers, json=data).json()
assert data.get("error") is None, data
print(data["content"])
return data["content"]
logging.info("Result: %s", result)
content = result["content"]
print(f" Model: {result['model']}")
print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}")
return content
# A function for the agent to send a message to the user.
class SendMessageToUser(BaseModel):
"""
Send a message to the User.
"""
"""Send a message to the User."""
chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
message: str = Field(..., description="Message you want to send to the user.")
def run(self):
print(self.message)
print(f"SendMessageToUser: {self.message}")
def example_rce(host):
"""Minimal test case where the LLM call an arbitrary python function."""
print("- example_rce")
tools = [SendMessageToUser]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tools, outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
user_message = "What is 42 * 42?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
tools_map = {tool.__name__:tool for tool in tools}
# This finds "SendMessageToUser":
tool = tools_map.get(json_data["function"])
if not tool:
print(f"Error: unknown tool {json_data['function']}")
return 1
tool(**json_data["function_parameters"]).run()
return 0
# Enum for the calculator tool.
@ -46,11 +77,11 @@ class MathOperation(Enum):
DIVIDE = "divide"
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
# Simple pydantic calculator tool for the agent that can add, subtract,
# multiply, and divide. Docstring and description of fields will be used in
# system prompt.
class Calculator(BaseModel):
"""
Perform a math operation on two numbers.
"""
"""Perform a math operation on two numbers."""
number_one: Union[int, float] = Field(..., description="First number.")
operation: MathOperation = Field(..., description="Math operation to perform.")
number_two: Union[int, float] = Field(..., description="Second number.")
@ -68,55 +99,61 @@ class Calculator(BaseModel):
raise ValueError("Unknown operation.")
# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM.
# pydantic_model_list is the list of pydantic models
# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
# outer_object_content is the name of outer object content.
# model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
def example_calculator(host):
"""Have the LLM ask to get a calculation done.
print(gbnf_grammar)
print(documentation)
Here the grammar gets generated by passing the available function models to
generate_gbnf_grammar_and_documentation function. This also generates a
documentation usable by the LLM.
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
pydantic_model_list is the list of pydantic models. outer_object_name is an
optional name for an outer object around the actual model object, like a
"function" object with "function_parameters" which contains the actual model
object. If None, no outer object will be generated. outer_object_content is
the name of the outer object content.
user_message = "What is 42 * 42?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
# This should output something like this:
# {
# "function": "calculator",
# "function_parameters": {
# "number_one": 42,
# "operation": "multiply",
# "number_two": 42
# }
# }
function_dictionary = json.loads(text)
if function_dictionary["function"] == "calculator":
function_parameters = {**function_dictionary["function_parameters"]}
print(Calculator(**function_parameters).run())
# This should output: 1764
model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
"""
print("- example_calculator")
tools = [SendMessageToUser, Calculator]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tools, outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
user_message1 = "What is 42 * 42?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
expected = {
"function": "Calculator",
"function_parameters": {
"number_one": 42,
"operation": "multiply",
"number_two": 42
}
}
if json_data != expected:
print(" Result is not as expected!")
tools_map = {tool.__name__:tool for tool in tools}
# This finds "Calculator":
tool = tools_map.get(json_data["function"])
if not tool:
print(f"Error: unknown tool {json_data['function']}")
return 1
result = tool(**json_data["function_parameters"]).run()
print(f" Call {json_data['function']} gave result {result}")
return 0
# An example of structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text.
class Category(Enum):
"""
The category of the book.
"""
"""The category of the book."""
Fiction = "Fiction"
NonFiction = "Non-Fiction"
class Book(BaseModel):
"""
Represents an entry about a book.
"""
"""Represents an entry about a book."""
title: str = Field(..., description="Title of the book.")
author: str = Field(..., description="Author of the book.")
published_year: Optional[int] = Field(..., description="Publishing year of the book.")
@ -125,33 +162,42 @@ class Book(BaseModel):
summary: str = Field(..., description="Summary of the book.")
# We need no additional parameters other than our list of pydantic models.
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book])
def example_struct(host):
"""A example structured output based on pydantic models.
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
The LLM will create an entry for a Book database out of an unstructured
text. We need no additional parameters other than our list of pydantic
models.
"""
print("- example_struct")
tools = [Book]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 19611963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
# In this case, there's no function nor function_parameters.
# Here the result will vary based on the LLM used.
keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
if keys != sorted(json_data.keys()):
print(f"Unexpected result: {sorted(json_data.keys())}")
return 1
book = Book(**json_data)
print(f" As a Book object: %s" % book)
return 0
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 19611963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
json_data = json.loads(text)
print(Book(**json_data))
# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
def get_current_datetime(output_format: Optional[str] = None):
"""
Get the current date and time in the given format.
"""Get the current date and time in the given format.
Args:
output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
"""
if output_format is None:
output_format = '%Y-%m-%d %H:%M:%S'
return datetime.datetime.now().strftime(output_format)
return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S")
# Example function to get the weather
# Example function to get the weather.
def get_current_weather(location, unit):
"""Get the current weather in a given location"""
if "London" in location:
@ -160,68 +206,107 @@ def get_current_weather(location, unit):
return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
elif "North Pole" in location:
return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
else:
return json.dumps({"location": location, "temperature": "unknown"})
return json.dumps({"location": location, "temperature": "unknown"})
# Here is a function definition in OpenAI style
current_weather_tool = {
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
def example_concurrent(host):
"""An example for parallel function calling with a Python function, a pydantic
function model and an OpenAI-like function definition.
"""
print("- example_concurrent")
# Function definition in OpenAI style.
current_weather_tool = {
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
"required": ["location"],
},
"required": ["location"],
},
},
}
}
# Convert OpenAI function definition into pydantic model.
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
# Add the actual function to a pydantic model.
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
# Convert OpenAI function definition into pydantic model
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
# Add the actual function to a pydantic model
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
# Convert normal Python function to a pydantic model.
current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
# Convert normal Python function to a pydantic model
current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tools, outer_object_name="function",
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
expected = [
{
"function": "get_current_datetime",
"params": {
"output_format": "%Y-%m-%d %H:%M:%S"
}
},
{
"function": "get_current_weather",
"params": {
"location": "London",
"unit": "celsius"
}
},
{
"function": "Calculator",
"params": {
"number_one": 42,
"operation": "multiply",
"number_two": 42
}
}
]
res = 0
if json_data != expected:
print(" Result is not as expected!")
print(" This can happen on highly quantized models")
res = 1
tools_map = {tool.__name__:tool for tool in tools}
for call in json_data:
tool = tools_map.get(call["function"])
if not tool:
print(f"Error: unknown tool {call['function']}")
return 1
result = tool(**call["params"]).run()
print(f" Call {call['function']} returned {result}")
# Should output something like this:
# Call get_current_datetime returned 2024-07-15 09:50:38
# Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
# Call Calculator returned 1764
return res
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tool_list, outer_object_name="function",
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
def main():
parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
args = parser.parse_args()
logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR)
ret = 0
# Comment out below to only run the example you want.
ret = ret or example_rce(args.host)
ret = ret or example_calculator(args.host)
ret = ret or example_struct(args.host)
ret = ret or example_concurrent(args.host)
return ret
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
json_data = json.loads(text)
print(json_data)
# Should output something like this:
# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
for call in json_data:
if call["function"] == "Calculator":
print(Calculator(**call["params"]).run())
elif call["function"] == "get_current_datetime":
print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
elif call["function"] == "get_current_weather":
print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
# Should output something like this:
# 2024-01-14 13:36:06
# {"location": "London", "temperature": "42", "unit": "celsius"}
# 1764
if __name__ == "__main__":
sys.exit(main())


@ -444,7 +444,7 @@ node index.js
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
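
As a minimal sketch of how these fields combine in a request (assuming a local server on the default `127.0.0.1:8080`; the prompt and the concrete values are placeholders):

```python
# Illustrative request only; host, port, and values are assumptions.
import requests

payload = {
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 128,  # generate at most 128 tokens (-1 would mean no limit)
    "n_keep": 32,      # keep the first 32 prompt tokens (BOS excluded) when the context overflows
    "stream": False,   # set to true to receive tokens as they are produced
}
response = requests.post("http://127.0.0.1:8080/completion",
                         headers={"Content-Type": "application/json"},
                         json=payload)
# Non-streaming responses return the generated text in "content"
print(response.json()["content"])
```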

flake.lock (generated)

@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1720768451,
"narHash": "sha256-EYekUHJE2gxeo2pM/zM9Wlqw1Uw2XTJXOSAO79ksc4Y=",
"lastModified": 1721379653,
"narHash": "sha256-8MUgifkJ7lkZs3u99UDZMB4kbOxvMEXQZ31FO3SopZ0=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "7e7c39ea35c5cdd002cd4588b03a3fb9ece6fad9",
"rev": "1d9c2c9b3e71b9ee663d11c5d298727dace8d374",
"type": "github"
},
"original": {


@ -59,6 +59,24 @@ void ggml_cuda_op_mul_mat_q(
case GGML_TYPE_Q6_K:
mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
break;
case GGML_TYPE_IQ2_XXS:
mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
break;
case GGML_TYPE_IQ2_XS:
mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
break;
case GGML_TYPE_IQ2_S:
mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
break;
case GGML_TYPE_IQ3_XXS:
mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
break;
case GGML_TYPE_IQ3_S:
mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
break;
case GGML_TYPE_IQ1_S:
mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
break;
case GGML_TYPE_IQ4_XS:
mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
break;
@ -93,6 +111,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_NL:
mmq_supported = true;

File diff suppressed because it is too large.


@ -23,7 +23,8 @@ SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}
TYPES_MMQ = [
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
"GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
"GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
]
SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.


@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ1_S);


@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ2_S);


@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);


@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);


@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ3_S);


@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);


@ -188,6 +188,27 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
}
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
const int * v, const int * u, const float * d8_0, const float & d8_1) {
float sumf = 0.0f;
#pragma unroll
for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
int sumi = 0;
#pragma unroll
for (int i = i0; i < i0 + QI8_0/2; ++i) {
// SIMD dot product of quantized values
sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
}
sumf += d8_0[i0/(QI8_0/2)]*sumi;
}
return d8_1*sumf;
}
#define VDR_Q2_K_Q8_1_MMVQ 1
#define VDR_Q2_K_Q8_1_MMQ 4


@ -1786,10 +1786,6 @@ static enum ggml_status ggml_metal_graph_compute(
}
};
if (ggml_is_quantized(src0t)) {
GGML_ASSERT(ne00 >= nth0*nth1);
}
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];


@ -4757,7 +4757,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
device const float4 * y4 = (device const float4 *)yb;
yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
for (int row = 0; row < 2; ++row) {
for (int row = 0; row < 2 && first_row + row < ne01; ++row) {
device const block_iq4_nl & xb = x[row*nb + ib];
device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
@ -4789,7 +4789,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
yb += 16 * QK4_NL;
}
for (int row = 0; row < 2; ++row) {
for (int row = 0; row < 2 && first_row + row < ne01; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;

File diff suppressed because it is too large.


@ -19019,7 +19019,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
FILE * fout = ggml_fopen(fname, "wb");
if (!fout) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno));
return;
}
@ -19156,7 +19156,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
{
FILE * fin = ggml_fopen(fname, "rb");
if (!fin) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno));
return result;
}
@ -20830,6 +20830,7 @@ struct gguf_context * gguf_init_empty(void) {
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
FILE * file = ggml_fopen(fname, "rb");
if (!file) {
fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
return NULL;
}
@ -21014,7 +21015,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
gguf_tensor_info_sanitize(info);
// make sure there is no duplicated tensor names
for (uint64_t j = 0; j < i; ++j) {
for (uint64_t j = 0; j < i && ok; ++j) {
if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
ok = false;


@ -54,6 +54,7 @@ class Metadata:
model_card = Metadata.load_model_card(model_path)
hf_params = Metadata.load_hf_parameters(model_path)
# TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter
# heuristics
metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
@ -62,6 +63,7 @@ class Metadata:
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata_override = Metadata.load_metadata_override(metadata_override_path)
metadata.name = metadata_override.get(Keys.General.NAME, metadata.name)
metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author)
metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version)
metadata.organization = metadata_override.get(Keys.General.ORGANIZATION, metadata.organization)
@ -176,6 +178,12 @@ class Metadata:
org_component = None
name_parts: list[str] = model_full_name_component.split('-')
# Remove empty parts
for i in reversed(range(len(name_parts))):
if len(name_parts[i]) == 0:
del name_parts[i]
name_types: list[
set[Literal["basename", "size_label", "finetune", "version", "type"]]
] = [set() for _ in name_parts]
@ -222,9 +230,19 @@ class Metadata:
name_parts[i] = part
# Some easy to recognize finetune names
elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE):
name_types[i].add("finetune")
if part.lower() == "lora":
name_parts[i] = "LoRA"
if total_params < 0 and part.lower() == "lora":
# ignore redundant "lora" in the finetune part when the output is a lora adapter
name_types[i].add("type")
else:
name_types[i].add("finetune")
# Ignore word-based size labels when there is at least a number-based one present
# TODO: should word-based size labels always be removed instead?
if any(c.isdecimal() for n, t in zip(name_parts, name_types) if "size_label" in t for c in n):
for n, t in zip(name_parts, name_types):
if "size_label" in t:
if all(c.isalpha() for c in n):
t.remove("size_label")
at_start = True
# Find the basename through the annotated name
@ -239,18 +257,18 @@ class Metadata:
# Remove the basename annotation from trailing version
for part, t in zip(reversed(name_parts), reversed(name_types)):
if "basename" in t:
if len(t) > 1:
t.remove("basename")
if "basename" in t and len(t) > 1:
t.remove("basename")
else:
break
basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None
size_label = "-".join(s for s, t in zip(name_parts, name_types) if "size_label" in t) or None
# Deduplicate size labels using order-preserving 'dict' ('set' seems to sort the keys)
size_label = "-".join(dict.fromkeys(s for s, t in zip(name_parts, name_types) if "size_label" in t).keys()) or None
finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None
# TODO: should the basename version always be excluded?
# TODO: should multiple versions be joined together?
version = ([v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t] or [None])[-1]
# NOTE: multiple finetune versions are joined together
version = "-".join(v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t) or None
if size_label is None and finetune is None and version is None:
# Too ambiguous, output nothing


@ -50,15 +50,15 @@ def naming_convention(model_name: str | None, base_name: str | None, finetune_st
# Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
if base_name is not None:
name = base_name.strip().title().replace(' ', '-').replace('/', '-')
name = base_name.strip().replace(' ', '-').replace('/', '-')
elif model_name is not None:
name = model_name.strip().title().replace(' ', '-').replace('/', '-')
name = model_name.strip().replace(' ', '-').replace('/', '-')
else:
name = "ggml-model"
parameters = f"-{size_label}" if size_label is not None else ""
finetune = f"-{finetune_string.strip().title().replace(' ', '-')}" if finetune_string is not None else ""
finetune = f"-{finetune_string.strip().replace(' ', '-')}" if finetune_string is not None else ""
version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else ""


@ -4,6 +4,7 @@ from __future__ import annotations
import logging
import argparse
import os
import re
import sys
from pathlib import Path
from typing import Any
@ -244,26 +245,58 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
else:
pretty_type = str(field.types[-1].name)
def escape_markdown_inline_code(value_string):
# Find the longest contiguous sequence of backticks in the string then
# wrap string with appropriate number of backticks required to escape it
max_backticks = max((len(match.group(0)) for match in re.finditer(r'`+', value_string)), default=0)
inline_code_marker = '`' * (max_backticks + 1)
# If the string starts or ends with a backtick, add a space at the beginning and end
if value_string.startswith('`') or value_string.endswith('`'):
value_string = f" {value_string} "
return f"{inline_code_marker}{value_string}{inline_code_marker}"
total_elements = len(field.data)
value = ""
if len(field.types) == 1:
curr_type = field.types[0]
if curr_type == GGUFValueType.STRING:
value = repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60])
truncate_length = 60
value_string = str(bytes(field.parts[-1]), encoding='utf-8')
if len(value_string) > truncate_length:
head = escape_markdown_inline_code(value_string[:truncate_length // 2])
tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
value = "{head}...{tail}".format(head=head, tail=tail)
else:
value = escape_markdown_inline_code(value_string)
elif curr_type in reader.gguf_scalar_to_np:
value = str(field.parts[-1][0])
else:
if field.types[0] == GGUFValueType.ARRAY:
curr_type = field.types[1]
array_elements = []
if curr_type == GGUFValueType.STRING:
render_element = min(5, total_elements)
for element_pos in range(render_element):
value += repr(str(bytes(field.parts[-1 - element_pos]), encoding='utf-8')[:5]) + (", " if total_elements > 1 else "")
truncate_length = 30
value_string = str(bytes(field.parts[-1 - (total_elements - element_pos - 1) * 2]), encoding='utf-8')
if len(value_string) > truncate_length:
head = escape_markdown_inline_code(value_string[:truncate_length // 2])
tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
value = "{head}...{tail}".format(head=head, tail=tail)
else:
value = escape_markdown_inline_code(value_string)
array_elements.append(value)
elif curr_type in reader.gguf_scalar_to_np:
render_element = min(7, total_elements)
for element_pos in range(render_element):
value += str(field.parts[-1 - element_pos][0]) + (", " if total_elements > 1 else "")
value = f'[ {value}{" ..." if total_elements > 1 else ""} ]'
array_elements.append(str(field.parts[-1 - (total_elements - element_pos - 1)][0]))
value = f'[ {", ".join(array_elements).strip()}{", ..." if total_elements > len(array_elements) else ""} ]'
kv_dump_table.append({"n":n, "pretty_type":pretty_type, "total_elements":total_elements, "field_name":field.name, "value":value})
kv_dump_table_header_map = [
@ -382,7 +415,7 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
markdown_content += f"- Percentage of total elements: {group_percentage:.2f}%\n"
markdown_content += "\n\n"
print(markdown_content) # noqa: NP100
print(markdown_content) # noqa: NP100
def main() -> None:


@ -54,7 +54,7 @@ class TestMetadataMethod(unittest.TestCase):
self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"),
('Meta-Llama-3-8B', "NousResearch", 'Meta-Llama-3', None, None, '8B'))
# Can't detect all non standard form in a heuristically safe way... best to err in caution and output nothing...
# Non standard naming
self.assertEqual(gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"),
('Qwen1.5-MoE-A2.7B-Chat', None, 'Qwen1.5-MoE', 'Chat', None, 'A2.7B'))
@ -71,7 +71,7 @@ class TestMetadataMethod(unittest.TestCase):
self.assertEqual(gguf.Metadata.get_model_id_components("delphi-suite/stories-llama2-50k", 50 * 10**3),
('stories-llama2-50k', 'delphi-suite', 'stories-llama2', None, None, '50K'))
# None standard and not easy to disambiguate
# Non standard and not easy to disambiguate
self.assertEqual(gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"),
('DeepSeek-Coder-V2-Lite-Instruct', None, 'DeepSeek-Coder-V2-Lite', 'Instruct', None, None))
@ -123,6 +123,51 @@ class TestMetadataMethod(unittest.TestCase):
self.assertEqual(gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"),
('bloom-7b1-petals', 'bigscience', 'bloom', 'petals', None, '7.1B'))
# Ignore full-text size labels when there are number-based ones, and deduplicate size labels
self.assertEqual(gguf.Metadata.get_model_id_components("MaziyarPanahi/GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1"),
('GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1', 'MaziyarPanahi', 'GreenNode-mini', 'multilingual-v1olet-Mistral-Instruct', 'v0.1', '7B'))
# Instruct in a name without a size label
self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/Mistral-Nemo-Instruct-2407"),
('Mistral-Nemo-Instruct-2407', 'mistralai', 'Mistral-Nemo', 'Instruct', '2407', None))
# Non-obvious splitting relying on 'chat' keyword
self.assertEqual(gguf.Metadata.get_model_id_components("deepseek-ai/DeepSeek-V2-Chat-0628"),
('DeepSeek-V2-Chat-0628', 'deepseek-ai', 'DeepSeek-V2', 'Chat', '0628', None))
# Multiple versions
self.assertEqual(gguf.Metadata.get_model_id_components("OpenGVLab/Mini-InternVL-Chat-2B-V1-5"),
('Mini-InternVL-Chat-2B-V1-5', 'OpenGVLab', 'Mini-InternVL', 'Chat', 'V1-5', '2B'))
# TODO: DPO in the name
self.assertEqual(gguf.Metadata.get_model_id_components("jondurbin/bagel-dpo-2.8b-v0.2"),
('bagel-dpo-2.8b-v0.2', 'jondurbin', 'bagel-dpo', None, 'v0.2', '2.8B'))
# DPO in name, but can't be used for the finetune to keep 'LLaMA-3' in the basename
self.assertEqual(gguf.Metadata.get_model_id_components("voxmenthe/SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized"),
('SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized', 'voxmenthe', 'SFR-Iterative-DPO-LLaMA-3', 'R-unquantized', None, '8B'))
# Too ambiguous
# TODO: should "base" be a 'finetune' or 'size_label'?
# (in this case it should be a size label, but other models use it to signal that they are not finetuned)
self.assertEqual(gguf.Metadata.get_model_id_components("microsoft/Florence-2-base"),
('Florence-2-base', 'microsoft', None, None, None, None))
## Invalid cases ##
# Starts with a dash and has dashes in a row
self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/-Mistral--Nemo-Base-2407-"),
('-Mistral--Nemo-Base-2407-', 'mistralai', 'Mistral-Nemo-Base', None, '2407', None))
## LoRA ##
self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B"),
('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration-LoRA', None, '8B'))
# Negative size --> output is a LoRA adapter --> prune "LoRA" out of the name to avoid redundancy with the suffix
self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B", -1234),
('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration', None, '8B'))
def test_apply_metadata_heuristic_from_model_card(self):
model_card = {
'tags': ['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'],
@ -134,7 +179,7 @@ class TestMetadataMethod(unittest.TestCase):
}
got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
expect = gguf.Metadata()
expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': 'v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': '14-v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl']
expect.languages=['en']
expect.datasets=['teknium/OpenHermes-2.5']

View file

@@ -40,7 +40,7 @@
#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 6
#define LLAMA_SESSION_VERSION 7
#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
#define LLAMA_STATE_SEQ_VERSION 1
@@ -92,6 +92,9 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
};
// note: these values should be synchronized with ggml_rope

Binary file not shown.

Binary file not shown.

View file

@@ -114,7 +114,7 @@
// bump if necessary
#define LLAMA_MAX_NODES 8192
#define LLAMA_MAX_LAYERS 256
#define LLAMA_MAX_LAYERS 512
#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
//
@@ -3707,7 +3707,7 @@ struct llama_model_loader {
}
if (param_overrides_p != nullptr) {
for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
kv_overrides.insert({std::string(p->key), *p});
}
}
@@ -3875,7 +3875,7 @@ struct llama_model_loader {
ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
{
const int kid = gguf_find_key(meta, "general.file_type");
const int kid = gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
if (kid >= 0) {
ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
}
@@ -4007,7 +4007,9 @@ struct llama_model_loader {
throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
}
GGML_ASSERT(arr_info.length <= N_MAX);
if (arr_info.length > N_MAX) {
throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
}
std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
@@ -4043,8 +4045,6 @@ struct llama_model_loader {
// get array of n <= N_MAX elements, or a single element repeated n times
template<typename T, size_t N_MAX>
bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, const bool required = true) {
GGML_ASSERT(n <= N_MAX);
const int kid = gguf_find_key(meta, key.c_str());
if (kid < 0) {
@@ -4054,6 +4054,10 @@ struct llama_model_loader {
return false;
}
if (n > N_MAX) {
throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
}
if (gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) {
struct GGUFMeta::ArrayInfo arr_info =
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
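Both hunks above swap hard GGML_ASSERTs for thrown errors, so an out-of-range array length read from a GGUF file surfaces as a catchable exception instead of aborting the process. A rough Python stand-in for that validate-before-copy pattern (illustrative only; the names and bound are assumptions):

```python
# Illustrative stand-in (not llama.cpp code): validate a length read from an
# untrusted model file before copying, raising instead of asserting.
N_MAX = 512  # assumed fixed bound, analogous to LLAMA_MAX_LAYERS

def read_bounded_array(values: list[float], n_max: int = N_MAX) -> list[float]:
    if len(values) > n_max:
        raise ValueError(f"array length {len(values)} exceeds max {n_max}")
    return list(values)  # copy only after the bound check passes
```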
@@ -5003,7 +5007,7 @@ static void llm_load_hparams(
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
case 42: model.type = e_model::MODEL_SMALL; break;
case 42: model.type = e_model::MODEL_7B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
@@ -5365,6 +5369,7 @@ static void llm_load_vocab(
if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n");
}
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
for (int i = 0; i < n_merges; i++) {
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
@@ -5403,16 +5408,6 @@ static void llm_load_vocab(
vocab.special_cls_id = -1;
vocab.special_mask_id = -1;
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
if (add_space_prefix_keyidx != -1) {
vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
} // The default value of add_space_prefix is true.
const int remove_extra_whitespaces_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
if (remove_extra_whitespaces_keyidx != -1) {
vocab.tokenizer_remove_extra_whitespaces = gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
} // The default value of remove_extra_whitespaces is false.
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
@@ -5520,6 +5515,19 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "jais") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
} else if (
tokenizer_pre == "tekken") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
vocab.tokenizer_clean_spaces = false;
vocab.tokenizer_ignore_merges = true;
vocab.tokenizer_add_bos = true;
} else if (
tokenizer_pre == "smollm") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "codeshell") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
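The new branches above map the GGUF "tokenizer.ggml.pre" string to a pre-tokenizer id plus per-vocab flags (Tekken additionally disables space cleanup, ignores merges, and adds BOS; SmolLM only disables space cleanup). A compact Python stand-in for that dispatch (illustrative; the dict and field names are assumptions, not llama.cpp's internals):

```python
# Illustrative stand-in for the string -> pre-tokenizer dispatch above.
PRE_TYPES: dict[str, dict] = {
    "tekken":    {"pre": "TEKKEN",    "clean_spaces": False, "ignore_merges": True, "add_bos": True},
    "smollm":    {"pre": "SMOLLM",    "clean_spaces": False},
    "codeshell": {"pre": "CODESHELL"},
}

def resolve_pre_type(name: str) -> dict:
    # unknown names fail loudly, mirroring the runtime_error in the C++ code
    if name not in PRE_TYPES:
        raise RuntimeError(f"unknown pre-tokenizer type: '{name}'")
    return PRE_TYPES[name]
```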
@@ -5543,10 +5551,8 @@ static void llm_load_vocab(
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
if (add_space_prefix_keyidx != -1) {
vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
}
ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
}
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -6127,10 +6133,10 @@ static bool llm_load_tensors(
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
// optional bias tensors
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -15544,6 +15550,8 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
regex_exprs = {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -15581,6 +15589,13 @@ struct llm_tokenizer_bpe {
"\\p{N}",
};
break;
case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
// original regex from tokenizer.json
// "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
regex_exprs = {
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
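The comment in the Tekken case above quotes the original pattern from tokenizer.json; the adapted expression below it works around missing Unicode-category support with lookahead tricks. To see what the original pattern produces, one can run it through Python's third-party regex package (an assumption here; it supports \p{...} classes unlike the stdlib re):

```python
import regex  # third-party 'regex' package; supports \p{Lu}, \p{Ll}, etc.

# The original Tekken pre-tokenizer pattern quoted in the comment above.
PATTERN = (
    r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+"
    r"|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*"
    r"|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+"
)

print(regex.findall(PATTERN, "Hello world, it's 2024!"))
# roughly: ['Hello', ' world', ',', ' it', "'s", ' ', '2', '0', '2', '4', '!']
# note each digit is emitted on its own, via the standalone \p{N} alternative
```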
@@ -18271,8 +18286,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// copy the KV pairs from the input file
gguf_set_kv (ctx_out, ml.meta);
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
// Remove split metadata
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
@@ -19435,7 +19451,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
case LLM_ARCH_CODESHELL:
case LLM_ARCH_ORION:
case LLM_ARCH_INTERNLM2:
case LLM_ARCH_MINICPM:
@@ -19465,6 +19480,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_STARCODER2:
case LLM_ARCH_OPENELM:
case LLM_ARCH_GPTNEOX:
case LLM_ARCH_CODESHELL:
return LLAMA_ROPE_TYPE_NEOX;
// all model arches should be listed explicitly here
@@ -19920,7 +19936,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
);
// on session change it is very likely that the state size has changed - so we need to update this function
static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
return s_total;
}
@@ -21607,7 +21623,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == "chaglm4" || tmpl_contains("[gMASK]<sop>")) {
} else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
ss << "[gMASK]" << "<sop>";
for (auto message : chat) {
std::string role(message->role);

View file

@@ -70,21 +70,19 @@ add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
target_link_libraries(test-tokenizer-0 PRIVATE common)
install(TARGETS test-tokenizer-0 RUNTIME)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
# TODO: enable when fixed
# https://github.com/ggerganov/llama.cpp/pull/7036
#llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
# build test-tokenizer-1-bpe target once and add many tests
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
@@ -92,16 +90,14 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
install(TARGETS test-tokenizer-1-bpe RUNTIME)
# TODO: disabled due to slowness
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf)
# build test-tokenizer-1-spm target once and add many tests
add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)

View file

@@ -79,8 +79,16 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
im = nullptr;
}
}
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
// TODO: other cases
//#pragma omp parallel for
//for (int i = 0; i < tensor->ne[1]; i++) {
// ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
// i * tensor->ne[0], 1, tensor->ne[0], im);
//}
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
// This is going to create some weird integers though.
@@ -2220,6 +2228,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
}
#if 1
for (ggml_type type_a : base_types) {
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
@@ -2239,6 +2248,24 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
}
}
#else
// m = a rows
// n = b rows
// k = cols
std::uniform_int_distribution<> dist_m(1, 128);
std::uniform_int_distribution<> dist_n(16, 128);
std::uniform_int_distribution<> dist_k(1, 16);
for (int i = 0; i < 1000; i++) {
for (ggml_type type_a : all_types) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
int m = dist_m(rng);
int n = dist_n(rng);
int k = dist_k(rng) * ggml_blck_size(type_a);
test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
}
}
}
#endif
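The disabled #else branch above samples random matmul shapes instead of the fixed grid, keeping k a multiple of the quant block size so every sampled row length stays valid for type_a. A Python stand-in of that sampling (illustrative only; 32 is an assumed block size):

```python
import random

# Illustrative sketch: sample matmul shapes the way the disabled branch above
# does, with k constrained to a multiple of the quantization block size.
def sample_mul_mat_case(block_size: int, rng: random.Random) -> tuple[int, int, int]:
    m = rng.randint(1, 128)                # rows of A
    n = rng.randint(16, 128)               # rows of B
    k = rng.randint(1, 16) * block_size    # columns, multiple of the block size
    return m, n, k

rng = random.Random(0)
print(sample_mul_mat_case(32, rng))  # (m, n, k) with k divisible by 32
```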
for (ggml_type type_a : other_types) {
for (ggml_type type_b : {GGML_TYPE_F32}) {