Merge branch 'master' of https://github.com/ggerganov/llama.cpp into hk

commit 8ea2402195
37 changed files with 1664 additions and 1401 deletions

.github/workflows/build.yml (vendored, 2 changes)
@@ -860,7 +860,7 @@ jobs:
           mkdir build
           cd build
           cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))

       - name: Determine tag name
         id: tag
@@ -3,7 +3,7 @@


 [](https://opensource.org/licenses/MIT)
 [](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
 [](https://conan.io/center/llama-cpp)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -48,7 +48,7 @@ class Model:

     dir_model: Path
     ftype: gguf.LlamaFileType
-    fname_out: Path | None
+    fname_out: Path
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool

@@ -62,11 +62,12 @@ class Model:
     gguf_writer: gguf.GGUFWriter
     model_name: str | None
     metadata_override: Path | None
+    dir_model_card: Path

     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path | None, is_big_endian: bool = False,
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):

@@ -90,6 +91,7 @@ class Model:
         self.tensor_names = None
         self.metadata_override = metadata_override
         self.model_name = model_name
+        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py

         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:

@@ -237,6 +239,10 @@ class Model:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")

+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)
+
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")

@@ -345,7 +351,7 @@ class Model:

         total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()

-        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model, self.model_name, total_params)
+        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)

         # Fallback to model directory name if metadata name is still missing
         if self.metadata.name is None:
@@ -359,27 +365,22 @@ class Model:
         output_type: str = self.ftype.name.partition("_")[2]

         # Filename Output
-        # Note: `not is_dir()` is used because `.is_file()` will not detect
-        #       file template strings as it doesn't actually exist as a file
-        if self.fname_out is not None and not self.fname_out.is_dir():
-            # Output path is a custom defined templated filename
-
-            # Process templated file name with the output ftype, useful with the "auto" ftype
-            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
-        else:
+        if self.fname_out.is_dir():
             # Generate default filename based on model specification and available metadata
             if not vocab_only:
                 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
             else:
                 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")

-            # Check if preferred output directory path was provided
-            if self.fname_out is not None and self.fname_out.is_dir():
-                # output path is a directory
-                self.fname_out = self.fname_out / f"{fname_default}.gguf"
-            else:
-                # output in the same directory as the model by default
-                self.fname_out = self.dir_model / f"{fname_default}.gguf"
+            # Use the default filename
+            self.fname_out = self.fname_out / f"{fname_default}.gguf"
+        else:
+            # Output path is a custom defined templated filename
+            # Note: `not is_dir()` is used because `.is_file()` will not detect
+            #       file template strings as it doesn't actually exist as a file
+
+            # Process templated file name with the output ftype, useful with the "auto" ftype
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)

         self.set_type()
@@ -593,6 +594,15 @@ class Model:
         if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
             # ref: https://huggingface.co/core42/jais-13b
             res = "jais"
+        if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
+            # ref: https://huggingface.co/WisdomShell/CodeShell-7B
+            res = "codeshell"
+        if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
+            # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
+            res = "tekken"
+        if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
+            # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
+            res = "smollm"

         if res is None:
             logger.warning("\n")
@@ -733,7 +743,7 @@ class Model:
             added_tokens_json = json.load(f)
             for key in added_tokens_json:
                 token_id = added_tokens_json[key]
-                if (token_id >= vocab_size):
+                if token_id >= vocab_size:
                     logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                     continue

@@ -750,7 +760,8 @@ class Model:
                 token_id = int(token_id)
                 token: str = token_data["content"]
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert tokens[token_id] == token.encode("utf-8")
+                    if tokens[token_id] != token.encode("utf-8"):
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
                 if token_data.get("special") or self.does_token_look_special(token):
                     toktypes[token_id] = SentencePieceTokenTypes.CONTROL
                 else:
@@ -1309,6 +1320,7 @@ class RefactModel(Model):
         special_vocab._set_special_token("prefix", 1)
         special_vocab._set_special_token("suffix", 3)
         special_vocab._set_special_token("middle", 2)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):

@@ -1479,7 +1491,12 @@ class LlamaModel(Model):
         super().set_gguf_parameters()
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)

         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
@@ -1994,7 +2011,7 @@ class Phi3MiniModel(Model):

             for key in added_tokens_json:
                 token_id = added_tokens_json[key]
-                if (token_id >= vocab_size):
+                if token_id >= vocab_size:
                     logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                     continue

@@ -2011,7 +2028,8 @@ class Phi3MiniModel(Model):
                 token_id = int(token_id)
                 token = foken_data["content"].encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert tokens[token_id] == token
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

@@ -2027,7 +2045,8 @@ class Phi3MiniModel(Model):
                 token_id = int(foken_data["id"])
                 token = foken_data["content"].encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert tokens[token_id] == token
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

@@ -2068,7 +2087,7 @@ class Phi3MiniModel(Model):

         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if (rope_scaling is None):
+        if rope_scaling is None:
             return

         scale = max_pos_embds / orig_max_pos_embds
@@ -2266,7 +2285,8 @@ class InternLM2Model(Model):
                     chat_eos_token_id = token_id
                 token = token.encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert(tokens[token_id] == token)
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

@@ -2285,7 +2305,8 @@ class InternLM2Model(Model):
                     chat_eos_token_id = token_id
                 token = token.encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert(tokens[token_id] == token)
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2471,6 +2492,7 @@ class GemmaModel(Model):
         special_vocab._set_special_token("middle", 68)
         special_vocab._set_special_token("fsep", 70)
         special_vocab._set_special_token("eot", 107)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

         self.gguf_writer.add_add_space_prefix(False)

@@ -2712,7 +2734,7 @@ class JinaBertV2Model(BertModel):

                 yield name, data

-    def set_vocab(self, *args, **kwargs):
+    def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
             tokenizer_class = json.load(f)['tokenizer_class']

@@ -2860,7 +2882,7 @@ class ArcticModel(Model):
             added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
             for token_id, token_json in added_tokens_decoder.items():
                 token_id = int(token_id)
-                if (token_id >= vocab_size):
+                if token_id >= vocab_size:
                     logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                     continue
@@ -3109,7 +3131,7 @@ class T5Model(Model):
                 added_tokens_json = json.load(f)
                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if (token_id >= vocab_size):
+                    if token_id >= vocab_size:
                         logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue

@@ -3420,7 +3442,6 @@ class ChatGLMModel(Model):
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
         n_head_kv = self.hparams.get("multi_query_group_num", n_head)
@@ -3625,10 +3646,10 @@ def main() -> None:
         logger.error("Error: Cannot use temp file when splitting")
         sys.exit(1)

-    fname_out = None
-
     if args.outfile is not None:
         fname_out = args.outfile
+    else:
+        fname_out = dir_model

     logger.info(f"Loading model: {dir_model.name}")

@@ -3659,7 +3680,6 @@ def main() -> None:
     else:
         logger.info("Exporting model...")
         model_instance.write()
-        assert model_instance.fname_out is not None
         out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
         logger.info(f"Model successfully exported to {out_path}")
@@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):

 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

 if len(sys.argv) == 2:
     token = sys.argv[1]

@@ -91,6 +91,9 @@ models = [
     {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
     {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
     {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
+    {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
+    {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
+    {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
 ]


@@ -99,8 +102,8 @@ def download_file_with_auth(url, token, save_path):
     response = sess.get(url, headers=headers)
     response.raise_for_status()
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as f:
-        f.write(response.content)
+    with open(save_path, 'wb') as downloaded_file:
+        downloaded_file.write(response.content)
     logger.info(f"File {save_path} downloaded successfully")

@@ -159,7 +162,7 @@ for model in models:
         logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
         continue  # Skip to the next model if the tokenizer can't be loaded

-    chktok = tokenizer.encode(chktxt)
+    chktok = tokenizer.encode(CHK_TXT)
     chkhsh = sha256(str(chktok).encode()).hexdigest()

     logger.info(f"model: {name}")

@@ -191,7 +194,7 @@ src_func = f"""
        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
        # use in llama.cpp to implement the same pre-tokenizer

-       chktxt = {repr(chktxt)}
+       chktxt = {repr(CHK_TXT)}

        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()

@@ -287,7 +290,7 @@ tests = [
     "333333333",
     "Cửa Việt", # llama-bpe fails on this
     " discards",
-    chktxt,
+    CHK_TXT,
 ]

 # write the tests to ./models/ggml-vocab-{name}.gguf.inp
@@ -132,6 +132,10 @@ class Tensor:


 class GGMLModel:
+
+    file_format: GGMLFormat
+    format_version: int
+
     def __init__(self):
         self.hyperparameters = None
         self.vocab = None

@@ -290,7 +294,7 @@ class GGMLToGGUF:
         if self.vocab_override is not None:
             vo = self.vocab_override
             logger.info('* Adding vocab item(s)')
-            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+            for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                 tokens.append(vbytes)
                 scores.append(score)
                 toktypes.append(ttype)
@@ -290,7 +290,7 @@ if __name__ == '__main__':
         fname_out = args.outfile
     else:
         # output in the same directory as the model by default
-        fname_out = dir_lora / 'ggml-lora-{ftype}.gguf'
+        fname_out = dir_lora

     if os.path.exists(input_model):
         # lazy import load_file only if lora is in safetensors format.

@@ -304,12 +304,6 @@ if __name__ == '__main__':
     # load base model
     logger.info(f"Loading base model: {dir_base_model.name}")
     hparams = Model.load_hparams(dir_base_model)
-
-    with open(lora_config, "r") as f:
-        lparams: dict[str, Any] = json.load(f)
-
-    alpha: float = lparams["lora_alpha"]
-
     with torch.inference_mode():
         try:
             model_class = Model.from_model_architecture(hparams["architectures"][0])
@@ -320,12 +314,21 @@ if __name__ == '__main__':
         class LoraModel(model_class):
             model_arch = model_class.model_arch

+            lora_alpha: float
+
+            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
+
+                super().__init__(*args, **kwargs)
+
+                self.dir_model_card = dir_lora_model
+                self.lora_alpha = float(lora_alpha)
+
             def set_type(self):
                 self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                 self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")

             def set_gguf_parameters(self):
-                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha))
+                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
                 super().set_gguf_parameters()

             def get_tensors(self) -> Iterator[tuple[str, Tensor]]:

@@ -368,6 +371,11 @@ if __name__ == '__main__':
                     yield (dest_name + ".lora_a", lora_a)
                     yield (dest_name + ".lora_b", lora_b)

+        with open(lora_config, "r") as f:
+            lparams: dict[str, Any] = json.load(f)
+
+        alpha: float = lparams["lora_alpha"]
+
         model_instance = LoraModel(
             dir_base_model,
             ftype,

@@ -376,6 +384,8 @@ if __name__ == '__main__':
             use_temp_file=False,
             eager=args.no_lazy,
             dry_run=args.dry_run,
+            dir_lora_model=dir_lora,
+            lora_alpha=alpha,
         )

         logger.info("Exporting model...")
@@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) {

     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

+    if (!ctx) {
+        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
     printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
     printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
     printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
@@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(

     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
-        return env->NewStringUTF("");
+        return nullptr;
     }

     auto new_token_chars = llama_token_to_piece(context, new_token_id);
@@ -26,11 +26,12 @@ actor LlamaContext {
     private var context: OpaquePointer
     private var batch: llama_batch
     private var tokens_list: [llama_token]
+    var is_done: Bool = false

     /// This variable is used to store temporarily invalid cchars
     private var temporary_invalid_cchars: [CChar]

-    var n_len: Int32 = 64
+    var n_len: Int32 = 1024
     var n_cur: Int32 = 0

     var n_decode: Int32 = 0

@@ -160,6 +161,7 @@ actor LlamaContext {

         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
+            is_done = true
             let new_token_str = String(cString: temporary_invalid_cchars + [0])
             temporary_invalid_cchars.removeAll()
             return new_token_str
@@ -132,7 +132,7 @@ class LlamaState: ObservableObject {
         messageLog += "\(text)"

         Task.detached {
-            while await llamaContext.n_cur < llamaContext.n_len {
+            while await !llamaContext.is_done {
                 let result = await llamaContext.completion_loop()
                 await MainActor.run {
                     self.messageLog += "\(result)"
examples/pydantic_models_to_grammar_examples.py (289 changes, Normal file → Executable file)
@@ -1,8 +1,15 @@
-# Function calling example using pydantic models.
+#!/usr/bin/env python3
+
+"""Function calling example using pydantic models."""
+
 from __future__ import annotations

+import argparse
 import datetime
 import json
+import logging
+import textwrap
+import sys
 from enum import Enum
 from typing import Optional, Union

@@ -12,30 +19,54 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert
     create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)


-# Function to get completion on the llama.cpp server with grammar.
-def create_completion(prompt, grammar):
+def create_completion(host, prompt, gbnf_grammar):
+    """Calls the /completion API on llama-server.
+
+    See
+    https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
+    """
+    print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
     headers = {"Content-Type": "application/json"}
-    data = {"prompt": prompt, "grammar": grammar}
-    response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
-    data = response.json()
-
+    data = {"prompt": prompt, "grammar": gbnf_grammar}
+    result = requests.post(f"http://{host}/completion", headers=headers, json=data).json()
     assert data.get("error") is None, data
-
-    print(data["content"])
-    return data["content"]
+    logging.info("Result: %s", result)
+    content = result["content"]
+    print(f" Model: {result['model']}")
+    print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}")
+    return content


 # A function for the agent to send a message to the user.
 class SendMessageToUser(BaseModel):
-    """
-    Send a message to the User.
-    """
+    """Send a message to the User."""
     chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
     message: str = Field(..., description="Message you want to send to the user.")

     def run(self):
-        print(self.message)
+        print(f"SendMessageToUser: {self.message}")


+def example_rce(host):
+    """Minimal test case where the LLM call an arbitrary python function."""
+    print("- example_rce")
+    tools = [SendMessageToUser]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+        pydantic_model_list=tools, outer_object_name="function",
+        outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
+    system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
+    user_message = "What is 42 * 42?"
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    tools_map = {tool.__name__:tool for tool in tools}
+    # This finds "SendMessageToUser":
+    tool = tools_map.get(json_data["function"])
+    if not tool:
+        print(f"Error: unknown tool {json_data['function']}")
+        return 1
+    tool(**json_data["function_parameters"]).run()
+    return 0
+
+
 # Enum for the calculator tool.
|
||||||
DIVIDE = "divide"
|
DIVIDE = "divide"
|
||||||
|
|
||||||
|
|
||||||
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
|
# Simple pydantic calculator tool for the agent that can add, subtract,
|
||||||
|
# multiply, and divide. Docstring and description of fields will be used in
|
||||||
|
# system prompt.
|
||||||
class Calculator(BaseModel):
|
class Calculator(BaseModel):
|
||||||
"""
|
"""Perform a math operation on two numbers."""
|
||||||
Perform a math operation on two numbers.
|
|
||||||
"""
|
|
||||||
number_one: Union[int, float] = Field(..., description="First number.")
|
number_one: Union[int, float] = Field(..., description="First number.")
|
||||||
operation: MathOperation = Field(..., description="Math operation to perform.")
|
operation: MathOperation = Field(..., description="Math operation to perform.")
|
||||||
number_two: Union[int, float] = Field(..., description="Second number.")
|
number_two: Union[int, float] = Field(..., description="Second number.")
|
||||||
|
@ -68,55 +99,61 @@ class Calculator(BaseModel):
|
||||||
raise ValueError("Unknown operation.")
|
raise ValueError("Unknown operation.")
|
||||||
|
|
||||||
|
|
||||||
# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM.
|
def example_calculator(host):
|
||||||
# pydantic_model_list is the list of pydanitc models
|
"""Have the LLM ask to get a calculation done.
|
||||||
# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
|
|
||||||
# outer_object_content is the name of outer object content.
|
Here the grammar gets generated by passing the available function models to
|
||||||
# model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
|
generate_gbnf_grammar_and_documentation function. This also generates a
|
||||||
# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
|
documentation usable by the LLM.
|
||||||
|
|
||||||
|
pydantic_model_list is the list of pydantic models outer_object_name is an
|
||||||
|
optional name for an outer object around the actual model object. Like a
|
||||||
|
"function" object with "function_parameters" which contains the actual model
|
||||||
|
object. If None, no outer object will be generated outer_object_content is
|
||||||
|
the name of outer object content.
|
||||||
|
|
||||||
|
model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
|
||||||
|
fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
|
||||||
|
"""
|
||||||
|
print("- example_calculator")
|
||||||
|
tools = [SendMessageToUser, Calculator]
|
||||||
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
|
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
|
||||||
pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function",
|
pydantic_model_list=tools, outer_object_name="function",
|
||||||
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
|
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
|
||||||
|
|
||||||
print(gbnf_grammar)
|
|
||||||
print(documentation)
|
|
||||||
|
|
||||||
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
|
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
|
||||||
|
user_message1 = "What is 42 * 42?"
|
||||||
user_message = "What is 42 * 42?"
|
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant"
|
||||||
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
|
text = create_completion(host, prompt, gbnf_grammar)
|
||||||
|
json_data = json.loads(text)
|
||||||
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
|
expected = {
|
||||||
# This should output something like this:
|
"function": "Calculator",
|
||||||
# {
|
"function_parameters": {
|
||||||
# "function": "calculator",
|
"number_one": 42,
|
||||||
# "function_parameters": {
|
"operation": "multiply",
|
||||||
# "number_one": 42,
|
"number_two": 42
|
||||||
# "operation": "multiply",
|
}
|
||||||
# "number_two": 42
|
}
|
||||||
# }
|
if json_data != expected:
|
||||||
# }
|
print(" Result is not as expected!")
|
||||||
function_dictionary = json.loads(text)
|
tools_map = {tool.__name__:tool for tool in tools}
|
||||||
if function_dictionary["function"] == "calculator":
|
# This finds "Calculator":
|
||||||
function_parameters = {**function_dictionary["function_parameters"]}
|
tool = tools_map.get(json_data["function"])
|
||||||
|
if not tool:
|
||||||
print(Calculator(**function_parameters).run())
|
print(f"Error: unknown tool {json_data['function']}")
|
||||||
# This should output: 1764
|
return 1
|
||||||
|
result = tool(**json_data["function_parameters"]).run()
|
||||||
|
print(f" Call {json_data['function']} gave result {result}")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text.
|
|
||||||
class Category(Enum):
|
class Category(Enum):
|
||||||
"""
|
"""The category of the book."""
|
||||||
The category of the book.
|
|
||||||
"""
|
|
||||||
Fiction = "Fiction"
|
Fiction = "Fiction"
|
||||||
NonFiction = "Non-Fiction"
|
NonFiction = "Non-Fiction"
|
||||||
|
|
||||||
|
|
||||||
class Book(BaseModel):
|
class Book(BaseModel):
|
||||||
"""
|
"""Represents an entry about a book."""
|
||||||
Represents an entry about a book.
|
|
||||||
"""
|
|
||||||
title: str = Field(..., description="Title of the book.")
|
title: str = Field(..., description="Title of the book.")
|
||||||
author: str = Field(..., description="Author of the book.")
|
author: str = Field(..., description="Author of the book.")
|
||||||
published_year: Optional[int] = Field(..., description="Publishing year of the book.")
|
published_year: Optional[int] = Field(..., description="Publishing year of the book.")
|
||||||
|
@ -125,33 +162,42 @@ class Book(BaseModel):
|
||||||
summary: str = Field(..., description="Summary of the book.")
|
summary: str = Field(..., description="Summary of the book.")
|
||||||
|
|
||||||
|
|
||||||
# We need no additional parameters other than our list of pydantic models.
|
def example_struct(host):
|
||||||
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book])
|
"""A example structured output based on pydantic models.
|
||||||
|
|
||||||
|
The LLM will create an entry for a Book database out of an unstructured
|
||||||
|
text. We need no additional parameters other than our list of pydantic
|
||||||
|
models.
|
||||||
|
"""
|
||||||
|
print("- example_struct")
|
||||||
|
tools = [Book]
|
||||||
|
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
|
||||||
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
|
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
|
||||||
|
|
||||||
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
|
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
|
||||||
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
|
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
|
||||||
|
text = create_completion(host, prompt, gbnf_grammar)
|
||||||
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
|
|
||||||
|
|
||||||
json_data = json.loads(text)
|
json_data = json.loads(text)
|
||||||
|
# In this case, there's no function nor function_parameters.
|
||||||
|
# Here the result will vary based on the LLM used.
|
||||||
|
keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
|
||||||
|
if keys != sorted(json_data.keys()):
|
||||||
|
print(f"Unexpected result: {sorted(json_data.keys())}")
|
||||||
|
return 1
|
||||||
|
book = Book(**json_data)
|
||||||
|
print(f" As a Book object: %s" % book)
|
||||||
|
return 0
|
||||||
|
|
||||||
print(Book(**json_data))
|
|
||||||
# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
|
|
||||||
|
|
||||||
def get_current_datetime(output_format: Optional[str] = None):
|
def get_current_datetime(output_format: Optional[str] = None):
|
||||||
"""
|
"""Get the current date and time in the given format.
|
||||||
Get the current date and time in the given format.
|
|
||||||
Args:
|
Args:
|
||||||
output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
|
output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
|
||||||
"""
|
"""
|
||||||
if output_format is None:
|
return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S")
|
||||||
output_format = '%Y-%m-%d %H:%M:%S'
|
|
||||||
return datetime.datetime.now().strftime(output_format)
|
|
||||||
|
|
||||||
|
|
||||||
# Example function to get the weather
|
# Example function to get the weather.
|
||||||
def get_current_weather(location, unit):
|
def get_current_weather(location, unit):
|
||||||
"""Get the current weather in a given location"""
|
"""Get the current weather in a given location"""
|
||||||
if "London" in location:
|
if "London" in location:
|
||||||
|
@ -160,11 +206,15 @@ def get_current_weather(location, unit):
|
||||||
return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
|
return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
|
||||||
elif "North Pole" in location:
|
elif "North Pole" in location:
|
||||||
return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
|
return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
|
||||||
else:
|
|
||||||
return json.dumps({"location": location, "temperature": "unknown"})
|
return json.dumps({"location": location, "temperature": "unknown"})
|
||||||
|
|
||||||
|
|
||||||
# Here is a function definition in OpenAI style
|
def example_concurrent(host):
|
||||||
|
"""An example for parallel function calling with a Python function, a pydantic
|
||||||
|
function model and an OpenAI like function definition.
|
||||||
|
"""
|
||||||
|
print("- example_concurrent")
|
||||||
|
# Function definition in OpenAI style.
|
||||||
current_weather_tool = {
|
current_weather_tool = {
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
|
@ -183,45 +233,80 @@ current_weather_tool = {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
# Convert OpenAI function definition into pydantic model.
|
||||||
# Convert OpenAI function definition into pydantic model
|
|
||||||
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
|
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
|
||||||
# Add the actual function to a pydantic model
|
# Add the actual function to a pydantic model.
|
||||||
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
|
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
|
||||||
|
|
||||||
# Convert normal Python function to a pydantic model
|
# Convert normal Python function to a pydantic model.
|
||||||
current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
|
current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
|
||||||
|
|
||||||
tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
|
tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
|
||||||
|
|
||||||
|
|
||||||
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
|
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
|
||||||
pydantic_model_list=tool_list, outer_object_name="function",
|
pydantic_model_list=tools, outer_object_name="function",
|
||||||
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
|
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
|
||||||
|
|
||||||
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
|
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
|
||||||
|
|
||||||
|
|
||||||
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
|
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
|
||||||
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
|
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
|
||||||
|
text = create_completion(host, prompt, gbnf_grammar)
|
||||||
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
|
|
||||||
|
|
||||||
json_data = json.loads(text)
|
json_data = json.loads(text)
|
||||||
|
expected = [
|
||||||
print(json_data)
|
{
|
||||||
# Should output something like this:
|
"function": "get_current_datetime",
|
||||||
# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
|
"params": {
|
||||||
|
"output_format": "%Y-%m-%d %H:%M:%S"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"function": "get_current_weather",
|
||||||
|
"params": {
|
||||||
|
"location": "London",
|
||||||
|
"unit": "celsius"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"function": "Calculator",
|
||||||
|
"params": {
|
||||||
|
"number_one": 42,
|
||||||
|
"operation": "multiply",
|
||||||
|
"number_two": 42
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
res = 0
|
||||||
|
if json_data != expected:
|
||||||
|
print(" Result is not as expected!")
|
||||||
|
print(" This can happen on highly quantized models")
|
||||||
|
res = 1
|
||||||
|
tools_map = {tool.__name__:tool for tool in tools}
|
||||||
for call in json_data:
|
for call in json_data:
|
||||||
if call["function"] == "Calculator":
|
tool = tools_map.get(call["function"])
|
||||||
print(Calculator(**call["params"]).run())
|
if not tool:
|
||||||
elif call["function"] == "get_current_datetime":
|
print(f"Error: unknown tool {call['function']}")
|
||||||
print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
|
return 1
|
||||||
elif call["function"] == "get_current_weather":
|
result = tool(**call["params"]).run()
|
||||||
print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
|
print(f" Call {call['function']} returned {result}")
|
||||||
# Should output something like this:
|
# Should output something like this:
|
||||||
# 2024-01-14 13:36:06
|
# Call get_current_datetime returned 2024-07-15 09:50:38
|
||||||
# {"location": "London", "temperature": "42", "unit": "celsius"}
|
# Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
|
||||||
# 1764
|
# Call Calculator returned 1764
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
|
||||||
|
parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
|
||||||
|
parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
|
||||||
|
args = parser.parse_args()
|
||||||
|
logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR)
|
||||||
|
ret = 0
|
||||||
|
# Comment out below to only run the example you want.
|
||||||
|
ret = ret or example_rce(args.host)
|
||||||
|
ret = ret or example_calculator(args.host)
|
||||||
|
ret = ret or example_struct(args.host)
|
||||||
|
ret = ret or example_concurrent(args.host)
|
||||||
|
return ret
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
|
|
|
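For reference, the reworked example script above is driven through main() with a --host argument (default localhost:8080) and a -v flag for logging, e.g. python examples/pydantic_models_to_grammar_examples.py --host localhost:8080 -v. A minimal sketch of calling the individual examples programmatically, assuming a llama-server instance is already running locally and that the file is importable as a module (the import name below is hypothetical shorthand for the file in this commit):

# Sketch only: drive the reworked examples against a local llama-server.
import pydantic_models_to_grammar_examples as examples  # hypothetical import of the file above

host = "localhost:8080"                 # same default the new --host flag uses
ret = examples.example_rce(host)        # each example returns 0 on success, 1 otherwise
ret = ret or examples.example_calculator(host)
print("all examples passed" if ret == 0 else "at least one example failed")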
@@ -444,7 +444,7 @@ node index.js

 `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.

-`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
+`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
 By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.

 `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
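The parameters documented in this hunk are fields of the JSON body accepted by the server's completion endpoint. A minimal request sketch, not part of the commit; it assumes a server already listening on localhost:8080 and uses arbitrary prompt text and values:

# Hypothetical request illustrating n_predict, n_keep and stream; values are arbitrary.
import json
import urllib.request

payload = {
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 128,  # generate at most 128 tokens
    "n_keep": -1,      # keep the whole prompt when the context overflows
    "stream": False,   # set to True to receive tokens as they are generated
}
req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["content"])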
6 flake.lock generated

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1720768451,
-        "narHash": "sha256-EYekUHJE2gxeo2pM/zM9Wlqw1Uw2XTJXOSAO79ksc4Y=",
+        "lastModified": 1721379653,
+        "narHash": "sha256-8MUgifkJ7lkZs3u99UDZMB4kbOxvMEXQZ31FO3SopZ0=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "7e7c39ea35c5cdd002cd4588b03a3fb9ece6fad9",
+        "rev": "1d9c2c9b3e71b9ee663d11c5d298727dace8d374",
         "type": "github"
       },
       "original": {
@@ -59,6 +59,24 @@ void ggml_cuda_op_mul_mat_q(
         case GGML_TYPE_Q6_K:
             mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
             break;
+        case GGML_TYPE_IQ2_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_XS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
+            break;
         case GGML_TYPE_IQ4_XS:
             mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
             break;

@@ -93,6 +111,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             mmq_supported = true;

File diff suppressed because it is too large
@@ -23,7 +23,8 @@ SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}
 TYPES_MMQ = [
     "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
     "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
-    "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
+    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
+    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
 ]

 SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ1_S);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_S);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_S);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
@@ -188,6 +188,27 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 }

+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
+    const int * v, const int * u, const float * d8_0, const float & d8_1) {
+
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
+        int sumi = 0;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_0/2; ++i) {
+            // SIMD dot product of quantized values
+            sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
+        }
+
+        sumf += d8_0[i0/(QI8_0/2)]*sumi;
+    }
+
+    return d8_1*sumf;
+}
+
 #define VDR_Q2_K_Q8_1_MMVQ 1
 #define VDR_Q2_K_Q8_1_MMQ  4
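The arithmetic of the new vec_dot_q8_0_16_q8_1_impl device function is an integer dot product per sub-block, scaled by one factor per block of the first operand and a single factor for the second. A rough NumPy sketch of the same computation, not part of the commit and with made-up block size and data:

import numpy as np

def vec_dot_q8_0_16_q8_1_ref(v, u, d8_0, d8_1, block):
    # v, u: int8 quantized values; d8_0: one scale per block of v; d8_1: scale of u
    sumf = 0.0
    for b in range(len(v) // block):
        vi = v[b * block:(b + 1) * block].astype(np.int32)
        ui = u[b * block:(b + 1) * block].astype(np.int32)
        sumf += d8_0[b] * int(np.dot(vi, ui))  # integer dot product, then per-block scale
    return d8_1 * sumf

rng = np.random.default_rng(0)
v = rng.integers(-128, 128, size=64, dtype=np.int8)
u = rng.integers(-128, 128, size=64, dtype=np.int8)
print(vec_dot_q8_0_16_q8_1_ref(v, u, d8_0=np.full(4, 0.01), d8_1=0.02, block=16))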
@@ -1786,10 +1786,6 @@ static enum ggml_status ggml_metal_graph_compute(
                     }
                 };

-                if (ggml_is_quantized(src0t)) {
-                    GGML_ASSERT(ne00 >= nth0*nth1);
-                }
-
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
@@ -4757,7 +4757,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
         device const float4 * y4 = (device const float4 *)yb;
         yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];

-        for (int row = 0; row < 2; ++row) {
+        for (int row = 0; row < 2 && first_row + row < ne01; ++row) {

             device const block_iq4_nl & xb = x[row*nb + ib];
             device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);

@@ -4789,7 +4789,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
         yb += 16 * QK4_NL;
     }

-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < ne01; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;

File diff suppressed because it is too large
@@ -19019,7 +19019,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
     FILE * fout = ggml_fopen(fname, "wb");

     if (!fout) {
-        fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+        fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno));
         return;
     }

@@ -19156,7 +19156,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
     {
         FILE * fin = ggml_fopen(fname, "rb");
         if (!fin) {
-            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno));
             return result;
         }

@@ -20830,6 +20830,7 @@ struct gguf_context * gguf_init_empty(void) {
 struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
     FILE * file = ggml_fopen(fname, "rb");
     if (!file) {
+        fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
         return NULL;
     }

@@ -21014,7 +21015,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             gguf_tensor_info_sanitize(info);

             // make sure there is no duplicated tensor names
-            for (uint64_t j = 0; j < i; ++j) {
+            for (uint64_t j = 0; j < i && ok; ++j) {
                 if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
                     fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
                     ok = false;
@@ -54,6 +54,7 @@ class Metadata:

         model_card = Metadata.load_model_card(model_path)
         hf_params = Metadata.load_hf_parameters(model_path)
+        # TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter

         # heuristics
         metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)

@@ -62,6 +63,7 @@ class Metadata:
         # This is based on LLM_KV_NAMES mapping in llama.cpp
         metadata_override = Metadata.load_metadata_override(metadata_override_path)

+        metadata.name = metadata_override.get(Keys.General.NAME, metadata.name)
         metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author)
         metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version)
         metadata.organization = metadata_override.get(Keys.General.ORGANIZATION, metadata.organization)

@@ -176,6 +178,12 @@ class Metadata:
             org_component = None

         name_parts: list[str] = model_full_name_component.split('-')
+
+        # Remove empty parts
+        for i in reversed(range(len(name_parts))):
+            if len(name_parts[i]) == 0:
+                del name_parts[i]
+
         name_types: list[
             set[Literal["basename", "size_label", "finetune", "version", "type"]]
         ] = [set() for _ in name_parts]

@@ -222,9 +230,19 @@ class Metadata:
                 name_parts[i] = part
             # Some easy to recognize finetune names
             elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE):
-                name_types[i].add("finetune")
-                if part.lower() == "lora":
-                    name_parts[i] = "LoRA"
+                if total_params < 0 and part.lower() == "lora":
+                    # ignore redundant "lora" in the finetune part when the output is a lora adapter
+                    name_types[i].add("type")
+                else:
+                    name_types[i].add("finetune")
+
+        # Ignore word-based size labels when there is at least a number-based one present
+        # TODO: should word-based size labels always be removed instead?
+        if any(c.isdecimal() for n, t in zip(name_parts, name_types) if "size_label" in t for c in n):
+            for n, t in zip(name_parts, name_types):
+                if "size_label" in t:
+                    if all(c.isalpha() for c in n):
+                        t.remove("size_label")

         at_start = True
         # Find the basename through the annotated name

@@ -239,18 +257,18 @@ class Metadata:

         # Remove the basename annotation from trailing version
         for part, t in zip(reversed(name_parts), reversed(name_types)):
-            if "basename" in t:
-                if len(t) > 1:
-                    t.remove("basename")
+            if "basename" in t and len(t) > 1:
+                t.remove("basename")
             else:
                 break

         basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None
-        size_label = "-".join(s for s, t in zip(name_parts, name_types) if "size_label" in t) or None
+        # Deduplicate size labels using order-preserving 'dict' ('set' seems to sort the keys)
+        size_label = "-".join(dict.fromkeys(s for s, t in zip(name_parts, name_types) if "size_label" in t).keys()) or None
         finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None
         # TODO: should the basename version always be excluded?
-        # TODO: should multiple versions be joined together?
-        version = ([v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t] or [None])[-1]
+        # NOTE: multiple finetune versions are joined together
+        version = "-".join(v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t) or None

         if size_label is None and finetune is None and version is None:
             # Too ambiguous, output nothing
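The size-label change above switches to dict.fromkeys() because a dict deduplicates while preserving insertion order, whereas a set gives no ordering guarantee. A quick illustration with made-up labels, not part of the commit:

labels = ["7B", "mini", "7B"]
print("-".join(set(labels)))            # order not guaranteed, may print 'mini-7B'
print("-".join(dict.fromkeys(labels)))  # '7B-mini', first-seen order preserved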
@@ -50,15 +50,15 @@ def naming_convention(model_name: str | None, base_name: str | None, finetune_st
     # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention

     if base_name is not None:
-        name = base_name.strip().title().replace(' ', '-').replace('/', '-')
+        name = base_name.strip().replace(' ', '-').replace('/', '-')
     elif model_name is not None:
-        name = model_name.strip().title().replace(' ', '-').replace('/', '-')
+        name = model_name.strip().replace(' ', '-').replace('/', '-')
     else:
         name = "ggml-model"

     parameters = f"-{size_label}" if size_label is not None else ""

-    finetune = f"-{finetune_string.strip().title().replace(' ', '-')}" if finetune_string is not None else ""
+    finetune = f"-{finetune_string.strip().replace(' ', '-')}" if finetune_string is not None else ""

     version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else ""
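Dropping .title() here keeps the original capitalization of model names instead of re-capitalizing each word, which mangles mixed-case names. A small before/after illustration, not part of the commit and using arbitrary names:

# Shows what the removed .title() call did to mixed-case names.
for base_name in ("SmolLM", "DeepSeek-Coder-V2"):
    old = base_name.strip().title().replace(' ', '-').replace('/', '-')
    new = base_name.strip().replace(' ', '-').replace('/', '-')
    print(f"{base_name!r}: old={old!r} new={new!r}")
# 'SmolLM': old='Smollm' new='SmolLM'
# 'DeepSeek-Coder-V2': old='Deepseek-Coder-V2' new='DeepSeek-Coder-V2'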
@@ -4,6 +4,7 @@ from __future__ import annotations
 import logging
 import argparse
 import os
+import re
 import sys
 from pathlib import Path
 from typing import Any
@@ -244,26 +245,58 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
         else:
             pretty_type = str(field.types[-1].name)

+        def escape_markdown_inline_code(value_string):
+            # Find the longest contiguous sequence of backticks in the string then
+            # wrap string with appropriate number of backticks required to escape it
+            max_backticks = max((len(match.group(0)) for match in re.finditer(r'`+', value_string)), default=0)
+            inline_code_marker = '`' * (max_backticks + 1)
+
+            # If the string starts or ends with a backtick, add a space at the beginning and end
+            if value_string.startswith('`') or value_string.endswith('`'):
+                value_string = f" {value_string} "
+
+            return f"{inline_code_marker}{value_string}{inline_code_marker}"
+
         total_elements = len(field.data)
         value = ""
         if len(field.types) == 1:
             curr_type = field.types[0]
             if curr_type == GGUFValueType.STRING:
-                value = repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60])
+                truncate_length = 60
+                value_string = str(bytes(field.parts[-1]), encoding='utf-8')
+                if len(value_string) > truncate_length:
+                    head = escape_markdown_inline_code(value_string[:truncate_length // 2])
+                    tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
+                    value = "{head}...{tail}".format(head=head, tail=tail)
+                else:
+                    value = escape_markdown_inline_code(value_string)
             elif curr_type in reader.gguf_scalar_to_np:
                 value = str(field.parts[-1][0])
         else:
             if field.types[0] == GGUFValueType.ARRAY:
                 curr_type = field.types[1]
+                array_elements = []
+
                 if curr_type == GGUFValueType.STRING:
                     render_element = min(5, total_elements)
                     for element_pos in range(render_element):
-                        value += repr(str(bytes(field.parts[-1 - element_pos]), encoding='utf-8')[:5]) + (", " if total_elements > 1 else "")
+                        truncate_length = 30
+                        value_string = str(bytes(field.parts[-1 - (total_elements - element_pos - 1) * 2]), encoding='utf-8')
+                        if len(value_string) > truncate_length:
+                            head = escape_markdown_inline_code(value_string[:truncate_length // 2])
+                            tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
+                            value = "{head}...{tail}".format(head=head, tail=tail)
+                        else:
+                            value = escape_markdown_inline_code(value_string)
+                        array_elements.append(value)
+
                 elif curr_type in reader.gguf_scalar_to_np:
                     render_element = min(7, total_elements)
                     for element_pos in range(render_element):
-                        value += str(field.parts[-1 - element_pos][0]) + (", " if total_elements > 1 else "")
-                value = f'[ {value}{" ..." if total_elements > 1 else ""} ]'
+                        array_elements.append(str(field.parts[-1 - (total_elements - element_pos - 1)][0]))
+
+                value = f'[ {", ".join(array_elements).strip()}{", ..." if total_elements > len(array_elements) else ""} ]'

         kv_dump_table.append({"n":n, "pretty_type":pretty_type, "total_elements":total_elements, "field_name":field.name, "value":value})

     kv_dump_table_header_map = [
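For reference, the backtick-escaping helper added above wraps a value in one more backtick than the longest backtick run it contains, and pads with spaces when the value itself starts or ends with a backtick. A standalone copy to show the behaviour (the sample strings are made up, not from the commit):

import re

def escape_markdown_inline_code(value_string):
    # Same logic as the helper added to the dump script above.
    max_backticks = max((len(m.group(0)) for m in re.finditer(r'`+', value_string)), default=0)
    inline_code_marker = '`' * (max_backticks + 1)
    if value_string.startswith('`') or value_string.endswith('`'):
        value_string = f" {value_string} "
    return f"{inline_code_marker}{value_string}{inline_code_marker}"

print(escape_markdown_inline_code("plain text"))             # `plain text`
print(escape_markdown_inline_code("uses ``double`` ticks"))  # ```uses ``double`` ticks```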
@ -54,7 +54,7 @@ class TestMetadataMethod(unittest.TestCase):
|
||||||
self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"),
|
self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"),
|
||||||
('Meta-Llama-3-8B', "NousResearch", 'Meta-Llama-3', None, None, '8B'))
|
('Meta-Llama-3-8B', "NousResearch", 'Meta-Llama-3', None, None, '8B'))
|
||||||
|
|
||||||
# Can't detect all non standard form in a heuristically safe way... best to err in caution and output nothing...
|
# Non standard naming
|
||||||
self.assertEqual(gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"),
|
self.assertEqual(gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"),
|
||||||
('Qwen1.5-MoE-A2.7B-Chat', None, 'Qwen1.5-MoE', 'Chat', None, 'A2.7B'))
|
('Qwen1.5-MoE-A2.7B-Chat', None, 'Qwen1.5-MoE', 'Chat', None, 'A2.7B'))
|
||||||
|
|
||||||
|
@ -71,7 +71,7 @@ class TestMetadataMethod(unittest.TestCase):
|
||||||
self.assertEqual(gguf.Metadata.get_model_id_components("delphi-suite/stories-llama2-50k", 50 * 10**3),
|
self.assertEqual(gguf.Metadata.get_model_id_components("delphi-suite/stories-llama2-50k", 50 * 10**3),
|
||||||
('stories-llama2-50k', 'delphi-suite', 'stories-llama2', None, None, '50K'))
|
('stories-llama2-50k', 'delphi-suite', 'stories-llama2', None, None, '50K'))
|
||||||
|
|
||||||
# None standard and not easy to disambiguate
|
# Non standard and not easy to disambiguate
|
||||||
self.assertEqual(gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"),
|
self.assertEqual(gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"),
|
||||||
('DeepSeek-Coder-V2-Lite-Instruct', None, 'DeepSeek-Coder-V2-Lite', 'Instruct', None, None))
|
('DeepSeek-Coder-V2-Lite-Instruct', None, 'DeepSeek-Coder-V2-Lite', 'Instruct', None, None))
|
||||||
|
|
||||||
|
@ -123,6 +123,51 @@ class TestMetadataMethod(unittest.TestCase):
|
||||||
self.assertEqual(gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"),
|
self.assertEqual(gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"),
|
||||||
('bloom-7b1-petals', 'bigscience', 'bloom', 'petals', None, '7.1B'))
|
('bloom-7b1-petals', 'bigscience', 'bloom', 'petals', None, '7.1B'))
|
||||||
|
|
||||||
|
# Ignore full-text size labels when there are number-based ones, and deduplicate size labels
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("MaziyarPanahi/GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1"),
|
||||||
|
('GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1', 'MaziyarPanahi', 'GreenNode-mini', 'multilingual-v1olet-Mistral-Instruct', 'v0.1', '7B'))
|
||||||
|
|
||||||
|
# Instruct in a name without a size label
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/Mistral-Nemo-Instruct-2407"),
|
||||||
|
('Mistral-Nemo-Instruct-2407', 'mistralai', 'Mistral-Nemo', 'Instruct', '2407', None))
|
||||||
|
|
||||||
|
# Non-obvious splitting relying on 'chat' keyword
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("deepseek-ai/DeepSeek-V2-Chat-0628"),
|
||||||
|
('DeepSeek-V2-Chat-0628', 'deepseek-ai', 'DeepSeek-V2', 'Chat', '0628', None))
|
||||||
|
|
||||||
|
# Multiple versions
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("OpenGVLab/Mini-InternVL-Chat-2B-V1-5"),
|
||||||
|
('Mini-InternVL-Chat-2B-V1-5', 'OpenGVLab', 'Mini-InternVL', 'Chat', 'V1-5', '2B'))
|
||||||
|
|
||||||
|
# TODO: DPO in the name
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("jondurbin/bagel-dpo-2.8b-v0.2"),
|
||||||
|
('bagel-dpo-2.8b-v0.2', 'jondurbin', 'bagel-dpo', None, 'v0.2', '2.8B'))
|
||||||
|
|
||||||
|
# DPO in name, but can't be used for the finetune to keep 'LLaMA-3' in the basename
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("voxmenthe/SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized"),
|
||||||
|
('SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized', 'voxmenthe', 'SFR-Iterative-DPO-LLaMA-3', 'R-unquantized', None, '8B'))
|
||||||
|
|
||||||
|
# Too ambiguous
|
||||||
|
# TODO: should "base" be a 'finetune' or 'size_label'?
|
||||||
|
# (in this case it should be a size label, but other models use it to signal that they are not finetuned)
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("microsoft/Florence-2-base"),
|
||||||
|
('Florence-2-base', 'microsoft', None, None, None, None))
|
||||||
|
|
||||||
|
## Invalid cases ##
|
||||||
|
|
||||||
|
# Start with a dash and has dashes in rows
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/-Mistral--Nemo-Base-2407-"),
|
||||||
|
('-Mistral--Nemo-Base-2407-', 'mistralai', 'Mistral-Nemo-Base', None, '2407', None))
|
||||||
|
|
||||||
|
## LoRA ##
|
||||||
|
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B"),
|
||||||
|
('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration-LoRA', None, '8B'))
|
||||||
|
|
||||||
|
# Negative size --> output is a LoRA adaper --> prune "LoRA" out of the name to avoid redundancy with the suffix
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B", -1234),
|
||||||
|
('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration', None, '8B'))
|
||||||
|
|
||||||
def test_apply_metadata_heuristic_from_model_card(self):
|
def test_apply_metadata_heuristic_from_model_card(self):
|
||||||
model_card = {
|
model_card = {
|
||||||
'tags': ['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'],
|
'tags': ['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'],
|
||||||
|
@ -134,7 +179,7 @@ class TestMetadataMethod(unittest.TestCase):
|
||||||
}
|
}
|
||||||
got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
|
got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
|
||||||
expect = gguf.Metadata()
|
expect = gguf.Metadata()
|
||||||
expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': 'v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
|
expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': '14-v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
|
||||||
expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl']
|
expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl']
|
||||||
expect.languages=['en']
|
expect.languages=['en']
|
||||||
expect.datasets=['teknium/OpenHermes-2.5']
|
expect.datasets=['teknium/OpenHermes-2.5']
|
||||||
|
|
|
@@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 6
+#define LLAMA_SESSION_VERSION 7

 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
 #define LLAMA_STATE_SEQ_VERSION 1

@@ -92,6 +92,9 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_CHATGLM4   = 17,
         LLAMA_VOCAB_PRE_TYPE_VIKING     = 18,
         LLAMA_VOCAB_PRE_TYPE_JAIS       = 19,
+        LLAMA_VOCAB_PRE_TYPE_TEKKEN     = 20,
+        LLAMA_VOCAB_PRE_TYPE_SMOLLM     = 21,
+        LLAMA_VOCAB_PRE_TYPE_CODESHELL  = 22,
     };

     // note: these values should be synchronized with ggml_rope
Binary file not shown.
Binary file not shown.
@@ -114,7 +114,7 @@

 // bump if necessary
 #define LLAMA_MAX_NODES   8192
-#define LLAMA_MAX_LAYERS  256
+#define LLAMA_MAX_LAYERS  512
 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2

 //

@@ -3875,7 +3875,7 @@ struct llama_model_loader {
             ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

             {
-                const int kid = gguf_find_key(meta, "general.file_type");
+                const int kid = gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
                 if (kid >= 0) {
                     ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
                 }

@@ -4007,7 +4007,9 @@ struct llama_model_loader {
                 throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
             }

-            GGML_ASSERT(arr_info.length <= N_MAX);
+            if (arr_info.length > N_MAX) {
+                throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
+            }

             std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());

@@ -4043,8 +4045,6 @@ struct llama_model_loader {
     // get array of n <= N_MAX elements, or a single element repeated n times
     template<typename T, size_t N_MAX>
     bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, const bool required = true) {
-        GGML_ASSERT(n <= N_MAX);
-
         const int kid = gguf_find_key(meta, key.c_str());

         if (kid < 0) {

@@ -4054,6 +4054,10 @@ struct llama_model_loader {
             return false;
         }

+        if (n > N_MAX) {
+            throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
+        }
+
         if (gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) {
             struct GGUFMeta::ArrayInfo arr_info =
                 GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);

@@ -5003,7 +5007,7 @@ static void llm_load_hparams(
         {
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
             switch (hparams.n_layer) {
-                case 42: model.type = e_model::MODEL_SMALL; break;
+                case 42: model.type = e_model::MODEL_7B; break;
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
@@ -5365,6 +5369,7 @@ static void llm_load_vocab(
         if (merges_keyidx == -1) {
             throw std::runtime_error("cannot find tokenizer merges in model file\n");
         }

         const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
         for (int i = 0; i < n_merges; i++) {
             const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);

@@ -5403,16 +5408,6 @@ static void llm_load_vocab(
         vocab.special_cls_id  = -1;
         vocab.special_mask_id = -1;

-        const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        } // The default value of add_space_prefix is true.
-
-        const int remove_extra_whitespaces_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
-        if (remove_extra_whitespaces_keyidx != -1) {
-            vocab.tokenizer_remove_extra_whitespaces = gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
-        } // The default value of remove_extra_whitespaces is false.
-
         const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
         if (precompiled_charsmap_keyidx != -1) {
             size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);

@@ -5520,6 +5515,19 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "jais") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+            } else if (
+                tokenizer_pre == "tekken") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+                vocab.tokenizer_clean_spaces = false;
+                vocab.tokenizer_ignore_merges = true;
+                vocab.tokenizer_add_bos = true;
+            } else if (
+                tokenizer_pre == "smollm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "codeshell") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }

@@ -5543,10 +5551,8 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }

-        const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        }
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
     }

     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -6127,10 +6133,10 @@ static bool llm_load_tensors(

                         layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

                         // optional bias tensors
                         layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

@@ -15544,6 +15550,8 @@ struct llm_tokenizer_bpe {
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
             case LLAMA_VOCAB_PRE_TYPE_REFACT:
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
+            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",

@@ -15581,6 +15589,13 @@ struct llm_tokenizer_bpe {
                     "\\p{N}",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
+                // original regex from tokenizer.json
+                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {

@@ -18271,8 +18286,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, ml.meta);
-    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
-    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
+    gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV

     // Remove split metadata
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());

@@ -19435,7 +19451,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
         case LLM_ARCH_PLAMO:
-        case LLM_ARCH_CODESHELL:
         case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:

@@ -19465,6 +19480,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_CODESHELL:
             return LLAMA_ROPE_TYPE_NEOX;

         // all model arches should be listed explicitly here

@@ -19920,7 +19936,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
     );

     // on session change it is very likely that the state size has changed - so we need to update this function
-    static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
+    static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");

     return s_total;
 }

@@ -21607,7 +21623,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == "chaglm4" || tmpl_contains("[gMASK]<sop>")) {
+    } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
@@ -70,21 +70,19 @@ add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
 target_link_libraries(test-tokenizer-0 PRIVATE common)
 install(TARGETS test-tokenizer-0 RUNTIME)

-llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
-# TODO: enable when fixed
-# https://github.com/ggerganov/llama.cpp/pull/7036
-#llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
-#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)

 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)

@@ -92,16 +90,14 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
 install(TARGETS test-tokenizer-1-bpe RUNTIME)

 # TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf)

 # build test-tokenizer-1-spm target once and add many tests
 add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
@@ -79,8 +79,16 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
                 im = nullptr;
             }
         }

         ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
         GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
+        // TODO: other cases
+        //#pragma omp parallel for
+        //for (int i = 0; i < tensor->ne[1]; i++) {
+        //    ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
+        //        i * tensor->ne[0], 1, tensor->ne[0], im);
+        //}
+
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.

@@ -2220,6 +2228,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
     }

+#if 1
     for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));

@@ -2239,6 +2248,24 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
         }
     }
+#else
+    // m = a rows
+    // n = b rows
+    // k = cols
+    std::uniform_int_distribution<> dist_m(1, 128);
+    std::uniform_int_distribution<> dist_n(16, 128);
+    std::uniform_int_distribution<> dist_k(1, 16);
+    for (int i = 0; i < 1000; i++) {
+        for (ggml_type type_a : all_types) {
+            for (ggml_type type_b : {GGML_TYPE_F32}) {
+                int m = dist_m(rng);
+                int n = dist_n(rng);
+                int k = dist_k(rng) * ggml_blck_size(type_a);
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
+            }
+        }
+    }
+#endif

     for (ggml_type type_a : other_types) {
         for (ggml_type type_b : {GGML_TYPE_F32}) {