diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1f275603a..a1e183d11 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -860,7 +860,7 @@ jobs: mkdir build cd build cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON - cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} + cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) - name: Determine tag name id: tag diff --git a/README.md b/README.md index 058919068..7c233b5e1 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) -[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) +[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) [![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cdab8b58e..898f67c86 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -48,34 +48,39 @@ class Model: dir_model: Path ftype: gguf.LlamaFileType + fname_out: Path is_big_endian: bool endianess: gguf.GGUFEndian use_temp_file: bool lazy: bool - model_name: str | None part_names: list[str] is_safetensors: bool hparams: dict[str, Any] block_count: int tensor_map: gguf.TensorNameMap tensor_names: set[str] | None - fname_out: Path gguf_writer: gguf.GGUFWriter + model_name: str | None + metadata_override: Path | None + dir_model_card: Path # subclasses should define this! 
model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, - model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, + use_temp_file: bool = False, eager: bool = False, + metadata_override: Path | None = None, model_name: str | None = None, + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") + self.dir_model = dir_model self.ftype = ftype + self.fname_out = fname_out self.is_big_endian = is_big_endian self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.use_temp_file = use_temp_file self.lazy = not eager - self.model_name = model_name self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: @@ -84,6 +89,11 @@ class Model: self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None + self.metadata_override = metadata_override + self.model_name = model_name + self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + + # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. 
_, first_tensor = next(self.get_tensors()) @@ -93,10 +103,8 @@ class Model: else: logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") self.ftype = gguf.LlamaFileType.MOSTLY_BF16 - ftype_up: str = self.ftype.name.partition("_")[2].upper() - ftype_lw: str = ftype_up.lower() - # allow templating the file name with the output ftype, useful with the "auto" ftype - self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) + + # Configure GGUF Writer self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) @@ -193,7 +201,6 @@ class Model: return new_name def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: @@ -232,6 +239,10 @@ class Model: self.gguf_writer.add_expert_used_count(n_experts_used) logger.info(f"gguf: experts used count = {n_experts_used}") + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: file type = {self.ftype}") @@ -250,7 +261,7 @@ class Model: return False - def write_tensors(self): + def prepare_tensors(self): max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") for name, data_torch in self.get_tensors(): @@ -333,9 +344,62 @@ class Model: self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MODEL) + + def prepare_metadata(self, vocab_only: bool): + + total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() + + self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params) + + # Fallback to model directory name if metadata name is still missing + if self.metadata.name is None: + self.metadata.name = self.dir_model.name + + # Generate parameter weight class (useful for leader boards) if not yet determined + if self.metadata.size_label is None and total_params > 0: + self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) + + # Extract the encoding scheme from the file type name. e.g. 
'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + output_type: str = self.ftype.name.partition("_")[2] + + # Filename Output + if self.fname_out.is_dir(): + # Generate default filename based on model specification and available metadata + if not vocab_only: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) + else: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") + + # Use the default filename + self.fname_out = self.fname_out / f"{fname_default}.gguf" + else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + + # Process templated file name with the output ftype, useful with the "auto" ftype + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) + + self.set_type() + + logger.info("Set meta model") + self.metadata.set_gguf_meta_model(self.gguf_writer) + + logger.info("Set model parameters") + self.set_gguf_parameters() + + logger.info("Set model tokenizer") + self.set_vocab() + + logger.info("Set model quantization version") + self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + def write(self): - self.write_tensors() - self.gguf_writer.write_header_to_file(self.fname_out) + self.prepare_tensors() + self.prepare_metadata(vocab_only=False) + self.gguf_writer.write_header_to_file(path=self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.close() @@ -343,7 +407,9 @@ class Model: def write_vocab(self): if len(self.gguf_writer.tensors) != 1: raise ValueError('Splitting the vocabulary is not supported') - self.gguf_writer.write_header_to_file(self.fname_out) + + self.prepare_metadata(vocab_only=True) + self.gguf_writer.write_header_to_file(path=self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() @@ -528,6 +594,9 @@ class Model: if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": # ref: https://huggingface.co/core42/jais-13b res = "jais" + if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e": + # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407 + res = "tekken" if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": # ref: https://huggingface.co/facebook/chameleon-7b res = "chameleon" @@ -688,7 +757,8 @@ class Model: token_id = int(token_id) token: str = token_data["content"] if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token.encode("utf-8") + if tokens[token_id] != token.encode("utf-8"): + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') if token_data.get("special") or self.does_token_look_special(token): toktypes[token_id] = SentencePieceTokenTypes.CONTROL else: @@ -783,7 +853,6 @@ class GPTNeoXModel(Model): def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) 
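For context on the templated-filename change above: the inline `.format()` call removed from `Model.__init__` is replaced by `gguf.fill_templated_filename()`, now invoked from `prepare_metadata()` once the output type is known. A minimal sketch of the behaviour implied by the removed code, assuming the helper keeps the same placeholder names (`{ftype}`/`{outtype}` lowercase, `{FTYPE}`/`{OUTTYPE}` uppercase):

```python
# Sketch only: mirrors the .format() call removed from Model.__init__ above;
# the real helper ships with gguf-py and may differ in detail.
def fill_templated_filename(filename: str, output_type: str | None) -> str:
    ftype_lw = output_type.lower() if output_type is not None else ""
    ftype_up = output_type.upper() if output_type is not None else ""
    return filename.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw,
                           OUTTYPE=ftype_up, FTYPE=ftype_up)

print(fill_templated_filename("Some-Model-{ftype}.gguf", "Q8_0"))   # Some-Model-q8_0.gguf
print(fill_templated_filename("Some-Model-{FTYPE}.gguf", "Q8_0"))   # Some-Model-Q8_0.gguf
```

This keeps `--outtype auto` usable with a templated `--outfile`, since the placeholder is only resolved in `prepare_metadata()` after the actual file type has been settled.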
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -839,7 +908,6 @@ class BloomModel(Model): model_arch = gguf.MODEL_ARCH.BLOOM def set_gguf_parameters(self): - self.gguf_writer.add_name("Bloom") n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) @@ -916,7 +984,6 @@ class MPTModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) self.gguf_writer.add_embedding_length(self.hparams["d_model"]) self.gguf_writer.add_block_count(block_count) @@ -955,7 +1022,6 @@ class OrionModel(Model): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -968,8 +1034,6 @@ class OrionModel(Model): raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -993,7 +1057,6 @@ class BaichuanModel(Model): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -1005,8 +1068,6 @@ class BaichuanModel(Model): else: raise ValueError("gguf: can not find ctx length parameter.") - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1120,7 +1181,6 @@ class XverseModel(Model): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -1132,8 +1192,6 @@ class XverseModel(Model): else: raise ValueError("gguf: can not find ctx length parameter.") - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1192,7 +1250,6 @@ class FalconModel(Model): if n_head_kv is None: n_head_kv = self.hparams.get("n_head_kv", 1) # old name - self.gguf_writer.add_name("Falcon") self.gguf_writer.add_context_length(2048) # not in config.json self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ 
-1237,7 +1294,6 @@ class StarCoderModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layer"] - self.gguf_writer.add_name("StarCoder") self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) @@ -1261,6 +1317,7 @@ class RefactModel(Model): special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("middle", 2) + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -1272,7 +1329,6 @@ class RefactModel(Model): block_count = self.hparams["n_layer"] - self.gguf_writer.add_name("Refact") # refact uses Alibi. So this is from config.json which might be used by training. self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -1327,7 +1383,6 @@ class StableLMModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1389,8 +1444,8 @@ class StableLMModel(Model): return [(new_name, data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._q_norms is not None or self._k_norms is not None: # flatten two `list[dict[str, Tensor]]` into a single `list[str]` @@ -1433,7 +1488,12 @@ class LlamaModel(Model): super().set_gguf_parameters() hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "linear": @@ -1506,8 +1566,8 @@ class LlamaModel(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -1570,7 +1630,6 @@ class GrokModel(Model): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_name("Grok") _experts: list[dict[str, Tensor]] | None = None @@ -1619,7 +1678,6 @@ class DbrxModel(Model): def set_gguf_parameters(self): ffn_config = self.hparams["ffn_config"] attn_config = self.hparams["attn_config"] - self.gguf_writer.add_name(self.hparams["model_type"]) self.gguf_writer.add_block_count(self.hparams["n_layers"]) self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) @@ -1688,7 +1746,6 @@ class MiniCPMModel(Model): def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name("MiniCPM") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1758,7 +1815,6 @@ class QwenModel(Model): 
self._set_vocab_qwen() def set_gguf_parameters(self): - self.gguf_writer.add_name("Qwen") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1834,8 +1890,8 @@ class Qwen2MoeModel(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -1849,7 +1905,6 @@ class GPT2Model(Model): model_arch = gguf.MODEL_ARCH.GPT2 def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_context_length(self.hparams["n_ctx"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -1892,7 +1947,6 @@ class Phi2Model(Model): n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_name("Phi2") self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) self.gguf_writer.add_embedding_length(n_embd) @@ -1971,7 +2025,8 @@ class Phi3MiniModel(Model): token_id = int(token_id) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -1987,7 +2042,8 @@ class Phi3MiniModel(Model): token_id = int(foken_data["id"]) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2014,7 +2070,6 @@ class Phi3MiniModel(Model): orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) rope_dims = n_embd // n_head - self.gguf_writer.add_name("Phi3") self.gguf_writer.add_context_length(max_pos_embds) self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) self.gguf_writer.add_embedding_length(n_embd) @@ -2071,7 +2126,6 @@ class PlamoModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name("PLaMo") self.gguf_writer.add_context_length(4096) # not in config.json self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) @@ -2116,7 +2170,6 @@ class CodeShellModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layer"] - self.gguf_writer.add_name("CodeShell") self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) @@ -2229,7 +2282,8 @@ class InternLM2Model(Model): chat_eos_token_id = token_id token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert(tokens[token_id] == token) + if tokens[token_id] 
!= token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2248,7 +2302,8 @@ class InternLM2Model(Model): chat_eos_token_id = token_id token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert(tokens[token_id] == token) + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2275,7 +2330,6 @@ class InternLM2Model(Model): special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): - self.gguf_writer.add_name("InternLM2") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -2435,6 +2489,7 @@ class GemmaModel(Model): special_vocab._set_special_token("middle", 68) special_vocab._set_special_token("fsep", 70) special_vocab._set_special_token("eot", 107) + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) self.gguf_writer.add_add_space_prefix(False) @@ -2443,7 +2498,6 @@ class GemmaModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -2484,7 +2538,6 @@ class Gemma2Model(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -2559,7 +2612,6 @@ class MambaModel(Model): # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading @@ -2738,7 +2790,6 @@ class OpenELMModel(Model): assert self.block_count == len(self._num_query_heads) assert self.block_count == len(self._ffn_dims) - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_context_length(self.hparams["max_context_length"]) self.gguf_writer.add_embedding_length(n_embd) @@ -2912,8 +2963,8 @@ class ArcticModel(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -2991,8 +3042,8 @@ class DeepseekV2Model(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + 
super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -3110,7 +3161,6 @@ class T5Model(Model): self.gguf_writer.add_add_eos_token(True) def set_gguf_parameters(self): - self.gguf_writer.add_name("T5") if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: logger.warning("Couldn't find context length in config.json, assuming default value of 512") n_ctx = 512 @@ -3184,7 +3234,6 @@ class JaisModel(Model): self._set_vocab_gpt2() def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -3230,8 +3279,8 @@ class JaisModel(Model): return tensors - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) @@ -3390,7 +3439,6 @@ class ChatGLMModel(Model): special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): - self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_head_kv = self.hparams.get("multi_query_group_num", n_head) @@ -3584,6 +3632,10 @@ def parse_args() -> argparse.Namespace: "--no-tensor-first-split", action="store_true", help="do not add tensors to the first split (disabled by default)" ) + parser.add_argument( + "--metadata", type=Path, + help="Specify the path for an authorship metadata override file" + ) return parser.parse_args() @@ -3609,7 +3661,10 @@ def split_str_to_n_bytes(split_str: str) -> int: def main() -> None: args = parse_args() - logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) dir_model = args.model @@ -3633,34 +3688,30 @@ def main() -> None: if args.outfile is not None: fname_out = args.outfile else: - # output in the same directory as the model by default - fname_out = dir_model / 'ggml-model-{ftype}.gguf' + fname_out = dir_model logger.info(f"Loading model: {dir_model.name}") hparams = Model.load_hparams(dir_model) with torch.inference_mode(): + output_type = ftype_map[args.outtype] + model_architecture = hparams["architectures"][0] + try: - model_class = Model.from_model_architecture(hparams["architectures"][0]) + model_class = Model.from_model_architecture(model_architecture) except NotImplementedError: - logger.error(f"Model {hparams['architectures'][0]} is not supported") + logger.error(f"Model {model_architecture} is not supported") sys.exit(1) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, - args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors, + model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out, + is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, + eager=args.no_lazy, + metadata_override=args.metadata, model_name=args.model_name, + split_max_tensors=args.split_max_tensors, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split) - logger.info("Set model parameters") - 
model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL) - model_instance.set_gguf_parameters() - - logger.info("Set model tokenizer") - model_instance.set_vocab() - - model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - if args.vocab_only: logger.info("Exporting model vocab...") model_instance.write_vocab() diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 5e1d06091..c9558faf8 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -91,6 +91,7 @@ models = [ {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, + {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, ] diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 4bb939d45..a88d0d4a9 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -251,6 +251,10 @@ def parse_args() -> argparse.Namespace: "--verbose", action="store_true", help="increase output verbosity", ) + parser.add_argument( + "--dry-run", action="store_true", + help="only print out what will be done, without writing any new files", + ) parser.add_argument( "--base", type=Path, required=True, help="directory containing base model file", @@ -286,7 +290,7 @@ if __name__ == '__main__': fname_out = args.outfile else: # output in the same directory as the model by default - fname_out = dir_lora / 'ggml-lora-{ftype}.gguf' + fname_out = dir_lora if os.path.exists(input_model): # lazy import load_file only if lora is in safetensors format. 
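The new `--metadata` flag is what replaces the scattered `add_name()` / `add_source_hf_repo()` calls deleted throughout this patch: authorship fields now come from `gguf.Metadata.load()`, which builds metadata from the model directory/model card plus an optional override file. The override schema itself is not shown in this diff; the sketch below assumes it accepts the same `general.*` keys that the legacy `Metadata.load()` in `examples/convert_legacy_llama.py` (removed further down) read, so treat the key names and file name as illustrative:

```python
# Hypothetical authorship override, e.g. for:
#   python convert_hf_to_gguf.py <model_dir> --outtype auto --metadata my-metadata.json
# Key names follow the legacy loader removed below; gguf.Metadata.load() may accept more.
import json

override = {
    "general.name": "My Fine Tune",
    "general.author": "Jane Doe",
    "general.version": "v1.0",
    "general.description": "Example metadata override for GGUF conversion",
    "general.license": "apache-2.0",
    "general.source.url": "https://huggingface.co/jane-doe/my-fine-tune",
}

with open("my-metadata.json", "w") as f:
    json.dump(override, f, indent=2)
```

With no `--outfile` given, the converter now defaults to the model directory and derives the file name from this metadata via `gguf.naming_convention()` in `prepare_metadata()`.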
@@ -310,6 +314,23 @@ if __name__ == '__main__': class LoraModel(model_class): model_arch = model_class.model_arch + lora_alpha: float + + def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs): + + super().__init__(*args, **kwargs) + + self.dir_model_card = dir_lora_model + self.lora_alpha = float(lora_alpha) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.ADAPTER) + self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") + + def set_gguf_parameters(self): + self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) + super().set_gguf_parameters() + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_map: dict[str, PartialLoraTensor] = {} @@ -350,6 +371,11 @@ if __name__ == '__main__': yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_b", lora_b) + with open(lora_config, "r") as f: + lparams: dict[str, Any] = json.load(f) + + alpha: float = lparams["lora_alpha"] + model_instance = LoraModel( dir_base_model, ftype, @@ -357,18 +383,11 @@ if __name__ == '__main__': is_big_endian=args.bigendian, use_temp_file=False, eager=args.no_lazy, - model_name=None, + dry_run=args.dry_run, + dir_lora_model=dir_lora, + lora_alpha=alpha, ) - with open(lora_config, "r") as f: - lparams: dict[str, Any] = json.load(f) - - alpha = lparams["lora_alpha"] - - model_instance.gguf_writer.add_string(gguf.Keys.General.TYPE, gguf.GGUFType.ADAPTER) - model_instance.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") - model_instance.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha)) - model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") model_instance.write() logger.info(f"Model successfully exported to {model_instance.fname_out}") diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index c2c73e8ad..9ab9ab06e 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ -24,7 +24,7 @@ from abc import ABC, abstractmethod from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional +from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar import numpy as np @@ -346,42 +346,6 @@ class Params: return params -@dataclass -class Metadata: - name: Optional[str] = None - author: Optional[str] = None - version: Optional[str] = None - url: Optional[str] = None - description: Optional[str] = None - license: Optional[str] = None - source_url: Optional[str] = None - source_hf_repo: Optional[str] = None - - @staticmethod - def load(metadata_path: Path) -> Metadata: - if metadata_path is None or not metadata_path.exists(): - return Metadata() - - with open(metadata_path, 'r') as file: - data = json.load(file) - - # Create a new Metadata instance - metadata = Metadata() - - # Assigning values to Metadata attributes if they exist in the JSON file - # This is based on LLM_KV_NAMES mapping in llama.cpp - metadata.name = data.get("general.name") - metadata.author = data.get("general.author") - metadata.version = data.get("general.version") - metadata.url = data.get("general.url") - metadata.description = data.get("general.description") - metadata.license = data.get("general.license") - metadata.source_url = data.get("general.source.url") - metadata.source_hf_repo = data.get("general.source.huggingface.repository") - - return metadata - - # # data 
loading # TODO: reuse (probably move to gguf.py?) @@ -806,7 +770,7 @@ class OutputFile: def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) - def add_meta_model(self, params: Params, metadata: Metadata | None) -> None: + def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None: # Metadata About The Model And Its Provenence name = "LLaMA" if metadata is not None and metadata.name is not None: @@ -824,16 +788,73 @@ class OutputFile: self.gguf.add_author(metadata.author) if metadata.version is not None: self.gguf.add_version(metadata.version) - if metadata.url is not None: - self.gguf.add_url(metadata.url) + if metadata.organization is not None: + self.gguf.add_organization(metadata.organization) + + if metadata.finetune is not None: + self.gguf.add_finetune(metadata.finetune) + if metadata.basename is not None: + self.gguf.add_basename(metadata.basename) + if metadata.description is not None: self.gguf.add_description(metadata.description) + if metadata.quantized_by is not None: + self.gguf.add_quantized_by(metadata.quantized_by) + + if metadata.size_label is not None: + self.gguf.add_size_label(metadata.size_label) + if metadata.license is not None: - self.gguf.add_licence(metadata.license) + self.gguf.add_license(metadata.license) + if metadata.license_name is not None: + self.gguf.add_license_name(metadata.license_name) + if metadata.license_link is not None: + self.gguf.add_license_link(metadata.license_link) + + if metadata.url is not None: + self.gguf.add_url(metadata.url) + if metadata.doi is not None: + self.gguf.add_doi(metadata.doi) + if metadata.uuid is not None: + self.gguf.add_uuid(metadata.uuid) + if metadata.repo_url is not None: + self.gguf.add_repo_url(metadata.repo_url) + if metadata.source_url is not None: self.gguf.add_source_url(metadata.source_url) - if metadata.source_hf_repo is not None: - self.gguf.add_source_hf_repo(metadata.source_hf_repo) + if metadata.source_doi is not None: + self.gguf.add_source_doi(metadata.source_doi) + if metadata.source_uuid is not None: + self.gguf.add_source_uuid(metadata.source_uuid) + if metadata.source_repo_url is not None: + self.gguf.add_source_repo_url(metadata.source_repo_url) + + if metadata.base_models is not None: + self.gguf.add_base_model_count(len(metadata.base_models)) + for key, base_model_entry in enumerate(metadata.base_models): + if "name" in base_model_entry: + self.gguf.add_base_model_name(key, base_model_entry["name"]) + if "author" in base_model_entry: + self.gguf.add_base_model_author(key, base_model_entry["author"]) + if "version" in base_model_entry: + self.gguf.add_base_model_version(key, base_model_entry["version"]) + if "organization" in base_model_entry: + self.gguf.add_base_model_organization(key, base_model_entry["organization"]) + if "url" in base_model_entry: + self.gguf.add_base_model_url(key, base_model_entry["url"]) + if "doi" in base_model_entry: + self.gguf.add_base_model_doi(key, base_model_entry["doi"]) + if "uuid" in base_model_entry: + self.gguf.add_base_model_uuid(key, base_model_entry["uuid"]) + if "repo_url" in base_model_entry: + self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"]) + + if metadata.tags is not None: + self.gguf.add_tags(metadata.tags) + if metadata.languages is not None: + self.gguf.add_languages(metadata.languages) + if metadata.datasets is not None: + self.gguf.add_datasets(metadata.datasets) def 
add_meta_arch(self, params: Params) -> None: # Metadata About The Neural Architecture Itself @@ -944,7 +965,7 @@ class OutputFile: @staticmethod def write_vocab_only( fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) @@ -978,7 +999,7 @@ class OutputFile: fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, - metadata: Metadata | None = None, + metadata: gguf.Metadata | None = None, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) @@ -1021,35 +1042,32 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT raise ValueError(f"Unexpected combination of types: {name_to_type}") -def model_parameter_count(model: LazyModel) -> int: - total_model_parameters = 0 - for i, (name, lazy_tensor) in enumerate(model.items()): - sum_weights_in_tensor = 1 +def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]: + total_params = 0 + shared_params = 0 + expert_params = 0 + + for name, lazy_tensor in tensors: + # We don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + continue + + # Got A Tensor + sum_weights_in_tensor: int = 1 + + # Tensor Volume for dim in lazy_tensor.shape: sum_weights_in_tensor *= dim - total_model_parameters += sum_weights_in_tensor - return total_model_parameters + if ".experts." in name: + if ".experts.0." 
in name: + expert_params += sum_weights_in_tensor + else: + shared_params += sum_weights_in_tensor -def model_parameter_count_rounded_notation(model_params_count: int) -> str: - if model_params_count > 1e12 : - # Trillions Of Parameters - scaled_model_params = model_params_count * 1e-12 - scale_suffix = "T" - elif model_params_count > 1e9 : - # Billions Of Parameters - scaled_model_params = model_params_count * 1e-9 - scale_suffix = "B" - elif model_params_count > 1e6 : - # Millions Of Parameters - scaled_model_params = model_params_count * 1e-6 - scale_suffix = "M" - else: - # Thousands Of Parameters - scaled_model_params = model_params_count * 1e-3 - scale_suffix = "K" + total_params += sum_weights_in_tensor - return f"{round(scaled_model_params)}{scale_suffix}" + return total_params, shared_params, expert_params def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: @@ -1231,34 +1249,24 @@ class VocabFactory: return vocab, special_vocab -def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str: - quantization = { +def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str: + name = metadata.name if metadata.name is not None else None + basename = metadata.basename if metadata.basename is not None else None + finetune = metadata.finetune if metadata.finetune is not None else None + version = metadata.version if metadata.version is not None else None + size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0) + + output_type = { GGMLFileType.AllF32: "F32", GGMLFileType.MostlyF16: "F16", GGMLFileType.MostlyQ8_0: "Q8_0", }[file_type] - parameters = model_parameter_count_rounded_notation(model_params_count) - - expert_count = "" - if params.n_experts is not None: - expert_count = f"{params.n_experts}x" - - version = "" - if metadata is not None and metadata.version is not None: - version = f"-{metadata.version}" - - name = "ggml-model" - if metadata is not None and metadata.name is not None: - name = metadata.name - elif params.path_model is not None: - name = params.path_model.name - - return f"{name}{version}-{expert_count}{parameters}-{quantization}" + return gguf.naming_convention(name, basename, finetune, version, size_label, output_type) -def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path: - default_filename = default_convention_outfile(file_type, params, model_params_count, metadata) +def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path: + default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata) ret = model_paths[0].parent / f"{default_filename}.gguf" if ret in model_paths: logger.error( @@ -1297,8 +1305,9 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") parser.add_argument("--verbose", action="store_true", help="increase output verbosity") - parser.add_argument("--metadata", type=Path, help="Specify the path 
for a metadata file") + parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file") parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name") + parser.add_argument("--model-name", type=str, default=None, help="name of the model") args = parser.parse_args(args_in) @@ -1310,32 +1319,36 @@ def main(args_in: list[str] | None = None) -> None: else: logging.basicConfig(level=logging.INFO) - metadata = Metadata.load(args.metadata) + model_name = args.model_name + dir_model = args.model + + metadata = gguf.Metadata.load(args.metadata, dir_model, model_name) if args.get_outfile: - model_plus = load_some_model(args.model) + model_plus = load_some_model(dir_model) params = Params.load(model_plus) - model = convert_model_names(model_plus.model, params, args.skip_unknown) - model_params_count = model_parameter_count(model_plus.model) - ftype = pick_output_type(model, args.outtype) - print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100 + model = convert_model_names(model_plus.model, params, args.skip_unknown) + model_params_count = per_model_weight_count_estimation(model_plus.model.items()) + ftype = pick_output_type(model, args.outtype) + + if (metadata is None or metadata.name is None) and params.path_model is not None: + metadata.name = params.path_model.name + + print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100 return if args.no_vocab and args.vocab_only: raise ValueError("--vocab-only does not make sense with --no-vocab") if args.dump_single: - model_plus = lazy_load_file(args.model) + model_plus = lazy_load_file(dir_model) do_dump_model(model_plus) return if not args.vocab_only: - model_plus = load_some_model(args.model) + model_plus = load_some_model(dir_model) else: - model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) - - model_params_count = model_parameter_count(model_plus.model) - logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})") + model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None) if args.dump: do_dump_model(model_plus) @@ -1368,7 +1381,7 @@ def main(args_in: list[str] | None = None) -> None: logger.info(f"params = {params}") model_parent_path = model_plus.paths[0].parent - vocab_path = Path(args.vocab_dir or args.model or model_parent_path) + vocab_path = Path(args.vocab_dir or dir_model or model_parent_path) vocab_factory = VocabFactory(vocab_path) vocab_types = None if args.no_vocab else args.vocab_type.split(",") vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path) @@ -1399,13 +1412,21 @@ def main(args_in: list[str] | None = None) -> None: assert params is not None + if metadata.name is None and params.path_model is not None: + metadata.name = params.path_model.name + + model_params_count = per_model_weight_count_estimation(model_plus.model.items()) + logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})") + logger.info(f"Vocab info: {vocab}") logger.info(f"Special vocab info: {special_vocab}") model = model_plus.model model = convert_model_names(model, params, args.skip_unknown) ftype = pick_output_type(model, args.outtype) model = convert_to_output_type(model, ftype) - outfile = args.outfile or default_outfile(model_plus.paths, 
ftype, params, model_params_count, metadata) + outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata) + + metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0) params.ftype = ftype logger.info(f"Writing {outfile}, format {ftype}") diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 575143771..7498f85ef 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) { struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); + if (!ctx) { + fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str()); + return false; + } + printf("%s: version: %d\n", __func__, gguf_get_version(ctx)); printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 92a6b16b1..2aafe2316 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { - return env->NewStringUTF(""); + return nullptr; } auto new_token_chars = llama_token_to_piece(context, new_token_id); diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 2a3f9f758..58c32ca53 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -26,11 +26,12 @@ actor LlamaContext { private var context: OpaquePointer private var batch: llama_batch private var tokens_list: [llama_token] + var is_done: Bool = false /// This variable is used to store temporarily invalid cchars private var temporary_invalid_cchars: [CChar] - var n_len: Int32 = 64 + var n_len: Int32 = 1024 var n_cur: Int32 = 0 var n_decode: Int32 = 0 @@ -160,6 +161,7 @@ actor LlamaContext { if llama_token_is_eog(model, new_token_id) || n_cur == n_len { print("\n") + is_done = true let new_token_str = String(cString: temporary_invalid_cchars + [0]) temporary_invalid_cchars.removeAll() return new_token_str diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index 2c1e3f61b..b8f6a31d5 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -132,7 +132,7 @@ class LlamaState: ObservableObject { messageLog += "\(text)" Task.detached { - while await llamaContext.n_cur < llamaContext.n_len { + while await !llamaContext.is_done { let result = await llamaContext.completion_loop() await MainActor.run { self.messageLog += "\(result)" diff --git a/examples/pydantic_models_to_grammar_examples.py b/examples/pydantic_models_to_grammar_examples.py old mode 100644 new mode 100755 index 504ed98df..eb000d5cc --- a/examples/pydantic_models_to_grammar_examples.py +++ b/examples/pydantic_models_to_grammar_examples.py @@ -1,8 +1,15 @@ -# Function calling example using pydantic models. 
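Stepping back to `per_model_weight_count_estimation()` in `examples/convert_legacy_llama.py` above: unlike the old `model_parameter_count()`, it returns a `(total, shared, expert)` triple so that `gguf.size_label()` can label MoE models by expert count and per-expert size. A toy illustration of the intended split (sketch only: tensor names and shapes are invented, and it assumes the final `else` pairs with the outer `".experts."` check, so non-expert weights land in `shared` and only `experts.0.*` lands in `expert`):

```python
from types import SimpleNamespace

# Stand-ins for LazyTensor: only .shape is consulted by the helper.
toy_model = {
    "model.embed_tokens.weight":              SimpleNamespace(shape=(32000, 4096)),
    "model.layers.0.mlp.experts.0.w1.weight": SimpleNamespace(shape=(14336, 4096)),
    "model.layers.0.mlp.experts.1.w1.weight": SimpleNamespace(shape=(14336, 4096)),
}

# Assumes per_model_weight_count_estimation() from the hunk above is in scope.
total, shared, expert = per_model_weight_count_estimation(toy_model.items())
# total  -> every weight counted once (embedding + both expert tensors)
# shared -> embedding only (non-expert weights)
# expert -> experts.0.* only, i.e. the size of a single expert
```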
+#!/usr/bin/env python3 + +"""Function calling example using pydantic models.""" + from __future__ import annotations +import argparse import datetime import json +import logging +import textwrap +import sys from enum import Enum from typing import Optional, Union @@ -12,30 +19,54 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation) -# Function to get completion on the llama.cpp server with grammar. -def create_completion(prompt, grammar): +def create_completion(host, prompt, gbnf_grammar): + """Calls the /completion API on llama-server. + + See + https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints + """ + print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}") headers = {"Content-Type": "application/json"} - data = {"prompt": prompt, "grammar": grammar} - - response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data) - data = response.json() - + data = {"prompt": prompt, "grammar": gbnf_grammar} + result = requests.post(f"http://{host}/completion", headers=headers, json=data).json() assert data.get("error") is None, data - - print(data["content"]) - return data["content"] + logging.info("Result: %s", result) + content = result["content"] + print(f" Model: {result['model']}") + print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}") + return content # A function for the agent to send a message to the user. class SendMessageToUser(BaseModel): - """ - Send a message to the User. - """ + """Send a message to the User.""" chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.") message: str = Field(..., description="Message you want to send to the user.") def run(self): - print(self.message) + print(f"SendMessageToUser: {self.message}") + + +def example_rce(host): + """Minimal test case where the LLM call an arbitrary python function.""" + print("- example_rce") + tools = [SendMessageToUser] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=tools, outer_object_name="function", + outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") + system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + user_message = "What is 42 * 42?" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + tools_map = {tool.__name__:tool for tool in tools} + # This finds "SendMessageToUser": + tool = tools_map.get(json_data["function"]) + if not tool: + print(f"Error: unknown tool {json_data['function']}") + return 1 + tool(**json_data["function_parameters"]).run() + return 0 # Enum for the calculator tool. @@ -46,11 +77,11 @@ class MathOperation(Enum): DIVIDE = "divide" -# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt. +# Simple pydantic calculator tool for the agent that can add, subtract, +# multiply, and divide. Docstring and description of fields will be used in +# system prompt. 
class Calculator(BaseModel): - """ - Perform a math operation on two numbers. - """ + """Perform a math operation on two numbers.""" number_one: Union[int, float] = Field(..., description="First number.") operation: MathOperation = Field(..., description="Math operation to perform.") number_two: Union[int, float] = Field(..., description="Second number.") @@ -68,55 +99,61 @@ class Calculator(BaseModel): raise ValueError("Unknown operation.") -# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM. -# pydantic_model_list is the list of pydanitc models -# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated -# outer_object_content is the name of outer object content. -# model_prefix is the optional prefix for models in the documentation. (Default="Output Model") -# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields") -gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( - pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function", - outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") +def example_calculator(host): + """Have the LLM ask to get a calculation done. -print(gbnf_grammar) -print(documentation) + Here the grammar gets generated by passing the available function models to + generate_gbnf_grammar_and_documentation function. This also generates a + documentation usable by the LLM. -system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + pydantic_model_list is the list of pydantic models outer_object_name is an + optional name for an outer object around the actual model object. Like a + "function" object with "function_parameters" which contains the actual model + object. If None, no outer object will be generated outer_object_content is + the name of outer object content. -user_message = "What is 42 * 42?" -prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant" - -text = create_completion(prompt=prompt, grammar=gbnf_grammar) -# This should output something like this: -# { -# "function": "calculator", -# "function_parameters": { -# "number_one": 42, -# "operation": "multiply", -# "number_two": 42 -# } -# } -function_dictionary = json.loads(text) -if function_dictionary["function"] == "calculator": - function_parameters = {**function_dictionary["function_parameters"]} - - print(Calculator(**function_parameters).run()) - # This should output: 1764 + model_prefix is the optional prefix for models in the documentation. (Default="Output Model") + fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields") + """ + print("- example_calculator") + tools = [SendMessageToUser, Calculator] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=tools, outer_object_name="function", + outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") + system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. 
The following are the available functions and their parameters and types:\n\n" + documentation + user_message1 = "What is 42 * 42?" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + expected = { + "function": "Calculator", + "function_parameters": { + "number_one": 42, + "operation": "multiply", + "number_two": 42 + } + } + if json_data != expected: + print(" Result is not as expected!") + tools_map = {tool.__name__:tool for tool in tools} + # This finds "Calculator": + tool = tools_map.get(json_data["function"]) + if not tool: + print(f"Error: unknown tool {json_data['function']}") + return 1 + result = tool(**json_data["function_parameters"]).run() + print(f" Call {json_data['function']} gave result {result}") + return 0 -# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text. class Category(Enum): - """ - The category of the book. - """ + """The category of the book.""" Fiction = "Fiction" NonFiction = "Non-Fiction" class Book(BaseModel): - """ - Represents an entry about a book. - """ + """Represents an entry about a book.""" title: str = Field(..., description="Title of the book.") author: str = Field(..., description="Author of the book.") published_year: Optional[int] = Field(..., description="Publishing year of the book.") @@ -125,33 +162,42 @@ class Book(BaseModel): summary: str = Field(..., description="Summary of the book.") -# We need no additional parameters other than our list of pydantic models. -gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book]) +def example_struct(host): + """A example structured output based on pydantic models. -system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation + The LLM will create an entry for a Book database out of an unstructured + text. We need no additional parameters other than our list of pydantic + models. + """ + print("- example_struct") + tools = [Book] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools) + system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation + text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands.""" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + # In this case, there's no function nor function_parameters. + # Here the result will vary based on the LLM used. 
+    keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
+    if keys != sorted(json_data.keys()):
+        print(f"Unexpected result: {sorted(json_data.keys())}")
+        return 1
+    book = Book(**json_data)
+    print(f"  As a Book object: {book}")
+    return 0

-text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
-
-text = create_completion(prompt=prompt, grammar=gbnf_grammar)
-
-json_data = json.loads(text)
-
-print(Book(**json_data))
-# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
 def get_current_datetime(output_format: Optional[str] = None):
-    """
-    Get the current date and time in the given format.
+    """Get the current date and time in the given format.
+
     Args:
         output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
     """
-    if output_format is None:
-        output_format = '%Y-%m-%d %H:%M:%S'
-    return datetime.datetime.now().strftime(output_format)
+    return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S")

-# Example function to get the weather
+# Example function to get the weather.
 def get_current_weather(location, unit):
     """Get the current weather in a given location"""
     if "London" in location:
@@ -160,68 +206,107 @@ def get_current_weather(location, unit):
         return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
     elif "North Pole" in location:
         return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
-    else:
-        return json.dumps({"location": location, "temperature": "unknown"})
+    return json.dumps({"location": location, "temperature": "unknown"})

-# Here is a function definition in OpenAI style
-current_weather_tool = {
-    "type": "function",
-    "function": {
-        "name": "get_current_weather",
-        "description": "Get the current weather in a given location",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "location": {
-                    "type": "string",
-                    "description": "The city and state, e.g. San Francisco, CA",
+def example_concurrent(host):
+    """An example for parallel function calling with a Python function, a pydantic
+    function model and an OpenAI-like function definition.
+    """
+    print("- example_concurrent")
+    # Function definition in OpenAI style.
+    current_weather_tool = {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                 },
-                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                "required": ["location"],
             },
-            "required": ["location"],
         },
-    },
-}
+    }
+    # Convert OpenAI function definition into pydantic model.
+    current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
+    # Add the actual function to a pydantic model.
+ current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather) -# Convert OpenAI function definition into pydantic model -current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool) -# Add the actual function to a pydantic model -current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather) + # Convert normal Python function to a pydantic model. + current_datetime_model = create_dynamic_model_from_function(get_current_datetime) -# Convert normal Python function to a pydantic model -current_datetime_model = create_dynamic_model_from_function(get_current_datetime) - -tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model] + tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=tools, outer_object_name="function", + outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True) + system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation + text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42""" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + expected = [ + { + "function": "get_current_datetime", + "params": { + "output_format": "%Y-%m-%d %H:%M:%S" + } + }, + { + "function": "get_current_weather", + "params": { + "location": "London", + "unit": "celsius" + } + }, + { + "function": "Calculator", + "params": { + "number_one": 42, + "operation": "multiply", + "number_two": 42 + } + } + ] + res = 0 + if json_data != expected: + print(" Result is not as expected!") + print(" This can happen on highly quantized models") + res = 1 + tools_map = {tool.__name__:tool for tool in tools} + for call in json_data: + tool = tools_map.get(call["function"]) + if not tool: + print(f"Error: unknown tool {call['function']}") + return 1 + result = tool(**call["params"]).run() + print(f" Call {call['function']} returned {result}") + # Should output something like this: + # Call get_current_datetime returned 2024-07-15 09:50:38 + # Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"} + # Call Calculator returned 1764 + return res -gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( - pydantic_model_list=tool_list, outer_object_name="function", - outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True) - -system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. 
You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation +def main(): + parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__) + parser.add_argument("--host", default="localhost:8080", help="llama.cpp server") + parser.add_argument("-v", "--verbose", action="store_true", help="enables logging") + args = parser.parse_args() + logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR) + ret = 0 + # Comment out below to only run the example you want. + ret = ret or example_rce(args.host) + ret = ret or example_calculator(args.host) + ret = ret or example_struct(args.host) + ret = ret or example_concurrent(args.host) + return ret -text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42""" -prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" - -text = create_completion(prompt=prompt, grammar=gbnf_grammar) - -json_data = json.loads(text) - -print(json_data) -# Should output something like this: -# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}] - - -for call in json_data: - if call["function"] == "Calculator": - print(Calculator(**call["params"]).run()) - elif call["function"] == "get_current_datetime": - print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue] - elif call["function"] == "get_current_weather": - print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue] -# Should output something like this: -# 2024-01-14 13:36:06 -# {"location": "London", "temperature": "42", "unit": "celsius"} -# 1764 +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/server/README.md b/examples/server/README.md index e477d1501..ff4074517 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -444,7 +444,7 @@ node index.js `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity. - `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. + `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. 
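As a minimal sketch of how the `n_keep` default and the `-1` sentinel described above might be exercised (assuming a llama.cpp server listening on `localhost:8080`, as in the examples in this PR, and the third-party `requests` package; the prompt and values are illustrative, not taken from this change):

```python
# Minimal sketch: ask the server to keep the first 32 prompt tokens (BOS excluded)
# when the context window overflows. Assumes `pip install requests` and a llama.cpp
# server running on localhost:8080.
import requests

response = requests.post(
    "http://localhost:8080/completion",
    json={
        "prompt": "Building a website can be done in 10 simple steps:",
        "n_predict": 128,  # generate at most 128 tokens
        "n_keep": 32,      # retain the first 32 prompt tokens on overflow; -1 would retain all
    },
)
print(response.json()["content"])
```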
diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index 987b9a3b8..36818f764 100644 --- a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -21,7 +21,7 @@ let generation_settings = null; // export async function* llama(prompt, params = {}, config = {}) { let controller = config.controller; - const api_url = config.api_url || ""; + const api_url = config.api_url?.replace(/\/+$/, '') || ""; if (!controller) { controller = new AbortController(); @@ -196,7 +196,7 @@ export const llamaComplete = async (params, controller, callback) => { // Get the model info from the server. This is useful for getting the context window and so on. export const llamaModelInfo = async (config = {}) => { if (!generation_settings) { - const api_url = config.api_url || ""; + const api_url = config.api_url?.replace(/\/+$/, '') || ""; const props = await fetch(`${api_url}/props`).then(r => r.json()); generation_settings = props.default_generation_settings; } diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index 5513e9121..bf2b0a3f0 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -14,10 +14,10 @@