From 050046fa4532a955f0847874a73530594f7257c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 24 Aug 2023 09:07:42 +0300 Subject: [PATCH 1/8] gitignore : add dist and rm pyproject.toml --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f3121794a..6cb7d9bc6 100644 --- a/.gitignore +++ b/.gitignore @@ -60,6 +60,7 @@ compile_commands.json CMakeSettings.json __pycache__ +dist zig-out/ zig-cache/ @@ -70,7 +71,6 @@ perf-*.txt examples/jeopardy/results.txt -pyproject.toml poetry.lock poetry.toml From 5dd870574e9ac0296b36574b61a4a87ae2af5d4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 24 Aug 2023 09:08:19 +0300 Subject: [PATCH 2/8] gguf: prepare as Pip package --- gguf.py | 723 -------------------------------------------------------- 1 file changed, 723 deletions(-) delete mode 100755 gguf.py diff --git a/gguf.py b/gguf.py deleted file mode 100755 index 5c37f0f0b..000000000 --- a/gguf.py +++ /dev/null @@ -1,723 +0,0 @@ -#!/usr/bin/env python3 -import shutil -import sys -import struct -import tempfile -import numpy as np - -from enum import IntEnum, auto -from typing import Any, IO, List, Optional - -# -# constants -# - -GGUF_MAGIC = 0x46554747 -GGUF_VERSION = 1 -GGUF_DEFAULT_ALIGNMENT = 32 - -# general -KEY_GENERAL_ARCHITECTURE = "general.architecture" -KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version" -KEY_GENERAL_ALIGNMENT = "general.alignment" -KEY_GENERAL_NAME = "general.name" -KEY_GENERAL_AUTHOR = "general.author" -KEY_GENERAL_URL = "general.url" -KEY_GENERAL_DESCRIPTION = "general.description" -KEY_GENERAL_LICENSE = "general.license" -KEY_GENERAL_SOURCE_URL = "general.source.url" -KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository" -KEY_GENERAL_FILE_TYPE = "general.file_type" - -# LLM -KEY_CONTEXT_LENGTH = "{arch}.context_length" -KEY_EMBEDDING_LENGTH = "{arch}.embedding_length" -KEY_BLOCK_COUNT 
= "{arch}.block_count" -KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" -KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" -KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" - -# attention -KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count" -KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv" -KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" -KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv" -KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" -KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" - -# RoPE -KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" -KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear" - -# tokenization -KEY_TOKENIZER_MODEL = "tokenizer.ggml.model" -KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens" -KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type" -KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores" -KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges" -KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id" -KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id" -KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id" -KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id" -KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id" -KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json" -KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world" - - -# -# recommended mapping of model tensor names for storage in gguf -# - - -class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - - -class MODEL_TENSOR(IntEnum): - TOKEN_EMBD = auto() - POS_EMBD = auto() - OUTPUT = auto() - OUTPUT_NORM = auto() - ROPE_FREQS = auto() - ATTN_Q = auto() - ATTN_K = auto() - ATTN_V = auto() - ATTN_QKV = auto() - ATTN_OUT = auto() - ATTN_NORM = auto() - ATTN_NORM_2 = auto() - ATTN_ROT_EMBD = auto() - FFN_GATE = auto() - FFN_DOWN = auto() - FFN_UP = auto() - FFN_NORM = auto() - - -MODEL_ARCH_NAMES = { - MODEL_ARCH.LLAMA: 
"llama", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", -} - -MODEL_TENSOR_NAMES = { - MODEL_ARCH.LLAMA: { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - }, - MODEL_ARCH.GPTNEOX: { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - }, - MODEL_ARCH.FALCON: { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - }, - MODEL_ARCH.GPT2: { - # TODO - }, - # TODO -} - -# tensors that will not be serialized -MODEL_TENSOR_SKIP = { - MODEL_ARCH.LLAMA: [ - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], -} - - -# TODO: the following helper functions should be removed -# instead, get_tensor_name_map should 
return tuples of (name, MODEL_TENSOR) -# however, my Python is very bad, and I couldn't figure out how to do this, hence these functions -# REMOVE -def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool: - for skip in MODEL_TENSOR_SKIP.get(arch, []): - for i in range(n_blocks): - if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i): - return True - - return False - - -def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict: - tensor_map = {} - - # Token embeddings - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None) - - tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox - tensor_map["transformer.wte"] = mapped_to # gpt2 mpt - tensor_map["transformer.word_embeddings"] = mapped_to # falcon - tensor_map["model.embed_tokens"] = mapped_to # llama-hf - tensor_map["tok_embeddings"] = mapped_to # llama-pth - - # Position embeddings - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None) - - tensor_map["transformer.wpe"] = mapped_to # gpt2 - - # Output - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None) - - tensor_map["embed_out"] = mapped_to # gptneox - tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf - tensor_map["output"] = mapped_to # llama-pth - - # Output norm - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None) - - tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox - tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon - tensor_map["transformer.norm_f"] = mapped_to # mpt - tensor_map["model.norm"] = mapped_to # llama-hf - tensor_map["norm"] = mapped_to # llama-pth - - # Rope frequencies - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None) - - tensor_map["rope.freqs"] = mapped_to # llama-pth - - # Attention and feed-forward blocks - for i in range(0, n_blocks): - # Attention norm - # TODO: is there are simpler way to write these 2 lines in Python? 
- mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None) - mapped_to = mapped_to.format(bid=i) if mapped_to else None - - tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox - tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2 - tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt - tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b - tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b - tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth - - # Attention norm 2 - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b - - # Attention query-key-value - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox - tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2 - tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt - tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon - - # Attention query - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth - - # Attention key - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf - 
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth - - # Attention value - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth - - # Attention output - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox - tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2 - tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt - tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon - tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth - - # Rotary embeddings - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to # llama-pth - - # Feed-forward norm - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox - tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2 - tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt - tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth - - # Feed-forward up - mapped_to = 
MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox - tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2 - tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt - tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon - tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth - - # Feed-forward gate - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth - - # Feed-forward down - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox - tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2 - tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt - tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon - tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth - - return tensor_map - - -class TokenType(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 - USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 - -# -# implementation -# - - -class GGMLQuantizationType(IntEnum): - F32 = 0 - F16 = 1 - Q4_0 = 2 - Q4_1 = 3 - Q5_0 = 6 - Q5_1 = 7 - Q8_0 = 8 - Q8_1 = 9 - Q2_K = 10 - Q3_K = 11 - Q4_K = 12 - Q5_K = 13 - Q6_K = 14 - Q8_K = 15 - - -class GGUFValueType(IntEnum): - UINT8 = 0 - INT8 = 1 - UINT16 = 2 - INT16 = 3 - UINT32 
= 4 - INT32 = 5 - FLOAT32 = 6 - BOOL = 7 - STRING = 8 - ARRAY = 9 - - @staticmethod - def get_type(val): - if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray): - return GGUFValueType.STRING - elif isinstance(val, list): - return GGUFValueType.ARRAY - elif isinstance(val, float): - return GGUFValueType.FLOAT32 - elif isinstance(val, bool): - return GGUFValueType.BOOL - elif isinstance(val, int): - return GGUFValueType.INT32 - else: - print("Unknown type: "+str(type(val))) - sys.exit() - - -class GGUFWriter: - def __init__(self, path: str, arch: str, use_temp_file = True): - self.fout = open(path, "wb") - self.arch = arch - self.offset_tensor = 0 - self.data_alignment = GGUF_DEFAULT_ALIGNMENT - self.kv_data = b"" - self.kv_data_count = 0 - self.ti_data = b"" - self.ti_data_count = 0 - self.add_architecture() - self.use_temp_file = use_temp_file - self.tensors = [] - - def write_header_to_file(self): - self.fout.write(struct.pack(" int: - return ((x + n - 1) // n) * n - - def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None): - assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now" - - encoded_name = name.encode("utf8") - self.ti_data += struct.pack(" Date: Thu, 24 Aug 2023 09:09:52 +0300 Subject: [PATCH 3/8] gguf: prepare as Pip package --- gguf-py/LICENSE | 21 ++ gguf-py/README.md | 34 ++ gguf-py/gguf/__init__.py | 3 + gguf-py/gguf/gguf.py | 723 +++++++++++++++++++++++++++++++++++++ gguf-py/pyproject.toml | 28 ++ gguf-py/tests/test_gguf.py | 7 + 6 files changed, 816 insertions(+) create mode 100644 gguf-py/LICENSE create mode 100644 gguf-py/README.md create mode 100644 gguf-py/gguf/__init__.py create mode 100644 gguf-py/gguf/gguf.py create mode 100644 gguf-py/pyproject.toml create mode 100644 gguf-py/tests/test_gguf.py diff --git a/gguf-py/LICENSE b/gguf-py/LICENSE new 
file mode 100644 index 000000000..4d1c4c326 --- /dev/null +++ b/gguf-py/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Georgi Gerganov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/gguf-py/README.md b/gguf-py/README.md new file mode 100644 index 000000000..9b6870fa3 --- /dev/null +++ b/gguf-py/README.md @@ -0,0 +1,34 @@ +## gguf + +This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302) +(GGML Universal File) format. + +See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py) +as an example for its usage. + +## Install +```sh +pip install gguf +``` + +## Development +Maintainers who participate in development of this package are advised to install it in editable mode: + + +```sh +cd /path/to/llama.cpp/gguf + +pip install --editable . 
+``` + +**Note**: This may require to upgrade your Pip installation, with a message saying that editable installation currently requires `setup.py`. +In this case, upgrade Pip to the latest: + +```sh +pip install --upgrade pip +``` + +## TODO + +- [ ] Add tests +- [ ] Include conversion scripts as command line entry points in this package. diff --git a/gguf-py/gguf/__init__.py b/gguf-py/gguf/__init__.py new file mode 100644 index 000000000..cfbad8ed0 --- /dev/null +++ b/gguf-py/gguf/__init__.py @@ -0,0 +1,3 @@ +from .gguf import GGUFWriter + +__version__ = '0.1.0' diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py new file mode 100644 index 000000000..5c37f0f0b --- /dev/null +++ b/gguf-py/gguf/gguf.py @@ -0,0 +1,723 @@ +#!/usr/bin/env python3 +import shutil +import sys +import struct +import tempfile +import numpy as np + +from enum import IntEnum, auto +from typing import Any, IO, List, Optional + +# +# constants +# + +GGUF_MAGIC = 0x46554747 +GGUF_VERSION = 1 +GGUF_DEFAULT_ALIGNMENT = 32 + +# general +KEY_GENERAL_ARCHITECTURE = "general.architecture" +KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version" +KEY_GENERAL_ALIGNMENT = "general.alignment" +KEY_GENERAL_NAME = "general.name" +KEY_GENERAL_AUTHOR = "general.author" +KEY_GENERAL_URL = "general.url" +KEY_GENERAL_DESCRIPTION = "general.description" +KEY_GENERAL_LICENSE = "general.license" +KEY_GENERAL_SOURCE_URL = "general.source.url" +KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository" +KEY_GENERAL_FILE_TYPE = "general.file_type" + +# LLM +KEY_CONTEXT_LENGTH = "{arch}.context_length" +KEY_EMBEDDING_LENGTH = "{arch}.embedding_length" +KEY_BLOCK_COUNT = "{arch}.block_count" +KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" +KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" +KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" + +# attention +KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count" +KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv" 
+KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" +KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv" +KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" +KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" + +# RoPE +KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" +KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear" + +# tokenization +KEY_TOKENIZER_MODEL = "tokenizer.ggml.model" +KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens" +KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type" +KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores" +KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges" +KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id" +KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id" +KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id" +KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id" +KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id" +KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json" +KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world" + + +# +# recommended mapping of model tensor names for storage in gguf +# + + +class MODEL_ARCH(IntEnum): + LLAMA = auto() + FALCON = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + + +class MODEL_TENSOR(IntEnum): + TOKEN_EMBD = auto() + POS_EMBD = auto() + OUTPUT = auto() + OUTPUT_NORM = auto() + ROPE_FREQS = auto() + ATTN_Q = auto() + ATTN_K = auto() + ATTN_V = auto() + ATTN_QKV = auto() + ATTN_OUT = auto() + ATTN_NORM = auto() + ATTN_NORM_2 = auto() + ATTN_ROT_EMBD = auto() + FFN_GATE = auto() + FFN_DOWN = auto() + FFN_UP = auto() + FFN_NORM = auto() + + +MODEL_ARCH_NAMES = { + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", +} + +MODEL_TENSOR_NAMES = { + MODEL_ARCH.LLAMA: { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + 
MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + }, + MODEL_ARCH.GPTNEOX: { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + }, + MODEL_ARCH.FALCON: { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + }, + MODEL_ARCH.GPT2: { + # TODO + }, + # TODO +} + +# tensors that will not be serialized +MODEL_TENSOR_SKIP = { + MODEL_ARCH.LLAMA: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], +} + + +# TODO: the following helper functions should be removed +# instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR) +# however, my Python is very bad, and I couldn't figure out how to do this, hence these functions +# REMOVE +def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool: + for skip in MODEL_TENSOR_SKIP.get(arch, []): + for i in range(n_blocks): + if name == 
MODEL_TENSOR_NAMES[arch][skip].format(bid=i): + return True + + return False + + +def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict: + tensor_map = {} + + # Token embeddings + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None) + + tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox + tensor_map["transformer.wte"] = mapped_to # gpt2 mpt + tensor_map["transformer.word_embeddings"] = mapped_to # falcon + tensor_map["model.embed_tokens"] = mapped_to # llama-hf + tensor_map["tok_embeddings"] = mapped_to # llama-pth + + # Position embeddings + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None) + + tensor_map["transformer.wpe"] = mapped_to # gpt2 + + # Output + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None) + + tensor_map["embed_out"] = mapped_to # gptneox + tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf + tensor_map["output"] = mapped_to # llama-pth + + # Output norm + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None) + + tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox + tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon + tensor_map["transformer.norm_f"] = mapped_to # mpt + tensor_map["model.norm"] = mapped_to # llama-hf + tensor_map["norm"] = mapped_to # llama-pth + + # Rope frequencies + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None) + + tensor_map["rope.freqs"] = mapped_to # llama-pth + + # Attention and feed-forward blocks + for i in range(0, n_blocks): + # Attention norm + # TODO: is there are simpler way to write these 2 lines in Python? 
+ mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None) + mapped_to = mapped_to.format(bid=i) if mapped_to else None + + tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox + tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2 + tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt + tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b + tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b + tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf + tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth + + # Attention norm 2 + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None) + mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None + + tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b + + # Attention query-key-value + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None) + mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None + + tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox + tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2 + tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt + tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon + + # Attention query + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None) + mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None + + tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf + tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth + + # Attention key + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None) + mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None + + tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf + 
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth + + # Attention value + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None) + mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None + + tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf + tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth + + # Attention output + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None) + mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None + + tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox + tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2 + tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt + tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon + tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf + tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth + + # Rotary embeddings + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None) + mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None + + tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"] = mapped_to # llama-hf + tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to # llama-pth + + # Feed-forward norm + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None) + mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None + + tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox + tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2 + tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt + tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf + tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth + + # Feed-forward up + mapped_to = 
MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None) + mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None + + tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox + tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2 + tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt + tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon + tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf + tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth + + # Feed-forward gate + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None) + mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None + + tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf + tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth + + # Feed-forward down + mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None) + mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None + + tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox + tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2 + tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt + tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon + tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf + tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth + + return tensor_map + + +class TokenType(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + +# +# implementation +# + + +class GGMLQuantizationType(IntEnum): + F32 = 0 + F16 = 1 + Q4_0 = 2 + Q4_1 = 3 + Q5_0 = 6 + Q5_1 = 7 + Q8_0 = 8 + Q8_1 = 9 + Q2_K = 10 + Q3_K = 11 + Q4_K = 12 + Q5_K = 13 + Q6_K = 14 + Q8_K = 15 + + +class GGUFValueType(IntEnum): + UINT8 = 0 + INT8 = 1 + UINT16 = 2 + INT16 = 3 + UINT32 
= 4 + INT32 = 5 + FLOAT32 = 6 + BOOL = 7 + STRING = 8 + ARRAY = 9 + + @staticmethod + def get_type(val): + if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray): + return GGUFValueType.STRING + elif isinstance(val, list): + return GGUFValueType.ARRAY + elif isinstance(val, float): + return GGUFValueType.FLOAT32 + elif isinstance(val, bool): + return GGUFValueType.BOOL + elif isinstance(val, int): + return GGUFValueType.INT32 + else: + print("Unknown type: "+str(type(val))) + sys.exit() + + +class GGUFWriter: + def __init__(self, path: str, arch: str, use_temp_file = True): + self.fout = open(path, "wb") + self.arch = arch + self.offset_tensor = 0 + self.data_alignment = GGUF_DEFAULT_ALIGNMENT + self.kv_data = b"" + self.kv_data_count = 0 + self.ti_data = b"" + self.ti_data_count = 0 + self.add_architecture() + self.use_temp_file = use_temp_file + self.tensors = [] + + def write_header_to_file(self): + self.fout.write(struct.pack(" int: + return ((x + n - 1) // n) * n + + def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None): + assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now" + + encoded_name = name.encode("utf8") + self.ti_data += struct.pack(""] +packages = [ + {include = "gguf"}, +] +readme = "README.md" +homepage = "https://ggml.ai" +repository = "https://github.com/ggerganov/llama.cpp" +keywords = ["ggml", "gguf", "llama.cpp"] +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] + +[tool.poetry.dependencies] +python = ">=3.8" +numpy = ">=1.17" + +[tool.poetry.dev-dependencies] +pytest = "^5.2" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/gguf-py/tests/test_gguf.py b/gguf-py/tests/test_gguf.py new file mode 
100644 index 000000000..2296ab599 --- /dev/null +++ b/gguf-py/tests/test_gguf.py @@ -0,0 +1,7 @@ +from gguf import __version__ + +# TODO: add tests + + +def test_version(): + assert __version__ == '0.1.0' From 0288361b65a094501ae9a36d57b5e9fc96429cc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 24 Aug 2023 09:26:13 +0300 Subject: [PATCH 4/8] gguf : fix line endings --- gguf-py/LICENSE | 42 +++++++++++------------ gguf-py/README.md | 68 +++++++++++++++++++------------------- gguf-py/gguf/__init__.py | 6 ++-- gguf-py/pyproject.toml | 56 +++++++++++++++---------------- gguf-py/tests/test_gguf.py | 14 ++++---- 5 files changed, 93 insertions(+), 93 deletions(-) diff --git a/gguf-py/LICENSE b/gguf-py/LICENSE index 4d1c4c326..76f67efdc 100644 --- a/gguf-py/LICENSE +++ b/gguf-py/LICENSE @@ -1,21 +1,21 @@ -MIT License - -Copyright (c) 2023 Georgi Gerganov - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
\ No newline at end of file +MIT License + +Copyright (c) 2023 Georgi Gerganov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/gguf-py/README.md b/gguf-py/README.md index 9b6870fa3..3e1c27d4e 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -1,34 +1,34 @@ -## gguf - -This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302) -(GGML Universal File) format. - -See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py) -as an example for its usage. - -## Install -```sh -pip install gguf -``` - -## Development -Maintainers who participate in development of this package are advised to install it in editable mode: - - -```sh -cd /path/to/llama.cpp/gguf - -pip install --editable . -``` - -**Note**: This may require to upgrade your Pip installation, with a message saying that editable installation currently requires `setup.py`. 
-In this case, upgrade Pip to the latest: - -```sh -pip install --upgrade pip -``` - -## TODO - -- [ ] Add tests -- [ ] Include conversion scripts as command line entry points in this package. +## gguf + +This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302) +(GGML Universal File) format. + +See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py) +as an example for its usage. + +## Install +```sh +pip install gguf +``` + +## Development +Maintainers who participate in development of this package are advised to install it in editable mode: + + +```sh +cd /path/to/llama.cpp/gguf-py + +pip install --editable . +``` + +**Note**: This may require to upgrade your Pip installation, with a message saying that editable installation currently requires `setup.py`. +In this case, upgrade Pip to the latest: + +```sh +pip install --upgrade pip +``` + +## TODO + +- [ ] Add tests +- [ ] Include conversion scripts as command line entry points in this package. 
diff --git a/gguf-py/gguf/__init__.py b/gguf-py/gguf/__init__.py index cfbad8ed0..1a7d93032 100644 --- a/gguf-py/gguf/__init__.py +++ b/gguf-py/gguf/__init__.py @@ -1,3 +1,3 @@ -from .gguf import GGUFWriter - -__version__ = '0.1.0' +from .gguf import GGUFWriter + +__version__ = '0.1.0' diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index de74adb9a..87605af26 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,28 +1,28 @@ -[tool.poetry] -name = "gguf" -version = "0.1.0" -description = "Write ML models in GGUF for GGML" -authors = ["GGML "] -packages = [ - {include = "gguf"}, -] -readme = "README.md" -homepage = "https://ggml.ai" -repository = "https://github.com/ggerganov/llama.cpp" -keywords = ["ggml", "gguf", "llama.cpp"] -classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", -] - -[tool.poetry.dependencies] -python = ">=3.8" -numpy = ">=1.17" - -[tool.poetry.dev-dependencies] -pytest = "^5.2" - -[build-system] -requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" +[tool.poetry] +name = "gguf" +version = "0.1.0" +description = "Write ML models in GGUF for GGML" +authors = ["GGML "] +packages = [ + {include = "gguf"}, +] +readme = "README.md" +homepage = "https://ggml.ai" +repository = "https://github.com/ggerganov/llama.cpp" +keywords = ["ggml", "gguf", "llama.cpp"] +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] + +[tool.poetry.dependencies] +python = ">=3.8" +numpy = ">=1.17" + +[tool.poetry.dev-dependencies] +pytest = "^5.2" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/gguf-py/tests/test_gguf.py b/gguf-py/tests/test_gguf.py index 2296ab599..992cc1481 100644 --- a/gguf-py/tests/test_gguf.py +++ b/gguf-py/tests/test_gguf.py @@ -1,7 +1,7 @@ -from gguf import 
__version__ - -# TODO: add tests - - -def test_version(): - assert __version__ == '0.1.0' +from gguf import __version__ + +# TODO: add tests + + +def test_version(): + assert __version__ == '0.1.0' From 87338093d6bca7d9355d4dc6e887bc172823014c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 25 Aug 2023 08:47:19 +0300 Subject: [PATCH 5/8] requirements : add gguf --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 6c32cbd04..7dc51edb1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ numpy==1.24 sentencepiece==0.1.98 +gguf>=0.1.0 From 8798aea2475322d8a1f9dd399c55763e33b5a60b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 25 Aug 2023 09:02:36 +0300 Subject: [PATCH 6/8] gguf : update readme with build notes --- gguf-py/README.md | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/gguf-py/README.md b/gguf-py/README.md index 3e1c27d4e..6786d2907 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -6,7 +6,7 @@ This is a Python package for writing binary files in the [GGUF](https://github.c See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py) as an example for its usage. -## Install +## Installation ```sh pip install gguf ``` @@ -14,7 +14,6 @@ pip install gguf ## Development Maintainers who participate in development of this package are advised to install it in editable mode: - ```sh cd /path/to/llama.cpp/gguf-py @@ -28,7 +27,28 @@ In this case, upgrade Pip to the latest: pip install --upgrade pip ``` -## TODO +## Publishing +To publish the package, you need to have `twine` and `build` installed: +```sh +pip install build twine +``` + +Then, follow these steps to release a new version: + +1. Update versions in `pyproject.toml` and `__init__.py`. +2. Build the package: + +```sh +python -m build +``` + +3.
Upload the generated distribution archives: + +```sh +python -m twine upload dist/* +``` + +## TODO - [ ] Add tests - [ ] Include conversion scripts as command line entry points in this package. From 2897926d90ddefd95704fb1a34f8b71a06818733 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 25 Aug 2023 09:06:33 +0300 Subject: [PATCH 7/8] gguf : update readme with build notes --- gguf-py/README.md | 3 ++- gguf-py/gguf/__init__.py | 2 -- gguf-py/pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gguf-py/README.md b/gguf-py/README.md index 6786d2907..03ad306ec 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -36,7 +36,7 @@ pip install build twine Then, follow these steps to release a new version: -1. Update versions in `pyproject.toml` and `__init__.py`. +1. Update the version in `pyproject.toml`. 2. Build the package: ```sh @@ -52,3 +52,4 @@ python -m twine upload dist/* ## TODO - [ ] Add tests - [ ] Include conversion scripts as command line entry points in this package. +- [ ] Add CI workflow for releasing the package.
diff --git a/gguf-py/gguf/__init__.py b/gguf-py/gguf/__init__.py index 1a7d93032..718ea71eb 100644 --- a/gguf-py/gguf/__init__.py +++ b/gguf-py/gguf/__init__.py @@ -1,3 +1 @@ from .gguf import GGUFWriter - -__version__ = '0.1.0' diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 87605af26..a6bce9460 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.1.0" +version = "0.2.0" description = "Write ML models in GGUF for GGML" authors = ["GGML "] packages = [ From 0248ca811e076ac0017e4cb35651ca6b57c3bfd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 25 Aug 2023 09:08:05 +0300 Subject: [PATCH 8/8] gguf : add notes for tests --- gguf-py/tests/test_gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gguf-py/tests/test_gguf.py b/gguf-py/tests/test_gguf.py index 992cc1481..512531dd2 100644 --- a/gguf-py/tests/test_gguf.py +++ b/gguf-py/tests/test_gguf.py @@ -1,7 +1,7 @@ -from gguf import __version__ +import gguf # TODO: add tests -def test_version(): - assert __version__ == '0.1.0' +def test_write_gguf(): + pass