gguf.py : pick some of the refactoring from #2644

commit 39362f3485 (parent 673ae1a17e)
Author: Georgi Gerganov
Date:   2023-08-17 17:02:01 +03:00
2 changed files with 63 additions and 68 deletions


@@ -705,19 +705,17 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:

 class OutputFile:
     def __init__(self, fname_out: Path) -> None:
-        self.gguf = gguf.GGUFWriter.open(fname_out)
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

     def add_meta_arch(self, params: Params) -> None:
-        arch = gguf.MODEL_ARCH_NAMES[ARCH]
-        self.gguf.add_architecture        (arch)
-        self.gguf.add_context_length      (arch, params.n_ctx)
-        self.gguf.add_embedding_length    (arch, params.n_embd)
-        self.gguf.add_block_count         (arch, params.n_layer)
-        self.gguf.add_feed_forward_length (arch, params.n_ff)
-        self.gguf.add_rope_dimension_count(arch, params.n_embd // params.n_head)
-        self.gguf.add_head_count          (arch, params.n_head)
-        self.gguf.add_head_count_kv       (arch, params.n_head_kv)
-        self.gguf.add_layer_norm_rms_eps  (arch, params.f_norm_eps)
+        self.gguf.add_context_length      (params.n_ctx)
+        self.gguf.add_embedding_length    (params.n_embd)
+        self.gguf.add_block_count         (params.n_layer)
+        self.gguf.add_feed_forward_length (params.n_ff)
+        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
+        self.gguf.add_head_count          (params.n_head)
+        self.gguf.add_head_count_kv       (params.n_head_kv)
+        self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)

     def add_meta_vocab(self, vocab: Vocab) -> None:
         tokens = []

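For context, the net effect on the conversion script above is that the architecture is supplied once, at construction time, rather than with every metadata call. A minimal sketch of the old and new call patterns (the "llama" string and the context length are illustrative values):

    import gguf

    # Old API (before this commit): open via classmethod, pass the arch
    # string to each per-model setter.
    # w = gguf.GGUFWriter.open("model.gguf")
    # w.add_architecture("llama")
    # w.add_context_length("llama", 2048)

    # New API (after this commit): the arch is bound in the constructor,
    # general.architecture is written automatically, and the setters take
    # only the value.
    w = gguf.GGUFWriter("model.gguf", "llama")
    w.add_context_length(2048)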
gguf.py

@@ -33,24 +33,24 @@ KEY_GENERAL_SOURCE_URL = "general.source.url"
 KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"

 # LLM
-KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"
-KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length"
-KEY_LLM_BLOCK_COUNT = "{llm}.block_count"
-KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length"
-KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual"
-KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout"
+KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
+KEY_LLM_EMBEDDING_LENGTH = "{arch}.embedding_length"
+KEY_LLM_BLOCK_COUNT = "{arch}.block_count"
+KEY_LLM_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
+KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
+KEY_LLM_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"

 # attention
-KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"
-KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv"
-KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias"
-KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv"
-KEY_ATTENTION_LAYERNORM_EPS = "{llm}.attention.layer_norm_epsilon"
-KEY_ATTENTION_LAYERNORM_RMS_EPS = "{llm}.attention.layer_norm_rms_epsilon"
+KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
+KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
+KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
+KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv"
+KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
+KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"

 # RoPE
-KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count"
-KEY_ROPE_SCALE = "{llm}.rope.scale"
+KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
+KEY_ROPE_SCALE = "{arch}.rope.scale"

 # tokenization
 KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
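The placeholder rename is purely a formatting detail: these templated keys are resolved with str.format when a value is written. A small sketch of what the {arch} substitution produces (the architecture name is illustrative):

    KEY_LLM_CONTEXT_LENGTH   = "{arch}.context_length"
    KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"

    arch = "llama"
    print(KEY_LLM_CONTEXT_LENGTH.format(arch=arch))    # llama.context_length
    print(KEY_ATTENTION_HEAD_COUNT.format(arch=arch))  # llama.attention.head_count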
@@ -343,14 +343,16 @@ class GGUFValueType(IntEnum):


 class GGUFWriter:
-    def __init__(self, fout: IO):
-        self.fout = fout
+    def __init__(self, path: str, arch: str):
+        self.fout = open(path, "wb")
+        self.arch = arch
         self.offset_tensor = 0
         self.data_alignment = GGUF_DEFAULT_ALIGNMENT
         self.kv_data = b""
         self.kv_data_count = 0
         self.ti_data = b""
         self.ti_data_count = 0
+        self.add_architecture()

     def write_header_to_file(self):
         self.fout.write(struct.pack("<I", GGUF_MAGIC))
@@ -368,11 +370,6 @@
         self.fout.write(self.ti_data)
         self.flush()

-    @classmethod
-    def open(cls, path: str) -> "GGUFWriter":
-        f = open(path, "wb")
-        return cls(f)
-
     def add_key(self, key: str):
         self.add_val(key, GGUFValueType.STRING, add_vtype=False)

@@ -409,7 +406,8 @@
         self.add_val(val, GGUFValueType.BOOL)

     def add_string(self, key: str, val: str):
-        if len(val) == 0: return
+        if len(val) == 0:
+            return
         self.add_key(key)
         self.add_val(val, GGUFValueType.STRING)

@@ -463,6 +461,8 @@
         return ((x + n - 1) // n) * n

     def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int):
+        assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
+
         encoded_name = name.encode("utf8")
         self.ti_data += struct.pack("<I", len(encoded_name))
         self.ti_data += encoded_name
@@ -471,7 +471,6 @@
         for i in range(n_dims):
             self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
-        assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

         dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
         self.ti_data += struct.pack("<I", dtype)
         self.ti_data += struct.pack("<Q", self.offset_tensor)
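Moving the dtype assertion to the top of add_tensor_info (the previous two hunks) means an unsupported dtype is now rejected before any bytes are appended to the tensor-info buffer. A small sketch of the resulting behavior (the tensor name, shape, and byte count are hypothetical):

    import numpy as np

    w = GGUFWriter("check.gguf", "llama")
    try:
        # int8 is neither F32 nor F16, so the assertion fires up front.
        w.add_tensor_info("tok_embd.weight", np.array([32000, 4096]), np.int8, 32000 * 4096)
    except AssertionError as e:
        print(e)             # Only F32 and F16 tensors are supported for now
    assert w.ti_data == b""  # no partial tensor-info record was written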
@@ -495,15 +494,14 @@
     def close(self):
         self.fout.close()

-    def add_architecture(self, architecture: str):
-        self.add_string(KEY_GENERAL_ARCHITECTURE,
-                        architecture)
+    def add_architecture(self):
+        self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)

     def add_author(self, author: str):
         self.add_string(KEY_GENERAL_AUTHOR, author)

     def add_tensor_data_layout(self, layout: str):
-        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT , layout)
+        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

     def add_url(self, url: str):
         self.add_string(KEY_GENERAL_URL, url)
@@ -531,60 +529,60 @@
         self.data_alignment = alignment
         self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)

-    def add_context_length(self, llm: str, length: int):
+    def add_context_length(self, length: int):
         self.add_uint32(
-            KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length)
+            KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)

-    def add_embedding_length(self, llm: str, length: int):
+    def add_embedding_length(self, length: int):
         self.add_uint32(
-            KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length)
+            KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)

-    def add_block_count(self, llm: str, length: int):
+    def add_block_count(self, length: int):
         self.add_uint32(
-            KEY_LLM_BLOCK_COUNT.format(llm=llm), length)
+            KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)

-    def add_feed_forward_length(self, llm: str, length: int):
+    def add_feed_forward_length(self, length: int):
         self.add_uint32(
-            KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length)
+            KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

-    def add_parallel_residual(self, llm: str, use: bool):
+    def add_parallel_residual(self, use: bool):
         self.add_bool(
-            KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use)
+            KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

-    def add_tensor_data_layout(self, llm: str, layout: str):
+    def add_tensor_data_layout(self, layout: str):
         self.add_string(
-            KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout)
+            KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

-    def add_head_count(self, llm: str, count: int):
+    def add_head_count(self, count: int):
         self.add_uint32(
-            KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count)
+            KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)

-    def add_head_count_kv(self, llm: str, count: int):
+    def add_head_count_kv(self, count: int):
         self.add_uint32(
-            KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count)
+            KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)

-    def add_max_alibi_bias(self, llm: str, bias: float):
+    def add_max_alibi_bias(self, bias: float):
         self.add_float32(
-            KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias)
+            KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)

-    def add_clamp_kqv(self, llm: str, value: float):
+    def add_clamp_kqv(self, value: float):
         self.add_float32(
-            KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value)
+            KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)

-    def add_layer_norm_eps(self, llm: str, value: float):
+    def add_layer_norm_eps(self, value: float):
         self.add_float32(
-            KEY_ATTENTION_LAYERNORM_EPS.format(llm=llm), value)
+            KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)

-    def add_layer_norm_rms_eps(self, llm: str, value: float):
+    def add_layer_norm_rms_eps(self, value: float):
         self.add_float32(
-            KEY_ATTENTION_LAYERNORM_RMS_EPS.format(llm=llm), value)
+            KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)

-    def add_rope_dimension_count(self, llm: str, count: int):
+    def add_rope_dimension_count(self, count: int):
         self.add_uint32(
-            KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count)
+            KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)

-    def add_rope_scale(self, llm: str, value: float):
-        self.add_float32(KEY_ROPE_SCALE.format(llm=llm), value)
+    def add_rope_scale(self, value: float):
+        self.add_float32(KEY_ROPE_SCALE.format(arch=self.arch), value)

     def add_tokenizer_model(self, model: str):
         self.add_string(KEY_TOKENIZER_MODEL, model)
@@ -619,9 +617,8 @@
 # Example usage:
 if __name__ == "__main__":
     # Example usage with a file
-    gguf_writer = GGUFWriter.open("example.gguf")
-    gguf_writer.add_architecture("llama")
+    gguf_writer = GGUFWriter("example.gguf", "llama")

     gguf_writer.add_uint32("answer", 42)  # Write a 32-bit integer
     gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
     gguf_writer.add_custom_alignment(64)
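With the refactoring, the example can also exercise the per-architecture setters without an explicit arch argument. A short, illustrative continuation (the values are made up; only methods that appear in this diff are used):

    gguf_writer.add_context_length(2048)      # key "llama.context_length"
    gguf_writer.add_embedding_length(4096)    # key "llama.embedding_length"
    gguf_writer.add_head_count(32)            # key "llama.attention.head_count"
    gguf_writer.add_layer_norm_rms_eps(1e-5)  # key "llama.attention.layer_norm_rms_epsilon"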