Merge branch 'master' into gg/add-tokenizer-test-script
This commit is contained in:
commit
26f606efed
29 changed files with 677 additions and 533 deletions
3
.flake8
3
.flake8
|
@ -1,3 +1,4 @@
|
|||
[flake8]
|
||||
max-line-length = 125
|
||||
ignore = W503
|
||||
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
|
||||
exclude = examples/*,examples/*/**,*/**/__init__.py
|
||||
|
|
3
.github/workflows/python-lint.yml
vendored
3
.github/workflows/python-lint.yml
vendored
|
@ -20,5 +20,4 @@ jobs:
|
|||
- name: flake8 Lint
|
||||
uses: py-actions/flake8@v2
|
||||
with:
|
||||
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
|
||||
exclude: "examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py"
|
||||
plugins: "flake8-no-print"
|
||||
|
|
|
@ -3,13 +3,14 @@
|
|||
exclude: prompts/.*.txt
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v3.2.0
|
||||
rev: v4.6.0
|
||||
hooks:
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- id: check-yaml
|
||||
- id: check-added-large-files
|
||||
- repo: https://github.com/PyCQA/flake8
|
||||
rev: 6.0.0
|
||||
rev: 7.0.0
|
||||
hooks:
|
||||
- id: flake8
|
||||
additional_dependencies: [flake8-no-print]
|
||||
|
|
|
@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
|||
// INTERNAL, DO NOT USE
|
||||
// USE LOG() INSTEAD
|
||||
//
|
||||
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
|
||||
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
|
||||
#define LOG_IMPL(str, ...) \
|
||||
do { \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
|
@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
|||
// INTERNAL, DO NOT USE
|
||||
// USE LOG_TEE() INSTEAD
|
||||
//
|
||||
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
|
||||
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
|
||||
#define LOG_TEE_IMPL(str, ...) \
|
||||
do { \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
# TODO: automate the update of convert-hf-to-gguf.py
|
||||
#
|
||||
|
||||
import logging
|
||||
import os
|
||||
import requests
|
||||
import sys
|
||||
|
@ -28,12 +29,17 @@ import json
|
|||
|
||||
from hashlib import sha256
|
||||
from enum import IntEnum, auto
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
logger = logging.getLogger("convert-hf-to-gguf-update")
|
||||
|
||||
|
||||
class TOKENIZER_TYPE(IntEnum):
|
||||
SPM = auto()
|
||||
BPE = auto()
|
||||
WPM = auto()
|
||||
|
||||
|
||||
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
||||
# will be updated with time - contributions welcome
|
||||
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||
|
@ -41,37 +47,39 @@ chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶
|
|||
if len(sys.argv) == 2:
|
||||
token = sys.argv[1]
|
||||
else:
|
||||
print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
|
||||
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
|
||||
sys.exit(1)
|
||||
|
||||
# TODO: add models here, base models preferred
|
||||
models = [
|
||||
{ "name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
|
||||
{ "name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
|
||||
{ "name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
|
||||
{ "name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
|
||||
{ "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
||||
{ "name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
|
||||
{ "name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
|
||||
{ "name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
||||
{ "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
||||
{ "name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
||||
{ "name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
||||
]
|
||||
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
|
||||
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
|
||||
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
|
||||
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
|
||||
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
||||
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
|
||||
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
|
||||
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
||||
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
||||
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
||||
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
||||
]
|
||||
|
||||
# make directory "models/tokenizers" if it doesn't exist
|
||||
if not os.path.exists("models/tokenizers"):
|
||||
os.makedirs("models/tokenizers")
|
||||
|
||||
|
||||
def download_file_with_auth(url, token, save_path):
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
response = requests.get(url, headers=headers)
|
||||
if response.status_code == 200:
|
||||
with open(save_path, 'wb') as f:
|
||||
f.write(response.content)
|
||||
print(f"File {save_path} downloaded successfully")
|
||||
logger.info(f"File {save_path} downloaded successfully")
|
||||
else:
|
||||
print(f"Failed to download file. Status code: {response.status_code}")
|
||||
logger.info(f"Failed to download file. Status code: {response.status_code}")
|
||||
|
||||
|
||||
# download the tokenizer models
|
||||
for model in models:
|
||||
|
@ -82,10 +90,10 @@ for model in models:
|
|||
if not os.path.exists(f"models/tokenizers/{name}"):
|
||||
os.makedirs(f"models/tokenizers/{name}")
|
||||
else:
|
||||
print(f"Directory models/tokenizers/{name} already exists - skipping")
|
||||
logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
|
||||
continue
|
||||
|
||||
print(f"Downloading {name} to models/tokenizers/{name}")
|
||||
logger.info(f"Downloading {name} to models/tokenizers/{name}")
|
||||
|
||||
url = f"{repo}/raw/main/config.json"
|
||||
save_path = f"models/tokenizers/{name}/config.json"
|
||||
|
@ -116,76 +124,76 @@ for model in models:
|
|||
continue
|
||||
|
||||
# create the tokenizer
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
|
||||
|
||||
chktok = tokenizer.encode(chktxt)
|
||||
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
||||
|
||||
print(f"model: {name}")
|
||||
print(f"tokt: {tokt}")
|
||||
print(f"repo: {model['repo']}")
|
||||
print(f"chktok: {chktok}")
|
||||
print(f"chkhsh: {chkhsh}")
|
||||
logger.info(f"model: {name}")
|
||||
logger.info(f"tokt: {tokt}")
|
||||
logger.info(f"repo: {model['repo']}")
|
||||
logger.info(f"chktok: {chktok}")
|
||||
logger.info(f"chkhsh: {chkhsh}")
|
||||
|
||||
# print the "pre_tokenizer" content from the tokenizer.json
|
||||
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
cfg = json.load(f)
|
||||
pre_tokenizer = cfg["pre_tokenizer"]
|
||||
print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
|
||||
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
|
||||
|
||||
print(f"\n")
|
||||
logger.info("")
|
||||
|
||||
src_ifs += f" if chkhsh == \"{chkhsh}\":\n"
|
||||
src_ifs += f" # ref: {model['repo']}\n"
|
||||
src_ifs += f" res = \"{name}\"\n"
|
||||
|
||||
src_func = ""
|
||||
src_func += " def get_vocab_base_pre(self, tokenizer) -> str:\n"
|
||||
src_func += " # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
|
||||
src_func += " # is specific for the BPE pre-tokenizer used by the model\n"
|
||||
src_func += " # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
|
||||
src_func += " # use in llama.cpp to implement the same pre-tokenizer\n"
|
||||
src_func += "\n"
|
||||
src_func += f" chktxt = {repr(chktxt)}\n"
|
||||
src_func += "\n"
|
||||
src_func += " chktok = tokenizer.encode(chktxt)\n"
|
||||
src_func += " chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
|
||||
src_func += "\n"
|
||||
src_func += " print(f\"chktok: {chktok}\")\n"
|
||||
src_func += " print(f\"chkhsh: {chkhsh}\")\n"
|
||||
src_func += "\n"
|
||||
src_func += " res = None\n"
|
||||
src_func += "\n"
|
||||
src_func += " # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
|
||||
src_func += " # or pull the latest version of the model from Huggingface\n"
|
||||
src_func += " # don't edit the hashes manually!\n"
|
||||
src_func += f"{src_ifs}\n"
|
||||
src_func += " if res is None:\n"
|
||||
src_func += " print(\"\\n\")\n"
|
||||
src_func += " print(\"**************************************************************************************\")\n"
|
||||
src_func += " print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
|
||||
src_func += " print(\"** There are 2 possible reasons for this:\")\n"
|
||||
src_func += " print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
|
||||
src_func += " print(\"** - the pre-tokenization config has changed upstream\")\n"
|
||||
src_func += " print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
|
||||
src_func += " print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
|
||||
src_func += " print(\"**\")\n"
|
||||
src_func += " print(f\"** chkhsh: {chkhsh}\")\n"
|
||||
src_func += " print(\"**************************************************************************************\")\n"
|
||||
src_func += " print(\"\\n\")\n"
|
||||
src_func += " raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
|
||||
src_func += "\n"
|
||||
src_func += " print(f\"tokenizer.ggml.pre: {res}\")\n"
|
||||
src_func += " print(f\"chkhsh: {chkhsh}\")\n"
|
||||
src_func += "\n"
|
||||
src_func += " return res\n"
|
||||
src_func = f"""
|
||||
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||
# is specific for the BPE pre-tokenizer used by the model
|
||||
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
||||
# use in llama.cpp to implement the same pre-tokenizer
|
||||
|
||||
print(src_func)
|
||||
chktxt = {repr(chktxt)}
|
||||
|
||||
print("\n")
|
||||
print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
|
||||
print("\n")
|
||||
chktok = tokenizer.encode(chktxt)
|
||||
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
||||
|
||||
print(f"chktok: {{chktok}}")
|
||||
print(f"chkhsh: {{chkhsh}}")
|
||||
|
||||
res = None
|
||||
|
||||
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
|
||||
# or pull the latest version of the model from Huggingface
|
||||
# don't edit the hashes manually!
|
||||
{src_ifs}
|
||||
if res is None:
|
||||
print("\\n")
|
||||
print("**************************************************************************************")
|
||||
print("** WARNING: The BPE pre-tokenizer was not recognized!")
|
||||
print("** There are 2 possible reasons for this:")
|
||||
print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
|
||||
print("** - the pre-tokenization config has changed upstream")
|
||||
print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
|
||||
print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||
print("**")
|
||||
print(f"** chkhsh: {{chkhsh}}")
|
||||
print("**************************************************************************************")
|
||||
print("\\n")
|
||||
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
|
||||
|
||||
print(f"tokenizer.ggml.pre: {{repr(res)}}")
|
||||
print(f"chkhsh: {{chkhsh}}")
|
||||
|
||||
return res
|
||||
"""
|
||||
|
||||
print(src_func) # noqa: NP100
|
||||
|
||||
logger.info("\n")
|
||||
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
|
||||
logger.info("\n")
|
||||
|
||||
# generate tests for each tokenizer model
|
||||
|
||||
|
@ -253,7 +261,6 @@ for model in models:
|
|||
tokt = model["tokt"]
|
||||
|
||||
# create the tokenizer
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
|
||||
|
||||
with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
|
||||
|
@ -268,15 +275,15 @@ for model in models:
|
|||
f.write(f" {r}")
|
||||
f.write("\n")
|
||||
|
||||
print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
|
||||
logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
|
||||
|
||||
# generate commands for creating vocab files
|
||||
|
||||
print("\nRun the following commands to generate the vocab files for testing:\n")
|
||||
logger.info("\nRun the following commands to generate the vocab files for testing:\n")
|
||||
|
||||
for model in models:
|
||||
name = model["name"]
|
||||
|
||||
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
|
||||
logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
|
||||
|
||||
print("\n")
|
||||
logger.info("\n")
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import contextlib
|
||||
import json
|
||||
|
@ -26,6 +27,8 @@ import gguf
|
|||
|
||||
from convert import LlamaHfVocab, permute
|
||||
|
||||
logger = logging.getLogger("hf-to-gguf")
|
||||
|
||||
|
||||
###### MODEL DEFINITIONS ######
|
||||
|
||||
|
@ -76,7 +79,7 @@ class Model(ABC):
|
|||
|
||||
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
||||
for part_name in self.part_names:
|
||||
print(f"gguf: loading model part '{part_name}'")
|
||||
logger.info(f"gguf: loading model part '{part_name}'")
|
||||
ctx: ContextManager[Any]
|
||||
if self.is_safetensors:
|
||||
from safetensors import safe_open
|
||||
|
@ -95,42 +98,42 @@ class Model(ABC):
|
|||
|
||||
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
|
||||
self.gguf_writer.add_context_length(n_ctx)
|
||||
print(f"gguf: context length = {n_ctx}")
|
||||
logger.info(f"gguf: context length = {n_ctx}")
|
||||
|
||||
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||
self.gguf_writer.add_embedding_length(n_embd)
|
||||
print(f"gguf: embedding length = {n_embd}")
|
||||
logger.info(f"gguf: embedding length = {n_embd}")
|
||||
|
||||
if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
|
||||
self.gguf_writer.add_feed_forward_length(n_ff)
|
||||
print(f"gguf: feed forward length = {n_ff}")
|
||||
logger.info(f"gguf: feed forward length = {n_ff}")
|
||||
|
||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||
self.gguf_writer.add_head_count(n_head)
|
||||
print(f"gguf: head count = {n_head}")
|
||||
logger.info(f"gguf: head count = {n_head}")
|
||||
|
||||
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
|
||||
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||
print(f"gguf: key-value head count = {n_head_kv}")
|
||||
logger.info(f"gguf: key-value head count = {n_head_kv}")
|
||||
|
||||
if (rope_theta := self.hparams.get("rope_theta")) is not None:
|
||||
self.gguf_writer.add_rope_freq_base(rope_theta)
|
||||
print(f"gguf: rope theta = {rope_theta}")
|
||||
logger.info(f"gguf: rope theta = {rope_theta}")
|
||||
if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
|
||||
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
|
||||
print(f"gguf: rms norm epsilon = {f_rms_eps}")
|
||||
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
|
||||
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
|
||||
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
|
||||
print(f"gguf: layer norm epsilon = {f_norm_eps}")
|
||||
logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
|
||||
if (n_experts := self.hparams.get("num_local_experts")) is not None:
|
||||
self.gguf_writer.add_expert_count(n_experts)
|
||||
print(f"gguf: expert count = {n_experts}")
|
||||
logger.info(f"gguf: expert count = {n_experts}")
|
||||
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
|
||||
self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||
print(f"gguf: experts used count = {n_experts_used}")
|
||||
logger.info(f"gguf: experts used count = {n_experts_used}")
|
||||
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
print(f"gguf: file type = {self.ftype}")
|
||||
logger.info(f"gguf: file type = {self.ftype}")
|
||||
|
||||
def write_tensors(self):
|
||||
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||
|
@ -151,8 +154,7 @@ class Model(ABC):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -169,7 +171,7 @@ class Model(ABC):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -274,8 +276,8 @@ class Model(ABC):
|
|||
chktok = tokenizer.encode(chktxt)
|
||||
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
||||
|
||||
print(f"chktok: {chktok}")
|
||||
print(f"chkhsh: {chkhsh}")
|
||||
logger.debug(f"chktok: {chktok}")
|
||||
logger.debug(f"chkhsh: {chkhsh}")
|
||||
|
||||
res = None
|
||||
|
||||
|
@ -311,22 +313,22 @@ class Model(ABC):
|
|||
res = "refact"
|
||||
|
||||
if res is None:
|
||||
print("\n")
|
||||
print("**************************************************************************************")
|
||||
print("** WARNING: The BPE pre-tokenizer was not recognized!")
|
||||
print("** There are 2 possible reasons for this:")
|
||||
print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
|
||||
print("** - the pre-tokenization config has changed upstream")
|
||||
print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
|
||||
print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||
print("**")
|
||||
print(f"** chkhsh: {chkhsh}")
|
||||
print("**************************************************************************************")
|
||||
print("\n")
|
||||
logger.warning("\n")
|
||||
logger.warning("**************************************************************************************")
|
||||
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
|
||||
logger.warning("** There are 2 possible reasons for this:")
|
||||
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
|
||||
logger.warning("** - the pre-tokenization config has changed upstream")
|
||||
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
|
||||
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||
logger.warning("**")
|
||||
logger.warning(f"** chkhsh: {chkhsh}")
|
||||
logger.warning("**************************************************************************************")
|
||||
logger.warning("\n")
|
||||
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
|
||||
|
||||
print(f"tokenizer.ggml.pre: {res}")
|
||||
print(f"chkhsh: {chkhsh}")
|
||||
logger.debug(f"tokenizer.ggml.pre: {res}")
|
||||
logger.debug(f"chkhsh: {chkhsh}")
|
||||
|
||||
return res
|
||||
|
||||
|
@ -442,9 +444,7 @@ class Model(ABC):
|
|||
|
||||
if vocab_size > len(tokens):
|
||||
pad_count = vocab_size - len(tokens)
|
||||
print(
|
||||
f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]"
|
||||
)
|
||||
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
||||
for i in range(1, pad_count + 1):
|
||||
tokens.append(f"[PAD{i}]")
|
||||
scores.append(-1000.0)
|
||||
|
@ -556,7 +556,7 @@ class BloomModel(Model):
|
|||
),
|
||||
axis=0,
|
||||
)
|
||||
print("re-format attention.linear_qkv.weight")
|
||||
logger.info("re-format attention.linear_qkv.weight")
|
||||
elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
|
||||
qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
|
||||
data = np.concatenate(
|
||||
|
@ -567,13 +567,12 @@ class BloomModel(Model):
|
|||
),
|
||||
axis=0,
|
||||
)
|
||||
print("re-format attention.linear_qkv.bias")
|
||||
logger.info("re-format attention.linear_qkv.bias")
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -590,13 +589,13 @@ class BloomModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
if not has_lm_head and name == "word_embeddings.weight":
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
|
||||
@Model.register("MPTForCausalLM")
|
||||
|
@ -656,8 +655,7 @@ class MPTModel(Model):
|
|||
else:
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -674,7 +672,7 @@ class MPTModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -700,8 +698,7 @@ class OrionModel(Model):
|
|||
elif "model_max_length" in self.hparams:
|
||||
ctx_length = self.hparams["model_max_length"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
sys.exit()
|
||||
raise ValueError("gguf: can not find ctx length parameter.")
|
||||
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
|
@ -739,8 +736,7 @@ class OrionModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -757,7 +753,7 @@ class OrionModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
|
@ -782,8 +778,7 @@ class BaichuanModel(Model):
|
|||
elif "model_max_length" in self.hparams:
|
||||
ctx_length = self.hparams["model_max_length"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
sys.exit()
|
||||
raise ValueError("gguf: can not find ctx length parameter.")
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_source_hf_repo(hf_repo)
|
||||
|
@ -812,7 +807,7 @@ class BaichuanModel(Model):
|
|||
|
||||
for i in range(block_count):
|
||||
if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
|
||||
print(f"Unpacking and permuting layer {i}")
|
||||
logger.info(f"Unpacking and permuting layer {i}")
|
||||
model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
|
||||
self._reverse_hf_permute_part(w, 0, head_count, head_count)
|
||||
model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
|
||||
|
@ -837,8 +832,7 @@ class BaichuanModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -855,7 +849,7 @@ class BaichuanModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
||||
|
@ -940,8 +934,7 @@ class XverseModel(Model):
|
|||
elif "model_max_length" in self.hparams:
|
||||
ctx_length = self.hparams["model_max_length"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
sys.exit()
|
||||
raise ValueError("gguf: can not find ctx length parameter.")
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_source_hf_repo(hf_repo)
|
||||
|
@ -990,8 +983,7 @@ class XverseModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -1008,7 +1000,7 @@ class XverseModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
||||
|
@ -1095,8 +1087,7 @@ class FalconModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -1113,7 +1104,7 @@ class FalconModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -1200,8 +1191,7 @@ class RefactModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -1218,7 +1208,7 @@ class RefactModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -1267,10 +1257,9 @@ class PersimmonModel(Model):
|
|||
data = data_torch.to(torch.float32).squeeze().numpy()
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
n_dims = len(data.shape)
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
|
@ -1335,8 +1324,7 @@ class StableLMModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -1353,7 +1341,7 @@ class StableLMModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -1369,8 +1357,7 @@ class StableLMModel(Model):
|
|||
merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
|
||||
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
|
||||
data = data.astype(np.float32)
|
||||
|
||||
|
@ -1378,7 +1365,7 @@ class StableLMModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
|
||||
logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -1430,7 +1417,7 @@ class LlamaModel(Model):
|
|||
experts = dict()
|
||||
for name, data_torch in self.get_tensors():
|
||||
# we don't need these
|
||||
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
|
||||
if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
|
||||
continue
|
||||
|
||||
old_dtype = data_torch.dtype
|
||||
|
@ -1483,10 +1470,9 @@ class LlamaModel(Model):
|
|||
|
||||
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
continue
|
||||
|
@ -1494,8 +1480,7 @@ class LlamaModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -1512,7 +1497,7 @@ class LlamaModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -1587,10 +1572,9 @@ class GrokModel(Model):
|
|||
|
||||
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
continue
|
||||
|
@ -1598,8 +1582,7 @@ class GrokModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -1616,7 +1599,7 @@ class GrokModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -1649,7 +1632,7 @@ class DbrxModel(Model):
|
|||
self.gguf_writer.add_layer_norm_eps(1e-5)
|
||||
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
print(f"gguf: file type = {self.ftype}")
|
||||
logger.info(f"gguf: file type = {self.ftype}")
|
||||
|
||||
def write_tensors(self):
|
||||
block_count = self.hparams.get("n_layers")
|
||||
|
@ -1692,8 +1675,7 @@ class DbrxModel(Model):
|
|||
# https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
|
||||
new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -1701,8 +1683,7 @@ class DbrxModel(Model):
|
|||
# Most of the codebase that takes in 1D tensors only handles F32 tensors
|
||||
# and most of the outputs tensors are F32.
|
||||
if data_dtype != np.float32 and n_dims == 1:
|
||||
print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}: all 1D tensors must be F32")
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if self.ftype == 0 and data_dtype == np.float16:
|
||||
|
@ -1712,7 +1693,7 @@ class DbrxModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -1774,8 +1755,7 @@ class MiniCPMModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -1792,7 +1772,7 @@ class MiniCPMModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -1858,8 +1838,7 @@ class QwenModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -1876,7 +1855,7 @@ class QwenModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
|
@ -1953,10 +1932,9 @@ class Qwen2MoeModel(Model):
|
|||
|
||||
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
|
||||
logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
continue
|
||||
|
@ -1964,8 +1942,7 @@ class Qwen2MoeModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -1982,7 +1959,7 @@ class Qwen2MoeModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -2027,8 +2004,7 @@ class GPT2Model(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -2045,13 +2021,13 @@ class GPT2Model(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
# note: GPT2 output is tied to (same as) wte in original model
|
||||
if new_name == "token_embd.weight":
|
||||
print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
|
||||
|
||||
|
@ -2090,8 +2066,7 @@ class Phi3MiniModel(Model):
|
|||
tokenizer_path = self.dir_model / 'tokenizer.model'
|
||||
|
||||
if not tokenizer_path.is_file():
|
||||
print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
raise ValueError(f'Error: Missing {tokenizer_path}')
|
||||
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_path))
|
||||
|
||||
|
@ -2129,7 +2104,7 @@ class Phi3MiniModel(Model):
|
|||
for key in added_tokens_json:
|
||||
token_id = added_tokens_json[key]
|
||||
if (token_id >= vocab_size):
|
||||
print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||
continue
|
||||
|
||||
tokens[token_id] = key.encode("utf-8")
|
||||
|
@ -2211,8 +2186,7 @@ class PlamoModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
# shuffle for broadcasting of gqa in ggml_mul_mat
|
||||
if new_name.endswith("attn_q.weight"):
|
||||
|
@ -2243,7 +2217,7 @@ class PlamoModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -2289,8 +2263,7 @@ class CodeShellModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -2307,13 +2280,13 @@ class CodeShellModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
if not has_lm_head and name == "transformer.wte.weight":
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
|
||||
@Model.register("InternLM2ForCausalLM")
|
||||
|
@ -2335,7 +2308,7 @@ class InternLM2Model(Model):
|
|||
toktypes: list[int] = []
|
||||
|
||||
if not tokenizer_path.is_file():
|
||||
print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
|
||||
logger.error(f'Error: Missing {tokenizer_path}')
|
||||
sys.exit(1)
|
||||
|
||||
sentencepiece_model = model.ModelProto()
|
||||
|
@ -2352,7 +2325,7 @@ class InternLM2Model(Model):
|
|||
if text == b"\x00":
|
||||
# (TODO): fixme
|
||||
# Hack here and replace the \x00 characters.
|
||||
print(f"InternLM2 convert token '{text}' to '🐉'!")
|
||||
logger.debug(f"InternLM2 convert token '{text}' to '🐉'!")
|
||||
text = "🐉"
|
||||
|
||||
toktype = SentencePieceTokenTypes.NORMAL
|
||||
|
@ -2393,7 +2366,7 @@ class InternLM2Model(Model):
|
|||
# TODO: this is a hack, should be fixed
|
||||
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
|
||||
special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
|
||||
print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
|
||||
logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
|
||||
in chat mode so that the conversation can end normally.")
|
||||
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
@ -2438,8 +2411,7 @@ in chat mode so that the conversation can end normally.")
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -2456,7 +2428,7 @@ in chat mode so that the conversation can end normally.")
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
def write_tensors(self):
|
||||
|
@ -2567,8 +2539,7 @@ class BertModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
|
@ -2588,7 +2559,7 @@ class BertModel(Model):
|
|||
# if f32 desired, convert any float16 to float32
|
||||
new_dtype = np.float32
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
|
||||
|
||||
if data.dtype != new_dtype:
|
||||
data = data.astype(new_dtype)
|
||||
|
@ -2667,7 +2638,7 @@ class GemmaModel(Model):
|
|||
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
||||
# To prevent errors, skip loading lm_head.weight.
|
||||
if name == "lm_head.weight":
|
||||
print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||
continue
|
||||
|
||||
old_dtype = data_torch.dtype
|
||||
|
@ -2684,8 +2655,7 @@ class GemmaModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -2696,7 +2666,7 @@ class GemmaModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -2724,7 +2694,7 @@ class MambaModel(Model):
|
|||
else:
|
||||
# Use the GPT-NeoX tokenizer when no tokenizer files are present
|
||||
tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
|
||||
print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
|
||||
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
|
||||
neox_reader = gguf.GGUFReader(tokenizer_path, "r")
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
|
||||
|
@ -2796,17 +2766,16 @@ class MambaModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
if name.endswith(".A_log"):
|
||||
print("A_log --> A ==> " + new_name)
|
||||
logger.debug("A_log --> A ==> " + new_name)
|
||||
data_torch = -torch.exp(data_torch)
|
||||
|
||||
# assuming token_embd.weight is seen before output.weight
|
||||
if tok_embd is not None and new_name == output_name:
|
||||
if torch.equal(tok_embd, data_torch):
|
||||
print(f"{output_name} is equivalent to {tok_embd_name}, omitting")
|
||||
logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting")
|
||||
continue
|
||||
if new_name == tok_embd_name:
|
||||
tok_embd = data_torch
|
||||
|
@ -2829,7 +2798,7 @@ class MambaModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -2888,8 +2857,7 @@ class OlmoModel(Model):
|
|||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
@ -2906,7 +2874,7 @@ class OlmoModel(Model):
|
|||
if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
@ -2939,6 +2907,7 @@ def parse_args() -> argparse.Namespace:
|
|||
)
|
||||
parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
|
||||
parser.add_argument("--model-name", type=str, default=None, help="name of the model")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
@ -2946,6 +2915,8 @@ def parse_args() -> argparse.Namespace:
|
|||
def main() -> None:
|
||||
args = parse_args()
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
dir_model = args.model
|
||||
|
||||
if args.awq_path:
|
||||
|
@ -2954,15 +2925,15 @@ def main() -> None:
|
|||
tmp_model_path = args.model / "weighted_model"
|
||||
dir_model = tmp_model_path
|
||||
if tmp_model_path.is_dir():
|
||||
print(f"{tmp_model_path} exists as a weighted model.")
|
||||
logger.info(f"{tmp_model_path} exists as a weighted model.")
|
||||
else:
|
||||
tmp_model_path.mkdir(parents=True, exist_ok=True)
|
||||
print("Saving new weighted model ...")
|
||||
logger.info("Saving new weighted model ...")
|
||||
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
|
||||
print(f"Saved weighted model at {tmp_model_path}.")
|
||||
logger.info(f"Saved weighted model at {tmp_model_path}.")
|
||||
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file=sys.stderr)
|
||||
logger.error(f'Error: {args.model} is not a directory')
|
||||
sys.exit(1)
|
||||
|
||||
ftype_map = {
|
||||
|
@ -2976,7 +2947,7 @@ def main() -> None:
|
|||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
|
||||
|
||||
print(f"Loading model: {dir_model.name}")
|
||||
logger.info(f"Loading model: {dir_model.name}")
|
||||
|
||||
hparams = Model.load_hparams(dir_model)
|
||||
|
||||
|
@ -2984,20 +2955,20 @@ def main() -> None:
|
|||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
|
||||
|
||||
print("Set model parameters")
|
||||
logger.info("Set model parameters")
|
||||
model_instance.set_gguf_parameters()
|
||||
|
||||
print("Set model tokenizer")
|
||||
logger.info("Set model tokenizer")
|
||||
model_instance.set_vocab()
|
||||
|
||||
if args.vocab_only:
|
||||
print(f"Exporting model vocab to '{fname_out}'")
|
||||
logger.info(f"Exporting model vocab to '{fname_out}'")
|
||||
model_instance.write_vocab()
|
||||
else:
|
||||
print(f"Exporting model to '{fname_out}'")
|
||||
logger.info(f"Exporting model to '{fname_out}'")
|
||||
model_instance.write()
|
||||
|
||||
print(f"Model successfully exported to '{fname_out}'")
|
||||
logger.info(f"Model successfully exported to '{fname_out}'")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import os
|
||||
import struct
|
||||
|
@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
|||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||
import gguf
|
||||
|
||||
logger = logging.getLogger("ggml-to-gguf")
|
||||
|
||||
|
||||
class GGMLFormat(IntEnum):
|
||||
GGML = 0
|
||||
|
@ -125,7 +128,6 @@ class Tensor:
|
|||
self.start_offset = offset
|
||||
self.len_bytes = n_bytes
|
||||
offset += n_bytes
|
||||
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
|
||||
return offset - orig_offset
|
||||
|
||||
|
||||
|
@ -175,7 +177,7 @@ class GGMLModel:
|
|||
offset += self.validate_header(data, offset)
|
||||
hp = Hyperparameters()
|
||||
offset += hp.load(data, offset)
|
||||
print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
|
||||
logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
|
||||
self.validate_conversion(hp.ftype)
|
||||
vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
|
||||
offset += vocab.load(data, offset, hp.n_vocab)
|
||||
|
@ -215,12 +217,12 @@ class GGMLToGGUF:
|
|||
if float(hp.n_head) / float(x) == gqa:
|
||||
n_kv_head = x
|
||||
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
|
||||
print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
|
||||
logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
|
||||
self.n_kv_head = n_kv_head
|
||||
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
|
||||
|
||||
def save(self):
|
||||
print('* Preparing to save GGUF file')
|
||||
logger.info('* Preparing to save GGUF file')
|
||||
gguf_writer = gguf.GGUFWriter(
|
||||
self.cfg.output,
|
||||
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
|
||||
|
@ -230,11 +232,11 @@ class GGMLToGGUF:
|
|||
if self.special_vocab is not None:
|
||||
self.special_vocab.add_to_gguf(gguf_writer)
|
||||
self.add_tensors(gguf_writer)
|
||||
print(" gguf: write header")
|
||||
logger.info(" gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print(" gguf: write metadata")
|
||||
logger.info(" gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print(" gguf: write tensors")
|
||||
logger.info(" gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
gguf_writer.close()
|
||||
|
||||
|
@ -250,7 +252,7 @@ class GGMLToGGUF:
|
|||
name = cfg.name if cfg.name is not None else cfg.input.name
|
||||
except UnicodeDecodeError:
|
||||
name = None
|
||||
print('* Adding model parameters and KV items')
|
||||
logger.info('* Adding model parameters and KV items')
|
||||
if name is not None:
|
||||
gguf_writer.add_name(name)
|
||||
gguf_writer.add_description(desc)
|
||||
|
@ -287,7 +289,7 @@ class GGMLToGGUF:
|
|||
toktypes = []
|
||||
if self.vocab_override is not None:
|
||||
vo = self.vocab_override
|
||||
print('* Adding vocab item(s)')
|
||||
logger.info('* Adding vocab item(s)')
|
||||
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
|
||||
tokens.append(vbytes)
|
||||
scores.append(score)
|
||||
|
@ -299,7 +301,7 @@ class GGMLToGGUF:
|
|||
if len(toktypes) > 0:
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
return
|
||||
print(f'* Adding {hp.n_vocab} vocab item(s)')
|
||||
logger.info(f'* Adding {hp.n_vocab} vocab item(s)')
|
||||
assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
|
||||
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
|
||||
tt = 1 # Normal
|
||||
|
@ -334,7 +336,7 @@ class GGMLToGGUF:
|
|||
def add_tensors(self, gguf_writer):
|
||||
tensor_map = self.name_map
|
||||
data = self.data
|
||||
print(f'* Adding {len(self.model.tensors)} tensor(s)')
|
||||
logger.info(f'* Adding {len(self.model.tensors)} tensor(s)')
|
||||
for tensor in self.model.tensors:
|
||||
name = str(tensor.name, 'UTF-8')
|
||||
mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
|
@ -344,7 +346,6 @@ class GGMLToGGUF:
|
|||
temp = tempdims[1]
|
||||
tempdims[1] = tempdims[0]
|
||||
tempdims[0] = temp
|
||||
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
|
||||
gguf_writer.add_tensor(
|
||||
mapped_name,
|
||||
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
|
||||
|
@ -401,33 +402,35 @@ def handle_args():
|
|||
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
|
||||
parser.add_argument("--vocabtype", default="spm,hfft",
|
||||
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
cfg = handle_args()
|
||||
print(f'* Using config: {cfg}')
|
||||
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
|
||||
logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO)
|
||||
logger.info(f'* Using config: {cfg}')
|
||||
logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===')
|
||||
if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
|
||||
print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
|
||||
logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
|
||||
data = np.memmap(cfg.input, mode = 'r')
|
||||
model = GGMLModel()
|
||||
print('* Scanning GGML input file')
|
||||
logger.info('* Scanning GGML input file')
|
||||
offset = model.load(data, 0) # noqa
|
||||
print(f'* GGML model hyperparameters: {model.hyperparameters}')
|
||||
logger.info(f'* GGML model hyperparameters: {model.hyperparameters}')
|
||||
vocab_override = None
|
||||
params_override = None
|
||||
special_vocab = None
|
||||
if cfg.model_metadata_dir is not None:
|
||||
(params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
|
||||
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
|
||||
print(f'* Overriding params: {params_override}')
|
||||
print(f'* Overriding vocab: {vocab_override}')
|
||||
print(f'* Special vocab: {special_vocab}')
|
||||
logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
|
||||
logger.info(f'* Overriding params: {params_override}')
|
||||
logger.info(f'* Overriding vocab: {vocab_override}')
|
||||
logger.info(f'* Special vocab: {special_vocab}')
|
||||
else:
|
||||
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
|
||||
logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
|
||||
if model.file_format == GGMLFormat.GGML:
|
||||
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
|
||||
logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
|
||||
converter = GGMLToGGUF(
|
||||
model, data, cfg,
|
||||
params_override = params_override,
|
||||
|
@ -435,7 +438,7 @@ def main():
|
|||
special_vocab = special_vocab
|
||||
)
|
||||
converter.save()
|
||||
print(f'* Successful completion. Output saved to: {cfg.output}')
|
||||
logger.info(f'* Successful completion. Output saved to: {cfg.output}')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
|
@ -15,6 +16,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
|||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
logger = logging.getLogger("lora-to-gguf")
|
||||
|
||||
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
|
||||
|
||||
|
||||
|
@ -48,11 +51,9 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
|
|||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) < 2:
|
||||
print(f"Usage: python {sys.argv[0]} <path> [arch]")
|
||||
print(
|
||||
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
|
||||
)
|
||||
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
|
||||
logger.info(f"Usage: python {sys.argv[0]} <path> [arch]")
|
||||
logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'")
|
||||
logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
|
||||
sys.exit(1)
|
||||
|
||||
input_json = os.path.join(sys.argv[1], "adapter_config.json")
|
||||
|
@ -70,7 +71,7 @@ if __name__ == '__main__':
|
|||
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
|
||||
|
||||
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
|
||||
print(f"Error: unsupported architecture {arch_name}")
|
||||
logger.error(f"Error: unsupported architecture {arch_name}")
|
||||
sys.exit(1)
|
||||
|
||||
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
|
||||
|
@ -80,21 +81,21 @@ if __name__ == '__main__':
|
|||
params = json.load(f)
|
||||
|
||||
if params["peft_type"] != "LORA":
|
||||
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
|
||||
logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
|
||||
sys.exit(1)
|
||||
|
||||
if params["fan_in_fan_out"] is True:
|
||||
print("Error: param fan_in_fan_out is not supported")
|
||||
logger.error("Error: param fan_in_fan_out is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
if params["bias"] is not None and params["bias"] != "none":
|
||||
print("Error: param bias is not supported")
|
||||
logger.error("Error: param bias is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
# TODO: these seem to be layers that have been trained but without lora.
|
||||
# doesn't seem widely used but eventually should be supported
|
||||
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
|
||||
print("Error: param modules_to_save is not supported")
|
||||
logger.error("Error: param modules_to_save is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
with open(output_path, "wb") as fout:
|
||||
|
@ -125,13 +126,13 @@ if __name__ == '__main__':
|
|||
suffix = k[-len(lora_suffixes[0]):]
|
||||
k = k[: -len(lora_suffixes[0])]
|
||||
else:
|
||||
print(f"Error: unrecognized tensor name {orig_k}")
|
||||
logger.error(f"Error: unrecognized tensor name {orig_k}")
|
||||
sys.exit(1)
|
||||
|
||||
tname = name_map.get_name(k)
|
||||
if tname is None:
|
||||
print(f"Error: could not map tensor name {orig_k}")
|
||||
print(" Note: the arch parameter must be specified if the model is not llama")
|
||||
logger.error(f"Error: could not map tensor name {orig_k}")
|
||||
logger.error(" Note: the arch parameter must be specified if the model is not llama")
|
||||
sys.exit(1)
|
||||
|
||||
if suffix == ".lora_A.weight":
|
||||
|
@ -141,8 +142,8 @@ if __name__ == '__main__':
|
|||
else:
|
||||
assert False
|
||||
|
||||
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
|
||||
logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
|
||||
write_tensor_header(fout, tname, t.shape, t.dtype)
|
||||
t.tofile(fout)
|
||||
|
||||
print(f"Converted {input_json} and {input_model} to {output_path}")
|
||||
logger.info(f"Converted {input_json} and {input_model} to {output_path}")
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
|||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||
import gguf
|
||||
|
||||
logger = logging.getLogger("persimmon-to-gguf")
|
||||
|
||||
|
||||
def _flatten_dict(dct, tensors, prefix=None):
|
||||
assert isinstance(dct, dict)
|
||||
|
@ -30,9 +33,9 @@ def _flatten_dict(dct, tensors, prefix=None):
|
|||
|
||||
def _get_sentencepiece_tokenizer_info(dir_model: Path):
|
||||
tokenizer_path = dir_model / 'adept_vocab.model'
|
||||
print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
|
||||
logger.info('getting sentencepiece tokenizer from', tokenizer_path)
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_path))
|
||||
print('gguf: adding tokens')
|
||||
logger.info('adding tokens')
|
||||
tokens: list[bytes] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
@ -67,8 +70,10 @@ def main():
|
|||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
|
||||
parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
|
||||
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
|
||||
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
args = parser.parse_args()
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
sys.path.append(str(args.adept_inference_dir))
|
||||
persimmon_model = torch.load(args.ckpt_path)
|
||||
hparams = persimmon_model['args']
|
||||
|
@ -107,7 +112,7 @@ def main():
|
|||
gguf_writer.add_eos_token_id(71013)
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(arch, block_count)
|
||||
print(tensor_map)
|
||||
logger.info(tensor_map)
|
||||
for name in tensors.keys():
|
||||
data_torch = tensors[name]
|
||||
if name.endswith(".self_attention.rotary_emb.inv_freq"):
|
||||
|
@ -117,22 +122,21 @@ def main():
|
|||
data = data_torch.to(torch.float32).squeeze().numpy()
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
raise ValueError(f"Can not map tensor '{name}'")
|
||||
|
||||
n_dims = len(data.shape)
|
||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}")
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
print("gguf: write header")
|
||||
logger.info("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
logger.info("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print("gguf: write tensors")
|
||||
logger.info("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{args.outfile}'")
|
||||
print("")
|
||||
logger.info(f"gguf: model successfully exported to '{args.outfile}'")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
60
convert.py
60
convert.py
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import enum
|
||||
|
@ -35,6 +36,8 @@ import gguf
|
|||
if TYPE_CHECKING:
|
||||
from typing_extensions import Self, TypeAlias
|
||||
|
||||
logger = logging.getLogger("convert")
|
||||
|
||||
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
|
||||
faulthandler.register(signal.SIGUSR1)
|
||||
|
||||
|
@ -643,7 +646,6 @@ class LlamaHfVocab(Vocab):
|
|||
|
||||
|
||||
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
|
||||
# print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
n_head = n_head_kv
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
|
@ -1033,12 +1035,12 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)
|
|||
|
||||
# Check for a vocab size mismatch
|
||||
if params.n_vocab == vocab.vocab_size:
|
||||
print("Ignoring added_tokens.json since model matches vocab size without it.")
|
||||
logger.warning("Ignoring added_tokens.json since model matches vocab size without it.")
|
||||
return
|
||||
|
||||
if pad_vocab and params.n_vocab > vocab.vocab_size:
|
||||
pad_count = params.n_vocab - vocab.vocab_size
|
||||
print(
|
||||
logger.debug(
|
||||
f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
|
||||
)
|
||||
for i in range(1, pad_count + 1):
|
||||
|
@ -1166,7 +1168,7 @@ class OutputFile:
|
|||
elapsed = time.time() - start
|
||||
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
|
||||
padi = len(str(len(model)))
|
||||
print(
|
||||
logger.info(
|
||||
f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
|
||||
)
|
||||
self.gguf.write_tensor_data(ndarray)
|
||||
|
@ -1281,12 +1283,12 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
|
|||
# HF models permut or pack some of the tensors, so we need to undo that
|
||||
for i in itertools.count():
|
||||
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
|
||||
print(f"Permuting layer {i}")
|
||||
logger.debug(f"Permuting layer {i}")
|
||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
|
||||
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
|
||||
# tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
|
||||
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
|
||||
print(f"Unpacking and permuting layer {i}")
|
||||
logger.debug(f"Unpacking and permuting layer {i}")
|
||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
|
||||
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
|
||||
tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
|
||||
|
@ -1299,15 +1301,15 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
|
|||
tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
|
||||
if name_new is None:
|
||||
if skip_unknown:
|
||||
print(f"Unexpected tensor name: {name} - skipping")
|
||||
logger.warning(f"Unexpected tensor name: {name} - skipping")
|
||||
continue
|
||||
raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
|
||||
|
||||
if tensor_type in should_skip:
|
||||
print(f"skipping tensor {name_new}")
|
||||
logger.debug(f"skipping tensor {name_new}")
|
||||
continue
|
||||
|
||||
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
|
||||
logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
|
||||
out[name_new] = lazy_tensor
|
||||
|
||||
return out
|
||||
|
@ -1372,7 +1374,7 @@ def load_some_model(path: Path) -> ModelPlus:
|
|||
paths = find_multifile_paths(path)
|
||||
models_plus: list[ModelPlus] = []
|
||||
for path in paths:
|
||||
print(f"Loading model file {path}")
|
||||
logger.info(f"Loading model file {path}")
|
||||
models_plus.append(lazy_load_file(path))
|
||||
|
||||
model_plus = merge_multifile_models(models_plus)
|
||||
|
@ -1413,7 +1415,7 @@ class VocabFactory:
|
|||
else:
|
||||
raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
|
||||
|
||||
print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
|
||||
logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
|
||||
return vocab
|
||||
|
||||
def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
|
||||
|
@ -1438,19 +1440,19 @@ def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
|
|||
}[file_type]
|
||||
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
|
||||
if ret in model_paths:
|
||||
sys.stderr.write(
|
||||
logger.error(
|
||||
f"Error: Default output path ({ret}) would overwrite the input. "
|
||||
"Please explicitly specify a path using --outfile.\n")
|
||||
"Please explicitly specify a path using --outfile.")
|
||||
sys.exit(1)
|
||||
return ret
|
||||
|
||||
|
||||
def do_dump_model(model_plus: ModelPlus) -> None:
|
||||
print(f"model_plus.paths = {model_plus.paths!r}")
|
||||
print(f"model_plus.format = {model_plus.format!r}")
|
||||
print(f"model_plus.vocab = {model_plus.vocab!r}")
|
||||
print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100
|
||||
print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100
|
||||
print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100
|
||||
for name, lazy_tensor in model_plus.model.items():
|
||||
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
|
||||
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100
|
||||
|
||||
|
||||
def main(args_in: list[str] | None = None) -> None:
|
||||
|
@ -1473,8 +1475,18 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
|
||||
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
|
||||
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
|
||||
args = parser.parse_args(args_in)
|
||||
|
||||
if args.verbose:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
elif args.dump_single or args.dump:
|
||||
# Avoid printing anything besides the dump output
|
||||
logging.basicConfig(level=logging.WARNING)
|
||||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
if args.no_vocab and args.vocab_only:
|
||||
raise ValueError("--vocab-only does not make sense with --no-vocab")
|
||||
|
||||
|
@ -1491,6 +1503,7 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
if args.dump:
|
||||
do_dump_model(model_plus)
|
||||
return
|
||||
|
||||
endianess = gguf.GGUFEndian.LITTLE
|
||||
if args.big_endian:
|
||||
endianess = gguf.GGUFEndian.BIG
|
||||
|
@ -1513,7 +1526,7 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
"q8_0": GGMLFileType.MostlyQ8_0,
|
||||
}[args.outtype]
|
||||
|
||||
print(f"params = {params}")
|
||||
logger.info(f"params = {params}")
|
||||
|
||||
model_parent_path = model_plus.paths[0].parent
|
||||
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
|
||||
|
@ -1528,15 +1541,14 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
outfile = args.outfile
|
||||
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
|
||||
endianess=endianess, pad_vocab=args.pad_vocab)
|
||||
print(f"Wrote {outfile}")
|
||||
logger.info(f"Wrote {outfile}")
|
||||
return
|
||||
|
||||
if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
|
||||
vocab = model_plus.vocab
|
||||
|
||||
print(f"Vocab info: {vocab}")
|
||||
print(f"Special vocab info: {special_vocab}")
|
||||
|
||||
logger.info(f"Vocab info: {vocab}")
|
||||
logger.info(f"Special vocab info: {special_vocab}")
|
||||
model = model_plus.model
|
||||
model = convert_model_names(model, params, args.skip_unknown)
|
||||
ftype = pick_output_type(model, args.outtype)
|
||||
|
@ -1544,11 +1556,11 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
|
||||
|
||||
params.ftype = ftype
|
||||
print(f"Writing {outfile}, format {ftype}")
|
||||
logger.info(f"Writing {outfile}, format {ftype}")
|
||||
|
||||
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
|
||||
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
|
||||
print(f"Wrote {outfile}")
|
||||
logger.info(f"Wrote {outfile}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -544,7 +544,7 @@ int main(int argc, char ** argv) {
|
|||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
|
||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
|
|
|
@ -7,44 +7,16 @@ Feature: Results
|
|||
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
|
||||
And a model file test-model-00001-of-00003.gguf
|
||||
And 128 as batch size
|
||||
And 256 KV cache size
|
||||
And 1024 KV cache size
|
||||
And 128 max tokens to predict
|
||||
|
||||
Scenario Outline: Multi users completion
|
||||
Given <n_slots> slots
|
||||
And continuous batching
|
||||
|
||||
Scenario Outline: consistent results with same seed
|
||||
Given <n_slots> slots
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Given 42 as seed
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
|
||||
Given 42 as seed
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
|
||||
Given 42 as seed
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
|
||||
Given 42 as seed
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
|
||||
Given 42 as seed
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
|
||||
|
||||
Given concurrent completion requests
|
||||
Then the server is busy
|
||||
|
@ -55,3 +27,55 @@ Feature: Results
|
|||
| n_slots |
|
||||
| 1 |
|
||||
| 2 |
|
||||
|
||||
Scenario Outline: different results with different seed
|
||||
Given <n_slots> slots
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
|
||||
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43
|
||||
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44
|
||||
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45
|
||||
|
||||
Given concurrent completion requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
Then all predictions are different
|
||||
Examples:
|
||||
| n_slots |
|
||||
| 1 |
|
||||
| 2 |
|
||||
|
||||
Scenario Outline: consistent results with same seed and varying batch size
|
||||
Given 4 slots
|
||||
And <temp> temperature
|
||||
# And 0 as draft
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Given 1 prompts "Write a very long story about AI." with seed 42
|
||||
And concurrent completion requests
|
||||
# Then the server is busy # Not all slots will be utilized.
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
|
||||
Given <n_parallel> prompts "Write a very long story about AI." with seed 42
|
||||
And concurrent completion requests
|
||||
# Then the server is busy # Not all slots will be utilized.
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
|
||||
Then all predictions are equal
|
||||
Examples:
|
||||
| n_parallel | temp |
|
||||
| 1 | 0.0 |
|
||||
| 2 | 0.0 |
|
||||
| 4 | 0.0 |
|
||||
| 1 | 1.0 |
|
||||
# FIXME: These tests fail on master. The problem seems to be the unified KV cache.
|
||||
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
|
||||
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
|
||||
# | 2 | 1.0 |
|
||||
# | 4 | 1.0 |
|
||||
|
|
|
@ -65,6 +65,7 @@ def step_server_config(context, server_fqdn, server_port):
|
|||
context.server_seed = None
|
||||
context.user_api_key = None
|
||||
context.response_format = None
|
||||
context.temperature = None
|
||||
|
||||
context.tasks_result = []
|
||||
context.concurrent_tasks = []
|
||||
|
@ -232,15 +233,17 @@ async def step_all_slots_status(context, expected_slot_status_string):
|
|||
@async_run_until_complete
|
||||
async def step_request_completion(context, api_error):
|
||||
expect_api_error = api_error == 'raised'
|
||||
seeds = await completions_seed(context, num_seeds=1)
|
||||
completion = await request_completion(context.prompts.pop(),
|
||||
seeds[0] if seeds is not None else seeds,
|
||||
context.base_url,
|
||||
debug=context.debug,
|
||||
n_predict=context.n_predict,
|
||||
cache_prompt=context.cache_prompt,
|
||||
id_slot=context.id_slot,
|
||||
seed=await completions_seed(context),
|
||||
expect_api_error=expect_api_error,
|
||||
user_api_key=context.user_api_key)
|
||||
user_api_key=context.user_api_key,
|
||||
temperature=context.temperature)
|
||||
context.tasks_result.append(completion)
|
||||
if context.debug:
|
||||
print(f"Completion response: {completion}")
|
||||
|
@ -269,6 +272,15 @@ async def step_predictions_equal(context):
|
|||
context.tasks_result = []
|
||||
|
||||
|
||||
@step('all predictions are different')
|
||||
@async_run_until_complete
|
||||
async def step_predictions_equal(context):
|
||||
n_completions = await gather_tasks_results(context)
|
||||
assert n_completions >= 2, "need at least 2 completions"
|
||||
assert_all_predictions_different(context.tasks_result)
|
||||
context.tasks_result = []
|
||||
|
||||
|
||||
@step('the completion is truncated')
|
||||
def step_assert_completion_truncated(context):
|
||||
step_assert_completion_truncated(context, '')
|
||||
|
@ -311,6 +323,11 @@ def step_response_format(context, response_format):
|
|||
context.response_format = json.loads(response_format)
|
||||
|
||||
|
||||
@step('{temperature:f} temperature')
|
||||
def step_temperature(context, temperature):
|
||||
context.temperature = temperature
|
||||
|
||||
|
||||
@step('streaming is {enable_streaming}')
|
||||
def step_streaming(context, enable_streaming):
|
||||
context.enable_streaming = enable_streaming == 'enabled'
|
||||
|
@ -353,7 +370,10 @@ def step_n_ubatch(context, n_ubatch):
|
|||
|
||||
@step('{seed:d} as seed')
|
||||
def step_seed(context, seed):
|
||||
context.seed = seed
|
||||
if context.seed is None:
|
||||
context.seed = [seed]
|
||||
else:
|
||||
context.seed.append(seed)
|
||||
|
||||
|
||||
@step('a prefix prompt')
|
||||
|
@ -413,7 +433,9 @@ async def step_oai_chat_completions(context, api_error):
|
|||
if context.debug:
|
||||
print(f"Submitting OAI compatible completions request...")
|
||||
expect_api_error = api_error == 'raised'
|
||||
seeds = await completions_seed(context, num_seeds=1),
|
||||
completion = await oai_chat_completions(context.prompts.pop(),
|
||||
seeds[0] if seeds is not None else seeds,
|
||||
context.system_prompt,
|
||||
context.base_url,
|
||||
'/v1/chat',
|
||||
|
@ -429,8 +451,6 @@ async def step_oai_chat_completions(context, api_error):
|
|||
response_format=context.response_format
|
||||
if hasattr(context, 'response_format') else None,
|
||||
|
||||
seed=await completions_seed(context),
|
||||
|
||||
user_api_key=context.user_api_key
|
||||
if hasattr(context, 'user_api_key') else None,
|
||||
|
||||
|
@ -457,20 +477,31 @@ def step_a_prompt_prompt(context, prompt):
|
|||
context.n_prompts = len(context.prompts)
|
||||
|
||||
|
||||
@step('{num_prompts:d} prompts {prompt} with seed {seed:d}')
|
||||
def step_many_prompts(context, num_prompts, prompt, seed):
|
||||
if context.seed is None:
|
||||
context.seed = []
|
||||
for _ in range(num_prompts):
|
||||
context.seed.append(seed)
|
||||
context.prompts.append(prompt)
|
||||
context.n_prompts = len(context.prompts)
|
||||
|
||||
|
||||
@step('concurrent completion requests')
|
||||
@async_run_until_complete()
|
||||
async def step_concurrent_completion_requests(context):
|
||||
await concurrent_requests(context,
|
||||
request_completion,
|
||||
# prompt is inserted automatically
|
||||
context.base_url,
|
||||
debug=context.debug,
|
||||
prompt_prefix=context.prompt_prefix,
|
||||
prompt_suffix=context.prompt_suffix,
|
||||
n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
|
||||
seed=await completions_seed(context),
|
||||
user_api_key=context.user_api_key if hasattr(context,
|
||||
'user_api_key') else None)
|
||||
await concurrent_requests(
|
||||
context,
|
||||
request_completion,
|
||||
# prompt is inserted automatically
|
||||
context.base_url,
|
||||
debug=context.debug,
|
||||
prompt_prefix=context.prompt_prefix,
|
||||
prompt_suffix=context.prompt_suffix,
|
||||
n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
|
||||
user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None,
|
||||
temperature=context.temperature,
|
||||
)
|
||||
|
||||
|
||||
@step('concurrent OAI completions requests')
|
||||
|
@ -490,7 +521,6 @@ async def step_oai_chat_completions(context):
|
|||
if hasattr(context, 'enable_streaming') else None,
|
||||
response_format=context.response_format
|
||||
if hasattr(context, 'response_format') else None,
|
||||
seed=await completions_seed(context),
|
||||
user_api_key=context.user_api_key
|
||||
if hasattr(context, 'user_api_key') else None)
|
||||
|
||||
|
@ -512,10 +542,6 @@ async def step_oai_chat_completions(context):
|
|||
if hasattr(context, 'enable_streaming') else None,
|
||||
response_format=context.response_format
|
||||
if hasattr(context, 'response_format') else None,
|
||||
seed=context.seed
|
||||
if hasattr(context, 'seed') else
|
||||
context.server_seed
|
||||
if hasattr(context, 'server_seed') else None,
|
||||
user_api_key=context.user_api_key
|
||||
if hasattr(context, 'user_api_key') else None)
|
||||
|
||||
|
@ -544,7 +570,7 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
|
|||
@async_run_until_complete
|
||||
async def step_compute_embedding(context):
|
||||
context.n_prompts = 1
|
||||
context.embeddings = await request_embedding(context_text(context), base_url=context.base_url)
|
||||
context.embeddings = await request_embedding(context_text(context), None, base_url=context.base_url)
|
||||
|
||||
|
||||
@step('all embeddings are the same')
|
||||
|
@ -585,7 +611,7 @@ def step_assert_embeddings(context):
|
|||
@async_run_until_complete
|
||||
async def step_oai_compute_embeddings(context):
|
||||
context.n_prompts = 1
|
||||
context.embeddings = await request_oai_embeddings(context_text(context),
|
||||
context.embeddings = await request_oai_embeddings(context_text(context), None,
|
||||
base_url=context.base_url,
|
||||
user_api_key=context.user_api_key,
|
||||
model=context.model)
|
||||
|
@ -594,7 +620,7 @@ async def step_oai_compute_embeddings(context):
|
|||
@step('an OAI compatible embeddings computation request for multiple inputs')
|
||||
@async_run_until_complete
|
||||
async def step_oai_compute_embeddings_multiple_inputs(context):
|
||||
context.embeddings = await request_oai_embeddings(context.prompts,
|
||||
context.embeddings = await request_oai_embeddings(context.prompts, None,
|
||||
base_url=context.base_url,
|
||||
user_api_key=context.user_api_key,
|
||||
model=context.model)
|
||||
|
@ -740,8 +766,9 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
|
|||
if context.debug:
|
||||
print(f"starting {context.n_prompts} concurrent completion requests...")
|
||||
assert context.n_prompts > 0
|
||||
seeds = await completions_seed(context)
|
||||
for prompt_no in range(context.n_prompts):
|
||||
shifted_args = [context.prompts.pop(), *args]
|
||||
shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
|
||||
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
|
@ -781,6 +808,7 @@ def step_server_responds_with_status_code(context, status_code):
|
|||
|
||||
|
||||
async def request_completion(prompt,
|
||||
seed,
|
||||
base_url,
|
||||
debug=False,
|
||||
prompt_prefix=None,
|
||||
|
@ -788,9 +816,9 @@ async def request_completion(prompt,
|
|||
n_predict=None,
|
||||
cache_prompt=False,
|
||||
id_slot=None,
|
||||
seed=None,
|
||||
expect_api_error=None,
|
||||
user_api_key=None):
|
||||
user_api_key=None,
|
||||
temperature=None):
|
||||
if debug:
|
||||
print(f"Sending completion request: {prompt}")
|
||||
origin = "my.super.domain"
|
||||
|
@ -811,7 +839,8 @@ async def request_completion(prompt,
|
|||
"n_predict": n_predict if n_predict is not None else -1,
|
||||
"cache_prompt": cache_prompt,
|
||||
"id_slot": id_slot,
|
||||
"seed": seed if seed is not None else 42
|
||||
"seed": seed if seed is not None else 42,
|
||||
"temperature": temperature if temperature is not None else "0.8f",
|
||||
},
|
||||
headers=headers,
|
||||
timeout=3600) as response:
|
||||
|
@ -824,6 +853,7 @@ async def request_completion(prompt,
|
|||
|
||||
|
||||
async def oai_chat_completions(user_prompt,
|
||||
seed,
|
||||
system_prompt,
|
||||
base_url,
|
||||
base_path,
|
||||
|
@ -833,7 +863,6 @@ async def oai_chat_completions(user_prompt,
|
|||
n_predict=None,
|
||||
enable_streaming=None,
|
||||
response_format=None,
|
||||
seed=None,
|
||||
user_api_key=None,
|
||||
expect_api_error=None):
|
||||
if debug:
|
||||
|
@ -952,7 +981,7 @@ async def oai_chat_completions(user_prompt,
|
|||
return completion_response
|
||||
|
||||
|
||||
async def request_embedding(content, base_url=None):
|
||||
async def request_embedding(content, seed, base_url=None):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(f'{base_url}/embedding',
|
||||
json={
|
||||
|
@ -963,7 +992,7 @@ async def request_embedding(content, base_url=None):
|
|||
return [response_json['embedding']]
|
||||
|
||||
|
||||
async def request_oai_embeddings(input,
|
||||
async def request_oai_embeddings(input, seed,
|
||||
base_url=None, user_api_key=None,
|
||||
model=None, async_client=False):
|
||||
# openai client always expects an api_key
|
||||
|
@ -1036,21 +1065,31 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
|
|||
f' {n_predicted} <> {expected_predicted_n}')
|
||||
|
||||
def assert_all_predictions_equal(completion_responses):
|
||||
content_0 = completion_responses[0]['content']
|
||||
|
||||
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
|
||||
print(f"content 0: {content_0}")
|
||||
for i, response_i in enumerate(completion_responses):
|
||||
content_i = response_i['content']
|
||||
print(f"content {i}: {content_i}")
|
||||
for i, response_i in enumerate(completion_responses):
|
||||
content_i = response_i['content']
|
||||
for j, response_j in enumerate(completion_responses):
|
||||
if i == j:
|
||||
continue
|
||||
content_j = response_j['content']
|
||||
assert content_i == content_j, "contents not equal"
|
||||
|
||||
i = 1
|
||||
for response in completion_responses[1:]:
|
||||
content = response['content']
|
||||
|
||||
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
|
||||
print(f"content {i}: {content}")
|
||||
|
||||
assert content == content_0, "contents not equal"
|
||||
|
||||
i += 1
|
||||
def assert_all_predictions_different(completion_responses):
    """Assert that no two completion responses carry the same content.

    Args:
        completion_responses: sequence of response mappings; the generated
            text of each one is read from its ``'content'`` key.

    Raises:
        AssertionError: if any two responses have equal content.
    """
    contents = [response['content'] for response in completion_responses]

    # Optional trace of every completion, enabled by setting DEBUG=ON.
    if os.environ.get('DEBUG') == 'ON':
        for i, content in enumerate(contents):
            print(f"content {i}: {content}")

    # Inequality is symmetric, so each unordered pair is checked exactly once
    # instead of visiting both (i, j) and (j, i).
    for i, content_i in enumerate(contents):
        for content_j in contents[i + 1:]:
            assert content_i != content_j, "contents not different"
|
||||
|
||||
|
||||
async def gather_tasks_results(context):
|
||||
|
@ -1145,9 +1184,22 @@ def assert_slots_status(slots, expected_slots):
|
|||
f" = {expected[key]} != {slot[key]}")
|
||||
|
||||
|
||||
async def completions_seed(context):
|
||||
return context.seed if hasattr(context, 'seed') and context.seed is not None \
|
||||
else context.server_seed if hasattr(context, 'server_seed') else None
|
||||
async def completions_seed(context, num_seeds=None):
    """Return the seeds to use for the next completion requests.

    Seeds stored on ``context.seed`` take precedence and are consumed as they
    are handed out. When none are set, ``context.server_seed`` is repeated
    once per requested seed.

    Args:
        context: object exposing ``n_prompts`` and, optionally, ``seed``
            (a list) and ``server_seed``.
        num_seeds: how many seeds to return; defaults to ``context.n_prompts``.

    Returns:
        A list of seeds, or ``None`` when neither seed source is set.
    """
    pending_seeds = getattr(context, "seed", None)
    if pending_seeds is not None:
        assert len(pending_seeds) == context.n_prompts
        if num_seeds is None:
            num_seeds = context.n_prompts
        assert num_seeds <= context.n_prompts
        seeds = pending_seeds[:num_seeds]
        # Keep only the seeds that were not handed out; once every seed has
        # been consumed the attribute is reset to None.
        context.seed = pending_seeds[num_seeds:] if num_seeds < context.n_prompts else None
        return seeds

    server_seed = getattr(context, "server_seed", None)
    if server_seed is not None:
        count = context.n_prompts if num_seeds is None else num_seeds
        return [server_seed] * count

    return None
|
||||
|
||||
|
||||
def context_text(context):
|
||||
|
|
|
@ -137,7 +137,8 @@
|
|||
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
||||
|
||||
#define WARP_SIZE 32
|
||||
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
||||
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
||||
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
|
||||
|
||||
#define CC_PASCAL 600
|
||||
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
||||
|
@ -293,20 +294,54 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
|
|||
return x;
|
||||
}
|
||||
|
||||
// Returns the larger of two half values.
// When CUDART_VERSION >= CUDART_HMAX the __hmax intrinsic is used; otherwise
// both operands are converted to float for the comparison and the original
// half value is returned. On HIP/AMD builds the body consists only of
// NO_DEVICE_CODE.
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))

#if CUDART_VERSION >= CUDART_HMAX
    return __hmax(a, b);
#else
    return __half2float(a) > __half2float(b) ? a : b;
#endif // CUDART_VERSION >= CUDART_HMAX

#else
    GGML_UNUSED(a);
    GGML_UNUSED(b);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
}
|
||||
// Returns the element-wise maximum of two half2 values.
// When CUDART_VERSION >= CUDART_HMAX the __hmax2 intrinsic is used; otherwise
// the low and high halves are compared separately through float and the
// winning half of each lane is written into the result. On HIP/AMD builds the
// body consists only of NO_DEVICE_CODE.
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))

#if CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#else
    half2 ret;
    // Low lane goes to ret.x, high lane to ret.y.
    reinterpret_cast<half&>(ret.x) = __low2float(a) > __low2float(b) ? __low2half(a) : __low2half(b);
    reinterpret_cast<half&>(ret.y) = __high2float(a) > __high2float(b) ? __high2half(a) : __high2half(b);
    return ret;
#endif // CUDART_VERSION >= CUDART_HMAX

#else
    GGML_UNUSED(a);
    GGML_UNUSED(b);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
}
|
||||
|
||||
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
||||
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
||||
}
|
||||
return x;
|
||||
#else
|
||||
GGML_UNUSED(x);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||
}
|
||||
|
||||
#if CUDART_VERSION < 12000
|
||||
#if CUDART_VERSION < CUDART_HMASK
|
||||
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
|
||||
const uint32_t mask_low = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
|
||||
const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
|
||||
|
|
|
@ -116,7 +116,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
|||
sum2 = warp_reduce_sum(sum2);
|
||||
half sum = __low2half(sum2) + __high2half(sum2);
|
||||
sum += mask ? maskh[k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
||||
kqmax_new = __hmax(kqmax_new, sum);
|
||||
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
|
||||
if (threadIdx.x == 0) {
|
||||
KQ[i_KQ] = sum;
|
||||
}
|
||||
|
@ -416,9 +416,9 @@ static __global__ void flash_attn_ext_f16(
|
|||
const int k = k0 + threadIdx.x;
|
||||
|
||||
KQ2_tmp[k0/WARP_SIZE] += mask ? mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
|
||||
KQ_max_new = __hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
|
||||
KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
|
||||
}
|
||||
KQ_max_new = __half2half2(warp_reduce_max(__hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
|
||||
KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
|
||||
const half2 diff = KQ_max_h2[j0/nwarps] - KQ_max_new;
|
||||
KQ_max_scale_h2[j0/nwarps] = h2exp(diff);
|
||||
const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD));
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from tempfile import gettempdir, NamedTemporaryFile
|
||||
|
||||
logger = logging.getLogger("ggml-vk-generate-shaders")
|
||||
|
||||
shader_f32 = """
|
||||
#define FLOAT_TYPE float
|
||||
"""
|
||||
|
@ -2498,7 +2501,7 @@ async def string_to_spv(name, code, defines, fp16=True):
|
|||
|
||||
stdout, stderr = await proc.communicate()
|
||||
|
||||
print(" ".join(cmd))
|
||||
logger.info(" ".join(cmd))
|
||||
|
||||
if proc.returncode:
|
||||
raise RuntimeError(f"{name=} {f.name=} {stdout=} {stderr=}")
|
||||
|
@ -2507,7 +2510,7 @@ async def string_to_spv(name, code, defines, fp16=True):
|
|||
|
||||
cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
|
||||
code_with_lines = "\n".join([f"{i + 1}: {line}" for i, line in enumerate(preprocessed_code.splitlines())])
|
||||
print(f"ERROR compiling {name}\n\n{code_with_lines}\n\n{error}")
|
||||
logger.error(f"cannot compile {name}\n\n{code_with_lines}\n\n{error}")
|
||||
f.close()
|
||||
os.remove(f.name)
|
||||
sys.exit(proc.returncode)
|
||||
|
@ -2520,7 +2523,7 @@ async def string_to_spv(name, code, defines, fp16=True):
|
|||
|
||||
|
||||
async def main():
|
||||
print("ggml_vulkan: Generating and compiling shaders to SPIR-V")
|
||||
logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V")
|
||||
|
||||
tasks = []
|
||||
|
||||
|
@ -2768,9 +2771,12 @@ if __name__ == "__main__":
|
|||
parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator")
|
||||
|
||||
parser.add_argument("--glslc", help="Path to glslc")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
if args.glslc:
|
||||
GLSLC = args.glslc
|
||||
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
#!/usr/bin/env python3
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from gguf.gguf_reader import GGUFReader
|
||||
|
||||
logger = logging.getLogger("reader")
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
|
@ -18,28 +20,28 @@ def read_gguf_file(gguf_file_path):
|
|||
reader = GGUFReader(gguf_file_path)
|
||||
|
||||
# List all key-value pairs in a columnized format
|
||||
print("Key-Value Pairs:")
|
||||
print("Key-Value Pairs:") # noqa: NP100
|
||||
max_key_length = max(len(key) for key in reader.fields.keys())
|
||||
for key, field in reader.fields.items():
|
||||
value = field.parts[field.data[0]]
|
||||
print(f"{key:{max_key_length}} : {value}")
|
||||
print("----")
|
||||
print(f"{key:{max_key_length}} : {value}") # noqa: NP100
|
||||
print("----") # noqa: NP100
|
||||
|
||||
# List all tensors
|
||||
print("Tensors:")
|
||||
print("Tensors:") # noqa: NP100
|
||||
tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
|
||||
print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization"))
|
||||
print("-" * 80)
|
||||
print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100
|
||||
print("-" * 80) # noqa: NP100
|
||||
for tensor in reader.tensors:
|
||||
shape_str = "x".join(map(str, tensor.shape))
|
||||
size_str = str(tensor.n_elements)
|
||||
quantization_str = tensor.tensor_type.name
|
||||
print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str))
|
||||
print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: reader.py <path_to_gguf_file>")
|
||||
logger.info("Usage: reader.py <path_to_gguf_file>")
|
||||
sys.exit(1)
|
||||
gguf_file_path = sys.argv[1]
|
||||
read_gguf_file(gguf_file_path)
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from enum import Enum, IntEnum, auto
|
||||
from typing import Any
|
||||
|
||||
|
@ -854,8 +853,7 @@ class GGUFValueType(IntEnum):
|
|||
return GGUFValueType.INT32
|
||||
# TODO: need help with 64-bit types in Python
|
||||
else:
|
||||
print("Unknown type:", type(val))
|
||||
sys.exit()
|
||||
raise ValueError(f"Unknown type: {type(val)}")
|
||||
|
||||
|
||||
# Note: Does not support GGML_QKK_64
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
#
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import Any, Literal, NamedTuple, TypeVar, Union
|
||||
|
@ -27,6 +28,7 @@ from gguf.constants import (
|
|||
GGUFValueType,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION]
|
||||
|
||||
|
@ -142,7 +144,7 @@ class GGUFReader:
|
|||
# TODO: add option to generate error on duplicate keys
|
||||
# raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
|
||||
|
||||
print(f'Warning: Duplicate key {field.name} at offset {field.offset}')
|
||||
logger.warning(f'Duplicate key {field.name} at offset {field.offset}')
|
||||
self.fields[field.name + '_{}'.format(field.offset)] = field
|
||||
else:
|
||||
self.fields[field.name] = field
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import struct
|
||||
|
@ -24,6 +25,8 @@ from .constants import (
|
|||
TokenType,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WriterState(Enum):
|
||||
EMPTY = auto()
|
||||
|
@ -67,7 +70,7 @@ class GGUFWriter:
|
|||
self.use_temp_file = use_temp_file
|
||||
self.temp_file = None
|
||||
self.tensors = []
|
||||
print("gguf: This GGUF file is for {0} Endian only".format(
|
||||
logger.info("gguf: This GGUF file is for {0} Endian only".format(
|
||||
"Big" if self.endianess == GGUFEndian.BIG else "Little",
|
||||
))
|
||||
self.state = WriterState.EMPTY
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable
|
||||
|
||||
from .gguf_writer import GGUFWriter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SpecialVocab:
|
||||
merges: list[str]
|
||||
|
@ -40,38 +42,29 @@ class SpecialVocab:
|
|||
def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
|
||||
if self.merges:
|
||||
if not quiet:
|
||||
print(f'gguf: Adding {len(self.merges)} merge(s).')
|
||||
logger.info(f'Adding {len(self.merges)} merge(s).')
|
||||
gw.add_token_merges(self.merges)
|
||||
elif self.load_merges:
|
||||
print(
|
||||
'gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.',
|
||||
file = sys.stderr,
|
||||
)
|
||||
logger.warning('Adding merges requested but no merges found, output may be non-functional.')
|
||||
for typ, tokid in self.special_token_ids.items():
|
||||
id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
|
||||
if id_handler is None:
|
||||
print(
|
||||
f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping',
|
||||
file = sys.stderr,
|
||||
)
|
||||
logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping')
|
||||
continue
|
||||
if not quiet:
|
||||
print(f'gguf: Setting special token type {typ} to {tokid}')
|
||||
logger.info(f'Setting special token type {typ} to {tokid}')
|
||||
id_handler(tokid)
|
||||
for typ, value in self.add_special_token.items():
|
||||
add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
|
||||
if add_handler is None:
|
||||
print(
|
||||
f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping',
|
||||
file = sys.stderr,
|
||||
)
|
||||
logger.warning(f'No handler for add_{typ}_token with value {value} - skipping')
|
||||
continue
|
||||
if not quiet:
|
||||
print(f'gguf: Setting add_{typ}_token to {value}')
|
||||
logger.info(f'Setting add_{typ}_token to {value}')
|
||||
add_handler(value)
|
||||
if self.chat_template is not None:
|
||||
if not quiet:
|
||||
print(f'gguf: Setting chat_template to {self.chat_template}')
|
||||
logger.info(f'Setting chat_template to {self.chat_template}')
|
||||
gw.add_chat_template(self.chat_template)
|
||||
|
||||
def _load(self, path: Path) -> None:
|
||||
|
@ -99,10 +92,7 @@ class SpecialVocab:
|
|||
continue
|
||||
parts = line.split(None, 3)
|
||||
if len(parts) != 2:
|
||||
print(
|
||||
f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring',
|
||||
file = sys.stderr,
|
||||
)
|
||||
logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring')
|
||||
continue
|
||||
merges.append(f'{parts[0]} {parts[1]}')
|
||||
self.merges = merges
|
||||
|
@ -118,10 +108,7 @@ class SpecialVocab:
|
|||
return
|
||||
self.special_token_ids[typ] = tid
|
||||
return
|
||||
print(
|
||||
f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
|
||||
file = sys.stderr,
|
||||
)
|
||||
logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
|
||||
|
||||
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
|
||||
tokenizer_file = path / 'tokenizer.json'
|
||||
|
@ -144,10 +131,7 @@ class SpecialVocab:
|
|||
if chat_template is None or isinstance(chat_template, (str, list)):
|
||||
self.chat_template = chat_template
|
||||
else:
|
||||
print(
|
||||
f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring',
|
||||
file = sys.stderr
|
||||
)
|
||||
logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring')
|
||||
for typ in self.special_token_types:
|
||||
add_entry = tokenizer_config.get(f'add_{typ}_token')
|
||||
if isinstance(add_entry, bool):
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from tqdm import tqdm
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
@ -14,6 +16,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /
|
|||
|
||||
import gguf
|
||||
|
||||
logger = logging.getLogger("gguf-convert-endian")
|
||||
|
||||
|
||||
def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None:
|
||||
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
|
||||
|
@ -29,11 +33,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
|
|||
else:
|
||||
file_endian = host_endian
|
||||
order = host_endian if args.order == "native" else args.order
|
||||
print(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian")
|
||||
logger.info(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian")
|
||||
if file_endian == order:
|
||||
print(f"* File is already {order.upper()} endian. Nothing to do.")
|
||||
logger.info(f"* File is already {order.upper()} endian. Nothing to do.")
|
||||
sys.exit(0)
|
||||
print("* Checking tensors for conversion compatibility")
|
||||
logger.info("* Checking tensors for conversion compatibility")
|
||||
for tensor in reader.tensors:
|
||||
if tensor.tensor_type not in (
|
||||
gguf.GGMLQuantizationType.F32,
|
||||
|
@ -41,51 +45,64 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
|
|||
gguf.GGMLQuantizationType.Q8_0,
|
||||
):
|
||||
raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
|
||||
print(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}")
|
||||
logger.info(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}")
|
||||
if args.dry_run:
|
||||
return
|
||||
print("\n*** Warning *** Warning *** Warning **")
|
||||
print("* This conversion process may damage the file. Ensure you have a backup.")
|
||||
logger.warning("*** Warning *** Warning *** Warning **")
|
||||
logger.warning("* This conversion process may damage the file. Ensure you have a backup.")
|
||||
if order != host_endian:
|
||||
print("* Requested endian differs from host, you will not be able to load the model on this machine.")
|
||||
print("* The file will be modified immediately, so if conversion fails or is interrupted")
|
||||
print("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:")
|
||||
logger.warning("* Requested endian differs from host, you will not be able to load the model on this machine.")
|
||||
logger.warning("* The file will be modified immediately, so if conversion fails or is interrupted")
|
||||
logger.warning("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:")
|
||||
response = input("YES, I am sure> ")
|
||||
if response != "YES":
|
||||
print("You didn't enter YES. Okay then, see ya!")
|
||||
logger.warning("You didn't enter YES. Okay then, see ya!")
|
||||
sys.exit(0)
|
||||
print(f"\n* Converting fields ({len(reader.fields)})")
|
||||
logger.info(f"* Converting fields ({len(reader.fields)})")
|
||||
for idx, field in enumerate(reader.fields.values()):
|
||||
print(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}")
|
||||
logger.info(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}")
|
||||
for part in field.parts:
|
||||
part.byteswap(inplace=True)
|
||||
print(f"\n* Converting tensors ({len(reader.tensors)})")
|
||||
for idx, tensor in enumerate(reader.tensors):
|
||||
print(
|
||||
f" - {idx:4}: Converting tensor {repr(tensor.name)}, type={tensor.tensor_type.name}, "
|
||||
f"elements={tensor.n_elements}... ",
|
||||
end="",
|
||||
logger.info(f"* Converting tensors ({len(reader.tensors)})")
|
||||
|
||||
for idx, tensor in enumerate(pbar := tqdm(reader.tensors, desc="Converting tensor")):
|
||||
log_message = (
|
||||
f"Converting tensor {repr(tensor.name)}, "
|
||||
f"type={tensor.tensor_type.name}, "
|
||||
f"elements={tensor.n_elements} "
|
||||
)
|
||||
tensor_type = tensor.tensor_type
|
||||
|
||||
# Byte-swap each part of the tensor's field
|
||||
for part in tensor.field.parts:
|
||||
part.byteswap(inplace=True)
|
||||
if tensor_type != gguf.GGMLQuantizationType.Q8_0:
|
||||
|
||||
# Byte-swap tensor data if necessary
|
||||
if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0:
|
||||
# Handle Q8_0 tensor blocks (block_q8_0)
|
||||
# Specific handling of block_q8_0 is required.
|
||||
# Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.
|
||||
|
||||
block_size = 34 # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>
|
||||
|
||||
n_blocks = len(tensor.data) // block_size
|
||||
for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
|
||||
block_offs = block_num * block_size
|
||||
|
||||
# Byte-Swap f16 sized delta field
|
||||
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
|
||||
delta.byteswap(inplace=True)
|
||||
|
||||
# Byte-Swap Q8 weights
|
||||
if block_num % 100000 == 0:
|
||||
inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
|
||||
|
||||
else:
|
||||
# Handle other tensor types
|
||||
tensor.data.byteswap(inplace=True)
|
||||
print()
|
||||
continue
|
||||
# A Q8_0 block consists of a f16 delta followed by 32 int8 quants, so 34 bytes
|
||||
block_size = 34
|
||||
n_blocks = len(tensor.data) // block_size
|
||||
for block_num in range(n_blocks):
|
||||
block_offs = block_num * block_size
|
||||
# I know I said f16, but it doesn't matter here - any simple 16 bit type works.
|
||||
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
|
||||
delta.byteswap(inplace=True)
|
||||
if block_num % 100000 == 0:
|
||||
print(f"[{(n_blocks - block_num) // 1000}K]", end="")
|
||||
sys.stdout.flush()
|
||||
print()
|
||||
print("* Completion")
|
||||
|
||||
pbar.set_description(log_message)
|
||||
|
||||
logger.info("* Completion")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
|
@ -102,8 +119,13 @@ def main() -> None:
|
|||
"--dry-run", action="store_true",
|
||||
help="Don't actually change anything",
|
||||
)
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
|
||||
args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
|
||||
print(f'* Loading: {args.model}')
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
logger.info(f'* Loading: {args.model}')
|
||||
reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+')
|
||||
convert_byteorder(reader, args)
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
@ -15,6 +16,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /
|
|||
|
||||
from gguf import GGUFReader, GGUFValueType # noqa: E402
|
||||
|
||||
logger = logging.getLogger("gguf-dump")
|
||||
|
||||
|
||||
def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
|
||||
host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG'
|
||||
|
@ -29,8 +32,8 @@ def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
|
|||
# please see the comments in the modify_gguf.py example.
|
||||
def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
|
||||
host_endian, file_endian = get_file_host_endian(reader)
|
||||
print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.')
|
||||
print(f'\n* Dumping {len(reader.fields)} key/value pair(s)')
|
||||
print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.') # noqa: NP100
|
||||
print(f'* Dumping {len(reader.fields)} key/value pair(s)') # noqa: NP100
|
||||
for n, field in enumerate(reader.fields.values(), 1):
|
||||
if not field.types:
|
||||
pretty_type = 'N/A'
|
||||
|
@ -39,20 +42,21 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
|
|||
pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count
|
||||
else:
|
||||
pretty_type = str(field.types[-1].name)
|
||||
print(f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '')
|
||||
|
||||
log_message = f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}'
|
||||
if len(field.types) == 1:
|
||||
curr_type = field.types[0]
|
||||
if curr_type == GGUFValueType.STRING:
|
||||
print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '')
|
||||
log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60]))
|
||||
elif field.types[0] in reader.gguf_scalar_to_np:
|
||||
print(' = {0}'.format(field.parts[-1][0]), end = '')
|
||||
print()
|
||||
log_message += ' = {0}'.format(field.parts[-1][0])
|
||||
print(log_message) # noqa: NP100
|
||||
if args.no_tensors:
|
||||
return
|
||||
print(f'\n* Dumping {len(reader.tensors)} tensor(s)')
|
||||
print(f'* Dumping {len(reader.tensors)} tensor(s)') # noqa: NP100
|
||||
for n, tensor in enumerate(reader.tensors, 1):
|
||||
prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape)))
|
||||
print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}')
|
||||
print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}') # noqa: NP100
|
||||
|
||||
|
||||
def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
|
||||
|
@ -103,10 +107,17 @@ def main() -> None:
|
|||
parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
|
||||
parser.add_argument("--json", action="store_true", help="Produce JSON output")
|
||||
parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
|
||||
args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
if not args.json:
|
||||
print(f'* Loading: {args.model}')
|
||||
logger.info(f'* Loading: {args.model}')
|
||||
|
||||
reader = GGUFReader(args.model, 'r')
|
||||
|
||||
if args.json:
|
||||
dump_metadata_json(reader, args)
|
||||
else:
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#!/usr/bin/env python3
|
||||
import logging
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
@ -10,6 +11,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /
|
|||
|
||||
from gguf import GGUFReader # noqa: E402
|
||||
|
||||
logger = logging.getLogger("gguf-set-metadata")
|
||||
|
||||
|
||||
def minimal_example(filename: str) -> None:
|
||||
reader = GGUFReader(filename, 'r+')
|
||||
|
@ -41,36 +44,33 @@ def minimal_example(filename: str) -> None:
|
|||
def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
|
||||
field = reader.get_field(args.key)
|
||||
if field is None:
|
||||
print(f'! Field {repr(args.key)} not found', file = sys.stderr)
|
||||
logger.error(f'! Field {repr(args.key)} not found')
|
||||
sys.exit(1)
|
||||
# Note that field.types is a list of types. This is because the GGUF
|
||||
# format supports arrays. For example, an array of UINT32 would
|
||||
# look like [GGUFValueType.ARRAY, GGUFValueType.UINT32]
|
||||
handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None
|
||||
if handler is None:
|
||||
print(
|
||||
f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}',
|
||||
file = sys.stderr,
|
||||
)
|
||||
logger.error(f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}')
|
||||
sys.exit(1)
|
||||
current_value = field.parts[field.data[0]][0]
|
||||
new_value = handler(args.value)
|
||||
print(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}')
|
||||
logger.info(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}')
|
||||
if current_value == new_value:
|
||||
print(f'- Key {repr(args.key)} already set to requested value {current_value}')
|
||||
logger.info(f'- Key {repr(args.key)} already set to requested value {current_value}')
|
||||
sys.exit(0)
|
||||
if args.dry_run:
|
||||
sys.exit(0)
|
||||
if not args.force:
|
||||
print('*** Warning *** Warning *** Warning **')
|
||||
print('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.')
|
||||
print('* Enter exactly YES if you are positive you want to proceed:')
|
||||
logger.warning('*** Warning *** Warning *** Warning **')
|
||||
logger.warning('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.')
|
||||
logger.warning('* Enter exactly YES if you are positive you want to proceed:')
|
||||
response = input('YES, I am sure> ')
|
||||
if response != 'YES':
|
||||
print("You didn't enter YES. Okay then, see ya!")
|
||||
logger.info("You didn't enter YES. Okay then, see ya!")
|
||||
sys.exit(0)
|
||||
field.parts[field.data[0]][0] = new_value
|
||||
print('* Field changed. Successful completion.')
|
||||
logger.info('* Field changed. Successful completion.')
|
||||
|
||||
|
||||
def main() -> None:
|
||||
|
@ -80,8 +80,13 @@ def main() -> None:
|
|||
parser.add_argument("value", type=str, help="Metadata value to set")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything")
|
||||
parser.add_argument("--force", action="store_true", help="Change the field without confirmation")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
|
||||
args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
|
||||
print(f'* Loading: {args.model}')
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
logger.info(f'* Loading: {args.model}')
|
||||
reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+')
|
||||
set_metadata(reader, args)
|
||||
|
||||
|
|
|
@ -2359,7 +2359,7 @@ static bool llama_kv_cache_init(
|
|||
cache.recurrent = model.arch == LLM_ARCH_MAMBA;
|
||||
cache.v_trans = !cparams.flash_attn;
|
||||
|
||||
// TODO: support mixed reccurent Transformer architectues
|
||||
// TODO: support mixed recurrent Transformer architectures
|
||||
// NOTE: (!a || b) is a logical implication (a -> b)
|
||||
GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
|
||||
GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
|
||||
|
|
2
llama.h
2
llama.h
|
@ -172,7 +172,7 @@ extern "C" {
|
|||
bool sorted;
|
||||
} llama_token_data_array;
|
||||
|
||||
typedef bool (*llama_progress_callback)(float progress, void *ctx);
|
||||
typedef bool (*llama_progress_callback)(float progress, void * user_data);
|
||||
|
||||
// Input data for llama_decode
|
||||
// A llama_batch object can contain input about one or many sequences
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import heapq
|
||||
import sys
|
||||
|
@ -11,9 +12,11 @@ try:
|
|||
import git
|
||||
from tabulate import tabulate
|
||||
except ImportError as e:
|
||||
print("ERROR: the following Python libraries are required: GitPython, tabulate.")
|
||||
print("the following Python libraries are required: GitPython, tabulate.") # noqa: NP100
|
||||
raise e
|
||||
|
||||
logger = logging.getLogger("compare-llama-bench")
|
||||
|
||||
# Properties by which to differentiate results per commit:
|
||||
KEY_PROPERTIES = [
|
||||
"cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas",
|
||||
|
@ -94,8 +97,7 @@ parser.add_argument("-s", "--show", help=help_s)
|
|||
known_args, unknown_args = parser.parse_known_args()
|
||||
|
||||
if unknown_args:
|
||||
print(f"ERROR: Received unknown args: {unknown_args}.")
|
||||
print()
|
||||
logger.error(f"Received unknown args: {unknown_args}.")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
|
@ -108,8 +110,7 @@ if input_file is None:
|
|||
input_file = sqlite_files[0]
|
||||
|
||||
if input_file is None:
|
||||
print("ERROR: Cannot find a suitable input file, please provide one.")
|
||||
print()
|
||||
logger.error("Cannot find a suitable input file, please provide one.")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
|
@ -194,23 +195,19 @@ if known_args.baseline is not None:
|
|||
hexsha8_baseline = get_commit_hexsha8(known_args.baseline)
|
||||
name_baseline = known_args.baseline
|
||||
if hexsha8_baseline is None:
|
||||
print(f"ERROR: cannot find data for baseline={known_args.baseline}.")
|
||||
logger.error(f"cannot find data for baseline={known_args.baseline}.")
|
||||
sys.exit(1)
|
||||
# Otherwise, search for the most recent parent of master for which there is data:
|
||||
elif repo is not None:
|
||||
hexsha8_baseline = find_parent_in_data(repo.heads.master.commit)
|
||||
|
||||
if hexsha8_baseline is None:
|
||||
print("ERROR: No baseline was provided and did not find data for any master branch commits.")
|
||||
print()
|
||||
logger.error("No baseline was provided and did not find data for any master branch commits.")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(
|
||||
"ERROR: No baseline was provided and the current working directory "
|
||||
"is not part of a git repository from which a baseline could be inferred."
|
||||
)
|
||||
print()
|
||||
logger.error("No baseline was provided and the current working directory "
|
||||
"is not part of a git repository from which a baseline could be inferred.")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
|
@ -227,7 +224,7 @@ if known_args.compare is not None:
|
|||
hexsha8_compare = get_commit_hexsha8(known_args.compare)
|
||||
name_compare = known_args.compare
|
||||
if hexsha8_compare is None:
|
||||
print(f"ERROR: cannot find data for compare={known_args.compare}.")
|
||||
logger.error(f"cannot find data for compare={known_args.compare}.")
|
||||
sys.exit(1)
|
||||
# Otherwise, search for the commit for which llama-bench was most recently run
|
||||
# and that is not a parent of master:
|
||||
|
@ -241,16 +238,12 @@ elif repo is not None:
|
|||
break
|
||||
|
||||
if hexsha8_compare is None:
|
||||
print("ERROR: No compare target was provided and did not find data for any non-master commits.")
|
||||
print()
|
||||
logger.error("No compare target was provided and did not find data for any non-master commits.")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(
|
||||
"ERROR: No compare target was provided and the current working directory "
|
||||
"is not part of a git repository from which a compare target could be inferred."
|
||||
)
|
||||
print()
|
||||
logger.error("No compare target was provided and the current working directory "
|
||||
"is not part of a git repository from which a compare target could be inferred.\n")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
|
@ -284,8 +277,7 @@ if known_args.show is not None:
|
|||
if prop not in KEY_PROPERTIES[:-2]: # Last two values are n_prompt, n_gen.
|
||||
unknown_cols.append(prop)
|
||||
if unknown_cols:
|
||||
print(f"ERROR: Unknown values for --show: {', '.join(unknown_cols)}")
|
||||
print()
|
||||
logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}")
|
||||
parser.print_usage()
|
||||
sys.exit(1)
|
||||
rows_show = get_rows(show)
|
||||
|
@ -369,7 +361,7 @@ if "gpu_info" in show:
|
|||
headers = [PRETTY_NAMES[p] for p in show]
|
||||
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
|
||||
|
||||
print(tabulate(
|
||||
logger.info(tabulate(
|
||||
table,
|
||||
headers=headers,
|
||||
floatfmt=".2f",
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
|
@ -7,6 +8,8 @@ import sys
|
|||
|
||||
import yaml
|
||||
|
||||
logger = logging.getLogger("run-with-preset")
|
||||
|
||||
CLI_ARGS_MAIN_PERPLEXITY = [
|
||||
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
|
||||
"export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
|
||||
|
@ -56,6 +59,7 @@ parser.add_argument("-bin", "--binary", help="The binary to run.")
|
|||
parser.add_argument("yaml_files", nargs="*",
|
||||
help="Arbitrary number of YAML files from which to read preset values. "
|
||||
"If two files specify the same values the later one will be used.")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
|
||||
known_args, unknown_args = parser.parse_known_args()
|
||||
|
||||
|
@ -63,6 +67,8 @@ if not known_args.yaml_files and not unknown_args:
|
|||
parser.print_help()
|
||||
sys.exit(0)
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
|
||||
|
||||
props = dict()
|
||||
|
||||
for yaml_file in known_args.yaml_files:
|
||||
|
@ -85,7 +91,7 @@ elif binary.lower().endswith("llama-bench"):
|
|||
elif binary.lower().endswith("server"):
|
||||
cli_args = CLI_ARGS_SERVER
|
||||
else:
|
||||
print(f"Unknown binary: {binary}")
|
||||
logger.error(f"Unknown binary: {binary}")
|
||||
sys.exit(1)
|
||||
|
||||
command_list = [binary]
|
||||
|
@ -121,11 +127,11 @@ for cli_arg in cli_args:
|
|||
|
||||
num_unused = len(props)
|
||||
if num_unused > 10:
|
||||
print(f"The preset file contained a total of {num_unused} unused properties.")
|
||||
logger.info(f"The preset file contained a total of {num_unused} unused properties.")
|
||||
elif num_unused > 0:
|
||||
print("The preset file contained the following unused properties:")
|
||||
logger.info("The preset file contained the following unused properties:")
|
||||
for prop, value in props.items():
|
||||
print(f" {prop}: {value}")
|
||||
logger.info(f" {prop}: {value}")
|
||||
|
||||
command_list += unknown_args
|
||||
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import logging
|
||||
import os
|
||||
import hashlib
|
||||
|
||||
logger = logging.getLogger("verify-checksum-models")
|
||||
|
||||
|
||||
def sha256sum(file):
|
||||
block_size = 16 * 1024 * 1024 # 16 MB block size
|
||||
|
@ -27,7 +30,7 @@ hash_list_file = os.path.join(llama_path, "SHA256SUMS")
|
|||
|
||||
# Check if the hash list file exists
|
||||
if not os.path.exists(hash_list_file):
|
||||
print(f"Hash list file not found: {hash_list_file}")
|
||||
logger.error(f"Hash list file not found: {hash_list_file}")
|
||||
exit(1)
|
||||
|
||||
# Read the hash file content and split it into an array of lines
|
||||
|
@ -46,7 +49,7 @@ for line in hash_list:
|
|||
file_path = os.path.join(llama_path, filename)
|
||||
|
||||
# Informing user of the progress of the integrity check
|
||||
print(f"Verifying the checksum of {file_path}")
|
||||
logger.info(f"Verifying the checksum of {file_path}")
|
||||
|
||||
# Check if the file exists
|
||||
if os.path.exists(file_path):
|
||||
|
@ -73,9 +76,9 @@ for line in hash_list:
|
|||
|
||||
|
||||
# Print column headers for results table
|
||||
print("\n" + "filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20))
|
||||
print("-" * 80)
|
||||
print("filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20)) # noqa: NP100
|
||||
print("-" * 80) # noqa: NP100
|
||||
|
||||
# Output the results as a table
|
||||
for r in results:
|
||||
print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}")
|
||||
print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}") # noqa: NP100
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue