convert : add convert-hf-to-gguf-update.py

ggml-ci
Georgi Gerganov 2024-04-28 20:29:32 +03:00
parent ee6d1b3fb4
commit 7642973616
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
5 changed files with 215 additions and 26 deletions


@@ -20,5 +20,5 @@ jobs:
       - name: flake8 Lint
         uses: py-actions/flake8@v2
         with:
-          ignore: "E203,E211,E221,E222,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
+          ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
           exclude: "examples/*,examples/*/**,*/**/__init__.py"


@@ -0,0 +1,162 @@
# Instructions:
#
# - Add a new model to the "models" list
# - Run the script with your huggingface token:
#
# python3 convert-hf-to-gguf-update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
#
# TODO: generate tokenizer tests for llama.cpp
#
import os
import requests
import sys
import json
from hashlib import sha256
from enum import IntEnum, auto

class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()

# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български what\'s \'\'\'\'\'\'```````\"\"\"\"......!!!!!!??????'
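
# (note: the string above deliberately mixes whitespace runs, repeated digits, CJK/Khmer text,
#  emoji and punctuation runs - these are the kinds of inputs where BPE pre-tokenizer regexes
#  tend to differ between models)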

if len(sys.argv) == 2:
    token = sys.argv[1]
else:
    print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)

# TODO: add models here
models = [
    { "name": "llama-v2",       "tokenizer_type": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
    { "name": "llama-v3",       "tokenizer_type": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
    { "name": "deepseek-llm",   "tokenizer_type": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat", },
    { "name": "deepseek-coder", "tokenizer_type": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
    { "name": "bert-bge",       "tokenizer_type": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
]
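
# note: some of these repos (e.g. the meta-llama ones) are gated on Hugging Face, so the token
# passed on the command line must belong to an account that has been granted access to them
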
# make directory "models/tokenizers" if it doesn't exist
if not os.path.exists("models/tokenizers"):
    os.makedirs("models/tokenizers")


def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print("File downloaded successfully.")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")

for model in models:
    name = model["name"]
    repo = model["repo"]
    tokenizer_type = model["tokenizer_type"]

    if not os.path.exists(f"models/tokenizers/{name}"):
        os.makedirs(f"models/tokenizers/{name}")
    else:
        print(f"Directory models/tokenizers/{name} already exists - skipping")
        continue

    print(f"Downloading {name} to models/tokenizers/{name}")

    url = f"{repo}/raw/main/tokenizer.json"
    save_path = f"models/tokenizers/{name}/tokenizer.json"
    download_file_with_auth(url, token, save_path)

    if tokenizer_type == TOKENIZER_TYPE.SPM:
        url = f"{repo}/resolve/main/tokenizer.model"
        save_path = f"models/tokenizers/{name}/tokenizer.model"
        download_file_with_auth(url, token, save_path)

    url = f"{repo}/raw/main/tokenizer_config.json"
    save_path = f"models/tokenizers/{name}/tokenizer_config.json"
    download_file_with_auth(url, token, save_path)
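
# note: at this point each models/tokenizers/<name>/ directory should contain the downloaded
# tokenizer.json and tokenizer_config.json (plus tokenizer.model for SentencePiece models),
# which is enough for AutoTokenizer.from_pretrained() below to load the tokenizer locally
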
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
# TODO: auto-update convert-hf-to-gguf.py with the generated function
src_ifs = ""
for model in models:
    name = model["name"]
    tokenizer_type = model["tokenizer_type"]

    if tokenizer_type == TOKENIZER_TYPE.SPM:
        continue

    # create the tokenizer
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

    print(f"model: {name}")
    print(f"tokenizer_type: {tokenizer_type}")
    print(f"repo: {model['repo']}")
    print(f"chktok: {chktok}")
    print(f"chkhsh: {chkhsh}")

    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r") as f:
        cfg = json.load(f)
        pre_tokenizer = cfg["pre_tokenizer"]
        print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

    print(f"\n")

    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
    src_ifs += f"            res = \"{name}\"\n"
src_func = ""
src_func += "    def get_vocab_base_pre(self, tokenizer) -> str:\n"
src_func += "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
src_func += "        # is specific for the BPE pre-tokenizer used by the model\n"
src_func += "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
src_func += "        # use in llama.cpp to implement the same pre-tokenizer\n"
src_func += "\n"
src_func += f"        chktxt = {repr(chktxt)}\n"
src_func += "\n"
src_func += "        chktok = tokenizer.encode(chktxt)\n"
src_func += "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
src_func += "\n"
src_func += "        print(f\"chktok: {chktok}\")\n"
src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
src_func += "\n"
src_func += "        res = None\n"
src_func += "\n"
src_func += "        # NOTE: if you get an error here, you need to add the model to the if-elif chain below\n"
src_func += f"{src_ifs}\n"
src_func += "        if res is None:\n"
src_func += "            print(f\"\\n\")\n"
src_func += "            print(f\"**************************************************************************************\")\n"
src_func += "            print(f\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
src_func += "            print(f\"** This means that it was not added yet or you are using an older version.\")\n"
src_func += "            print(f\"** Check convert-hf-to-gguf-update.py and update it accordingly.\")\n"
src_func += "            print(f\"**\")\n"
src_func += "            print(f\"** chkhsh: {chkhsh}\")\n"
src_func += "            print(f\"**************************************************************************************\")\n"
src_func += "            print(f\"\\n\")\n"
src_func += "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
src_func += "\n"
src_func += "        print(f\"tokenizer.ggml.pre: {res}\")\n"
src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
src_func += "\n"
src_func += "        return res\n"

print(src_func)
print("\n")
print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
print("\n")


@@ -11,6 +11,7 @@ import sys
 from abc import ABC, abstractmethod
 from enum import IntEnum
 from pathlib import Path
+from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast

 import numpy as np
@@ -376,16 +377,19 @@ class Model(ABC):

         return tokens, toktypes, tokpre

+    # NOTE: this function is generated by convert-hf-to-gguf-update.py
+    #       do not modify it manually!
+    # ref: https://github.com/ggerganov/llama.cpp/pull/6920
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
         # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
         # use in llama.cpp to implement the same pre-tokenizer

-        chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български what's ''''''```````\"\"\"\"......!!!!!!??????"
+        chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български what\'s \'\'\'\'\'\'```````""""......!!!!!!??????'

         chktok = tokenizer.encode(chktxt)
-        chkhsh = hash(tuple(chktok))
+        chkhsh = sha256(str(chktok).encode()).hexdigest()

         print(f"chktok: {chktok}")
         print(f"chkhsh: {chkhsh}")
@@ -393,21 +397,34 @@ class Model(ABC):

         res = None

         # NOTE: if you get an error here, you need to add the model to the if-elif chain below
-        #       observe the stdout for the chkhsh value and add it to the chain
-        if self.model_arch == gguf.MODEL_ARCH.LLAMA:
-            if chkhsh == -3290901550109860290:
-                # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer.json
-                res = "llama3"
-            if chkhsh == 5332289095291046364:
-                # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat/blob/main/tokenizer.json
-                res = "deepseek-llm"
-            if chkhsh == 4190561703949727616:
-                # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct/blob/main/tokenizer.json
-                res = "deepseek-coder"
+        if chkhsh == "0fc850edd52197e357970116fbf58f6c2567f259cdc1bfc3df081d7e4bc658c1":
+            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+            res = "llama-v3"
+        if chkhsh == "58c3d0e812ae7fa6a20931006d2398274732c105a9a964c148c43cf898c5fb7a":
+            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat
+            res = "deepseek-llm"
+        if chkhsh == "0438d2a948d7fb26c7a662705ac68374f3138ee29e44f133b1f059203500fb4d":
+            # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
+            res = "deepseek-coder"
+        if chkhsh == "406f3f61e1c67d7b0456c5df2fce5cbb30c77dd3671a436b07a6c510303f721e":
+            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
+            res = "bert-bge"

         if res is None:
+            print(f"\n")
+            print(f"**************************************************************************************")
+            print(f"** WARNING: The BPE pre-tokenizer was not recognized!")
+            print(f"** This means that it was not added yet or you are using an older version.")
+            print(f"** Check convert-hf-to-gguf-update.py and update it accordingly.")
+            print(f"**")
+            print(f"** chkhsh: {chkhsh}")
+            print(f"**************************************************************************************")
+            print(f"\n")
             raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

+        print(f"tokenizer.ggml.pre: {res}")
+        print(f"chkhsh: {chkhsh}")
+
         return res

     def _set_vocab_gpt2(self) -> None:


@@ -4330,19 +4330,29 @@ static void llm_load_vocab(
             vocab.special_mask_id = -1;
         }

-        if (tokenizer_pre.empty()) {
-            LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
-            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (tokenizer_pre == "default") {
-            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (tokenizer_pre == "llama3") {
-            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
-        } else if (tokenizer_pre == "deepseek-llm") {
-            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
-        } else if (tokenizer_pre == "deepseek-coder") {
-            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+        // for now, only BPE models have pre-tokenizers
+        if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+            if (tokenizer_pre.empty()) {
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "default") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "llama3" ||
+                    tokenizer_pre == "llama-v3") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+            } else if (
+                    tokenizer_pre == "deepseek-llm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+            } else if (
+                    tokenizer_pre == "deepseek-coder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+            } else {
+                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            }
         } else {
-            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
     }

Binary file not shown.