Use common special vocab handling in various conversion scripts
parent 120ed6453f
commit bb6b64d5e5
5 changed files with 27 additions and 181 deletions
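Every converter touched by this commit replaces its hand-rolled special-token lookup with the same two calls: build a SpecialVocab from the model directory, then let it write whatever it found into the GGUF file through the writer. The class itself lives in convert.py and is not part of this diff; the sketch below is a minimal, hypothetical reconstruction of the interface the new code relies on. Only the constructor signature and add_to_gguf() are taken from the calls in the diff -- the loading logic is an assumption modelled on the config.json fallback the individual scripts used to duplicate.

# Hypothetical sketch of the SpecialVocab helper imported from convert.py;
# the real implementation is not shown in this diff. The loading below mirrors
# the config.json fallback that the per-script code (removed below) duplicated.
import json
from pathlib import Path
from typing import Dict


class SpecialVocab:
    SPECIAL_TOKEN_TYPES = ('bos', 'eos', 'unk', 'sep', 'pad')

    def __init__(self, path: Path) -> None:
        self.special_token_ids: Dict[str, int] = {}
        self._try_load_config(Path(path))

    def _try_load_config(self, path: Path) -> None:
        # Assumption: read <model dir>/config.json and collect bos_token_id,
        # eos_token_id, etc. when they are present and not null.
        config_path = path / 'config.json'
        if not config_path.is_file():
            return
        with open(config_path, encoding = 'utf-8') as f:
            config = json.load(f)
        for typ in self.SPECIAL_TOKEN_TYPES:
            token_id = config.get(f'{typ}_token_id')
            if isinstance(token_id, int):
                self.special_token_ids[typ] = token_id

    def add_to_gguf(self, gguf_writer) -> None:
        # gguf.GGUFWriter exposes add_bos_token_id(), add_eos_token_id(),
        # add_unk_token_id(), add_sep_token_id() and add_pad_token_id(),
        # as seen in the code this commit removes.
        for typ, token_id in self.special_token_ids.items():
            getattr(gguf_writer, f'add_{typ}_token_id')(token_id)

The per-file diffs follow.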
@@ -13,6 +13,8 @@ from typing import Any, List
 from pathlib import Path
 from transformers import AutoTokenizer
 
+from convert import SpecialVocab
+
 def bytes_to_unicode():
     # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
     """
@@ -116,20 +118,13 @@ print("gguf: get tokenizer metadata")
 tokens: List[bytearray] = []
 scores: List[float] = []
 toktypes: List[int] = []
-merges: List[str] = []
-
 
 if Path(dir_model + "/tokenizer.json").is_file():
     # gpt2 tokenizer
     gguf_writer.add_tokenizer_model("gpt2")
 
-    print("gguf: get gpt2 tokenizer merges")
-
     with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
         tokenizer_json = json.load(f)
-    merges = tokenizer_json["model"]["merges"]
-
-    gguf_writer.add_token_merges(merges)
 
     print("gguf: get gpt2 tokenizer vocab")
 
@@ -166,24 +161,8 @@ if Path(dir_model + "/tokenizer.json").is_file():
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 
-print("gguf: get special token ids")
-# Look for special tokens in config.json
-
-if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
-    gguf_writer.add_bos_token_id(hparams["bos_token_id"])
-
-if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
-    gguf_writer.add_eos_token_id(hparams["eos_token_id"])
-
-if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
-    gguf_writer.add_unk_token_id(hparams["unk_token_id"])
-
-if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
-    gguf_writer.add_sep_token_id(hparams["sep_token_id"])
-
-if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
-    gguf_writer.add_pad_token_id(hparams["pad_token_id"])
-
+special_vocab = SpecialVocab(Path(dir_model))
+special_vocab.add_to_gguf(gguf_writer)
 
 # TENSORS
 
@@ -13,6 +13,8 @@ from typing import Any, List
 from pathlib import Path
 from transformers import AutoTokenizer
 
+from convert import SpecialVocab
+
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 
 
@@ -112,20 +114,13 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
 print("gguf: get tokenizer metadata")
 
 tokens: List[bytearray] = []
-merges: List[str] = []
-
 
 if Path(dir_model + "/tokenizer.json").is_file():
     # gpt2 tokenizer
     gguf_writer.add_tokenizer_model("gpt2")
 
-    print("gguf: get gpt2 tokenizer merges")
-
     with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
         tokenizer_json = json.load(f)
-    merges = tokenizer_json["model"]["merges"]
-
-    gguf_writer.add_token_merges(merges)
 
     print("gguf: get gpt2 tokenizer vocab")
 
@@ -158,39 +153,8 @@ if Path(dir_model + "/tokenizer.json").is_file():
 
     gguf_writer.add_token_list(tokens)
 
-    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
-        print("gguf: get special token ids")
-
-        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
-            tokenizer_config = json.load(f)
-
-        # find special token ids
-
-        if "bos_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["bos_token"]:
-                    gguf_writer.add_bos_token_id(key["id"])
-
-        if "eos_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["eos_token"]:
-                    gguf_writer.add_eos_token_id(key["id"])
-
-        if "unk_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["unk_token"]:
-                    gguf_writer.add_unk_token_id(key["id"])
-
-        if "sep_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["sep_token"]:
-                    gguf_writer.add_sep_token_id(key["id"])
-
-        if "pad_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["pad_token"]:
-                    gguf_writer.add_pad_token_id(key["id"])
-
+special_vocab = SpecialVocab(Path(dir_model))
+special_vocab.add_to_gguf(gguf_writer)
 
 # TENSORS
 
@@ -15,6 +15,8 @@ from typing import Any, List, TypeAlias
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor
 
+from convert import SpecialVocab
+
 #NDArray = np.ndarray[Any, Any]
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -180,62 +182,8 @@ if Path(dir_model + "/tokenizer.model").is_file():
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 
-print("gguf: get special token ids")
-
-if Path(dir_model + "/tokenizer.json").is_file():
-    # Look for special tokens in tokenizer.json if it exists
-
-    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-        tokenizer = json.load(f)
-
-    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
-
-        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
-            tokenizer_config = json.load(f)
-
-        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["bos_token"]["content"]:
-                    gguf_writer.add_bos_token_id(key["id"])
-
-        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["eos_token"]["content"]:
-                    gguf_writer.add_eos_token_id(key["id"])
-
-        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["unk_token"]["content"]:
-                    gguf_writer.add_unk_token_id(key["id"])
-
-        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["sep_token"]["content"]:
-                    gguf_writer.add_sep_token_id(key["id"])
-
-        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["pad_token"]["content"]:
-                    gguf_writer.add_pad_token_id(key["id"])
-
-else:
-    # If no tokenizer.json: Look for special tokens in config.json
-
-    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
-        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
-
-    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
-        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
-
-    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
-        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
-
-    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
-        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
-
-    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
-        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
-
+special_vocab = SpecialVocab(Path(dir_model))
+special_vocab.add_to_gguf(gguf_writer)
 
 # TENSORS
 
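For the Hugging Face / SentencePiece scripts above the change is purely mechanical: the duplicated special-token ladders go away and each script makes the same two calls. A toy run of that shared pattern is below; it assumes the hypothetical SpecialVocab sketch near the top of this page, and both 'dummy-model' and RecordingWriter are illustrative stand-ins, not part of the scripts.

# Toy run of the shared two-call pattern (assumes the SpecialVocab sketch above
# is in scope). 'dummy-model' is a placeholder directory -- point it at any
# folder whose config.json defines e.g. {"bos_token_id": 1, "eos_token_id": 2}.
from pathlib import Path


class RecordingWriter:
    """Stand-in for gguf.GGUFWriter that just prints the calls it receives."""
    def __getattr__(self, name):
        def record(value):
            print(f'{name}({value!r})')
        return record


gguf_writer = RecordingWriter()
special_vocab = SpecialVocab(Path('dummy-model'))
special_vocab.add_to_gguf(gguf_writer)
# With the config.json above this prints:
#   add_bos_token_id(1)
#   add_eos_token_id(2)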
@@ -134,13 +134,14 @@ class GGMLV3Model:
         return offset
 
 class GGMLToGGUF:
-    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
+    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
         hp = ggml_model.hyperparameters
         self.model = ggml_model
         self.data = data
         self.cfg = cfg
         self.params_override = params_override
         self.vocab_override = vocab_override
+        self.special_vocab = special_vocab
         if params_override is not None:
             n_kv_head = params_override.n_head_kv
         else:
@@ -162,6 +163,8 @@ class GGMLToGGUF:
         gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
         self.add_params(gguf_writer)
         self.add_vocab(gguf_writer)
+        if self.special_vocab is not None:
+            self.special_vocab.add_to_gguf(gguf_writer)
         self.add_tensors(gguf_writer)
         print(" gguf: write header")
         gguf_writer.write_header_to_file()
@@ -295,8 +298,10 @@ def handle_metadata(cfg, hp):
     else:
         raise ValueError('Unable to load metadata')
     vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+    # FIXME: Respect cfg.vocab_dir?
+    svocab = convert.SpecialVocab(cfg.model_metadata_dir)
     convert.check_vocab_size(params, vocab)
-    return (params, vocab)
+    return (params, vocab, svocab)
 
 def handle_args():
     parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
@@ -323,14 +328,16 @@ def main():
     print(f'* GGML model hyperparameters: {model.hyperparameters}')
     vocab_override = None
     params_override = None
+    special_vocab = None
     if cfg.model_metadata_dir is not None:
-        (params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
+        (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
         print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
         print(f'* Overriding params: {params_override}')
         print(f'* Overriding vocab: {vocab_override}')
+        print(f'* Special vocab: {special_vocab}')
     else:
         print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
-    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
+    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
     converter.save()
     print(f'* Successful completion. Output saved to: {cfg.output}')
 
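The GGMLv3-to-GGUF converter is the one place where the special vocab stays optional: handle_metadata() only runs when --model-metadata-dir is given, so main() threads a possibly-None SpecialVocab through the new keyword argument and save() skips the call when there is nothing to add. A condensed, hypothetical view of that wiring is below; cfg, model and data are assumed to come from the script's argument parsing and GGMLv3 loading code and are not redefined here.

# Condensed wiring, mirroring main() after this change (names from the script,
# not a standalone program).
params_override = None
vocab_override = None
special_vocab = None

if cfg.model_metadata_dir is not None:
    # handle_metadata() now also builds a convert.SpecialVocab and returns it
    # as the third element of the tuple.
    (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)

converter = GGMLToGGUF(
    model, data, cfg,
    params_override = params_override,
    vocab_override = vocab_override,
    special_vocab = special_vocab,  # may stay None; save() checks before using it
)
converter.save()

Keeping special_vocab optional means conversions run without --model-metadata-dir still succeed, just with the warning that special tokens may not be converted correctly.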
@@ -13,6 +13,8 @@ from typing import Any, List, Optional, TypeAlias
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor
 
+from convert import SpecialVocab
+
 #NDArray = np.ndarray[Any, Any]
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -189,62 +191,8 @@ if Path(dir_model + "/tokenizer.model").is_file():
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 
-print("gguf: get special token ids")
-
-if Path(dir_model + "/tokenizer.json").is_file():
-    # Look for special tokens in tokenizer.json if it exists
-
-    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-        tokenizer = json.load(f)
-
-    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
-
-        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
-            tokenizer_config = json.load(f)
-
-        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["bos_token"]["content"]:
-                    gguf_writer.add_bos_token_id(key["id"])
-
-        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["eos_token"]["content"]:
-                    gguf_writer.add_eos_token_id(key["id"])
-
-        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["unk_token"]["content"]:
-                    gguf_writer.add_unk_token_id(key["id"])
-
-        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["sep_token"]["content"]:
-                    gguf_writer.add_sep_token_id(key["id"])
-
-        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["pad_token"]["content"]:
-                    gguf_writer.add_pad_token_id(key["id"])
-
-else:
-    # If no tokenizer.json: Look for special tokens in config.json
-
-    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
-        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
-
-    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
-        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
-
-    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
-        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
-
-    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
-        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
-
-    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
-        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
-
+special_vocab = SpecialVocab(Path(dir_model))
+special_vocab.add_to_gguf(gguf_writer)
 
 # TENSORS
 