From bb6b64d5e54288e5cba965a842b44d4f8c5427fb Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Sun, 27 Aug 2023 19:59:01 -0600
Subject: [PATCH] Use common special vocab handling in various conversion scripts

---
 convert-falcon-hf-to-gguf.py    | 29 +++-------------
 convert-gptneox-hf-to-gguf.py   | 44 +++---------------------
 convert-llama-7b-pth-to-gguf.py | 60 +++------------------------------
 convert-llama-ggmlv3-to-gguf.py | 15 ++++++---
 convert-llama-hf-to-gguf.py     | 60 +++------------------------------
 5 files changed, 27 insertions(+), 181 deletions(-)

diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py
index d6a01ec01..de251a0fa 100755
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -13,6 +13,8 @@ from typing import Any, List
 from pathlib import Path
 from transformers import AutoTokenizer
 
+from convert import SpecialVocab
+
 def bytes_to_unicode():
     # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
     """
@@ -116,20 +118,13 @@ print("gguf: get tokenizer metadata")
 tokens: List[bytearray] = []
 scores: List[float] = []
 toktypes: List[int] = []
-merges: List[str] = []
-
 
 if Path(dir_model + "/tokenizer.json").is_file():
     # gpt2 tokenizer
     gguf_writer.add_tokenizer_model("gpt2")
 
-    print("gguf: get gpt2 tokenizer merges")
-
     with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
         tokenizer_json = json.load(f)
-    merges = tokenizer_json["model"]["merges"]
-
-    gguf_writer.add_token_merges(merges)
 
     print("gguf: get gpt2 tokenizer vocab")
 
@@ -166,24 +161,8 @@ if Path(dir_model + "/tokenizer.json").is_file():
     gguf_writer.add_token_scores(scores)
     gguf_writer.add_token_types(toktypes)
 
-print("gguf: get special token ids")
-# Look for special tokens in config.json
-
-if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
-    gguf_writer.add_bos_token_id(hparams["bos_token_id"])
-
-if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
-    gguf_writer.add_eos_token_id(hparams["eos_token_id"])
-
-if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
-    gguf_writer.add_unk_token_id(hparams["unk_token_id"])
-
-if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
-    gguf_writer.add_sep_token_id(hparams["sep_token_id"])
-
-if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
-    gguf_writer.add_pad_token_id(hparams["pad_token_id"])
-
+special_vocab = SpecialVocab(Path(dir_model))
+special_vocab.add_to_gguf(gguf_writer)
 
 # TENSORS
 
diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py
index 88ac7ff7b..a7695655d 100755
--- a/convert-gptneox-hf-to-gguf.py
+++ b/convert-gptneox-hf-to-gguf.py
@@ -13,6 +13,8 @@ from typing import Any, List
 from pathlib import Path
 from transformers import AutoTokenizer
 
+from convert import SpecialVocab
+
 
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 
@@ -112,20 +114,13 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
 print("gguf: get tokenizer metadata")
 
 tokens: List[bytearray] = []
-merges: List[str] = []
-
 
 if Path(dir_model + "/tokenizer.json").is_file():
     # gpt2 tokenizer
     gguf_writer.add_tokenizer_model("gpt2")
 
-    print("gguf: get gpt2 tokenizer merges")
-
     with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
         tokenizer_json = json.load(f)
-    merges = tokenizer_json["model"]["merges"]
-
-    gguf_writer.add_token_merges(merges)
 
     print("gguf: get gpt2 tokenizer vocab")
 
@@ -158,39 +153,8 @@ if Path(dir_model + "/tokenizer.json").is_file():
 
     gguf_writer.add_token_list(tokens)
 
-    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
-        print("gguf: get special token ids")
-
-        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
-            tokenizer_config = json.load(f)
-
-        # find special token ids
-
-        if "bos_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["bos_token"]:
-                    gguf_writer.add_bos_token_id(key["id"])
-
-        if "eos_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["eos_token"]:
-                    gguf_writer.add_eos_token_id(key["id"])
-
-        if "unk_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["unk_token"]:
-                    gguf_writer.add_unk_token_id(key["id"])
-
-        if "sep_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["sep_token"]:
-                    gguf_writer.add_sep_token_id(key["id"])
-
-        if "pad_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["pad_token"]:
-                    gguf_writer.add_pad_token_id(key["id"])
-
+special_vocab = SpecialVocab(Path(dir_model))
+special_vocab.add_to_gguf(gguf_writer)
 
 # TENSORS
 
diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py
index 037b126ed..3c5506fe6 100755
--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@@ -15,6 +15,8 @@ from typing import Any, List, TypeAlias
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor
 
+from convert import SpecialVocab
+
 #NDArray = np.ndarray[Any, Any]
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -180,62 +182,8 @@ if Path(dir_model + "/tokenizer.model").is_file():
     gguf_writer.add_token_scores(scores)
     gguf_writer.add_token_types(toktypes)
 
-
-print("gguf: get special token ids")
-
-if Path(dir_model + "/tokenizer.json").is_file():
-    # Look for special tokens in tokenizer.json if it exists
-
-    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-        tokenizer = json.load(f)
-
-    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
-
-        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
-            tokenizer_config = json.load(f)
-
-        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["bos_token"]["content"]:
-                    gguf_writer.add_bos_token_id(key["id"])
-
-        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["eos_token"]["content"]:
-                    gguf_writer.add_eos_token_id(key["id"])
-
-        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["unk_token"]["content"]:
-                    gguf_writer.add_unk_token_id(key["id"])
-
-        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["sep_token"]["content"]:
-                    gguf_writer.add_sep_token_id(key["id"])
-
-        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["pad_token"]["content"]:
-                    gguf_writer.add_pad_token_id(key["id"])
-else:
-    # If no tokenizer.json: Look for special tokens in config.json
-
-    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
-        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
-
-    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
-        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
-
-    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
-        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
-
-    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
-        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
-
-    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
-        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
-
+special_vocab = SpecialVocab(Path(dir_model))
+special_vocab.add_to_gguf(gguf_writer)
 
 # TENSORS
 
diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py
index e03cdda70..78ba683ad 100755
--- a/convert-llama-ggmlv3-to-gguf.py
+++ b/convert-llama-ggmlv3-to-gguf.py
@@ -134,13 +134,14 @@ class GGMLV3Model:
         return offset
 
 class GGMLToGGUF:
-    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
+    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
         hp = ggml_model.hyperparameters
         self.model = ggml_model
         self.data = data
         self.cfg = cfg
         self.params_override = params_override
         self.vocab_override = vocab_override
+        self.special_vocab = special_vocab
         if params_override is not None:
             n_kv_head = params_override.n_head_kv
         else:
@@ -162,6 +163,8 @@ class GGMLToGGUF:
         gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
         self.add_params(gguf_writer)
         self.add_vocab(gguf_writer)
+        if self.special_vocab is not None:
+            self.special_vocab.add_to_gguf(gguf_writer)
         self.add_tensors(gguf_writer)
         print("    gguf: write header")
         gguf_writer.write_header_to_file()
@@ -295,8 +298,10 @@ def handle_metadata(cfg, hp):
     else:
         raise ValueError('Unable to load metadata')
     vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+    # FIXME: Respect cfg.vocab_dir?
+    svocab = convert.SpecialVocab(cfg.model_metadata_dir)
     convert.check_vocab_size(params, vocab)
-    return (params, vocab)
+    return (params, vocab, svocab)
 
 def handle_args():
     parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
@@ -323,14 +328,16 @@ def main():
     print(f'* GGML model hyperparameters: {model.hyperparameters}')
     vocab_override = None
     params_override = None
+    special_vocab = None
     if cfg.model_metadata_dir is not None:
-        (params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
+        (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
         print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
         print(f'* Overriding params: {params_override}')
         print(f'* Overriding vocab: {vocab_override}')
+        print(f'* Special vocab: {special_vocab}')
     else:
         print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
-    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
+    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
     converter.save()
     print(f'* Successful completion. Output saved to: {cfg.output}')
 
diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py
index e8da448ca..5b70bb77d 100755
--- a/convert-llama-hf-to-gguf.py
+++ b/convert-llama-hf-to-gguf.py
@@ -13,6 +13,8 @@ from typing import Any, List, Optional, TypeAlias
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor
 
+from convert import SpecialVocab
+
 #NDArray = np.ndarray[Any, Any]
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -189,62 +191,8 @@ if Path(dir_model + "/tokenizer.model").is_file():
     gguf_writer.add_token_scores(scores)
     gguf_writer.add_token_types(toktypes)
 
-
-print("gguf: get special token ids")
-
-if Path(dir_model + "/tokenizer.json").is_file():
-    # Look for special tokens in tokenizer.json if it exists
-
-    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-        tokenizer = json.load(f)
-
-    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
-
-        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
-            tokenizer_config = json.load(f)
-
-        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["bos_token"]["content"]:
-                    gguf_writer.add_bos_token_id(key["id"])
-
-        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["eos_token"]["content"]:
-                    gguf_writer.add_eos_token_id(key["id"])
-
-        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["unk_token"]["content"]:
-                    gguf_writer.add_unk_token_id(key["id"])
-
-        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["sep_token"]["content"]:
-                    gguf_writer.add_sep_token_id(key["id"])
-
-        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["pad_token"]["content"]:
-                    gguf_writer.add_pad_token_id(key["id"])
-else:
-    # If no tokenizer.json: Look for special tokens in config.json
-
-    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
-        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
-
-    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
-        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
-
-    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
-        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
-
-    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
-        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
-
-    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
-        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
-
+special_vocab = SpecialVocab(Path(dir_model))
+special_vocab.add_to_gguf(gguf_writer)
 
 # TENSORS
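
Note: the converted scripts above only do two things with the shared helper: construct a SpecialVocab from the model directory and call add_to_gguf(gguf_writer). The implementation itself lives in convert.py and is not shown in this patch. The following is a rough, illustrative sketch of the behaviour those two calls are assumed to cover; the class name SpecialVocabSketch, the _read_json helper and the exact lookup order are assumptions, not the real convert.SpecialVocab.

# Illustrative sketch only -- not the actual convert.SpecialVocab implementation.
# It mirrors the logic the patch removes from the individual scripts: prefer
# tokenizer_config.json plus the "added_tokens" list in tokenizer.json, fall
# back to "<name>_token_id" entries in config.json, then emit the ids through
# the gguf_writer.add_*_token_id() calls used above.
import json
from pathlib import Path
from typing import Dict, Optional


class SpecialVocabSketch:
    SPECIAL_TOKEN_TYPES = ('bos', 'eos', 'unk', 'sep', 'pad')

    def __init__(self, model_dir):
        self.special_token_ids: Dict[str, int] = {}
        self._load(Path(model_dir))

    def _load(self, model_dir: Path) -> None:
        # Map added-token content -> id from tokenizer.json, if present.
        tokenizer_json = self._read_json(model_dir / 'tokenizer.json') or {}
        added_tokens = {
            entry['content']: entry['id']
            for entry in tokenizer_json.get('added_tokens', [])
        }
        # Preferred source: special token contents from tokenizer_config.json.
        tokenizer_config = self._read_json(model_dir / 'tokenizer_config.json') or {}
        for typ in self.SPECIAL_TOKEN_TYPES:
            entry = tokenizer_config.get(f'{typ}_token')
            content = entry.get('content') if isinstance(entry, dict) else entry
            if isinstance(content, str) and content in added_tokens:
                self.special_token_ids[typ] = added_tokens[content]
        # Fallback: plain "<typ>_token_id" integers in config.json.
        config = self._read_json(model_dir / 'config.json') or {}
        for typ in self.SPECIAL_TOKEN_TYPES:
            tok_id = config.get(f'{typ}_token_id')
            if typ not in self.special_token_ids and isinstance(tok_id, int):
                self.special_token_ids[typ] = tok_id

    @staticmethod
    def _read_json(path: Path) -> Optional[dict]:
        if not path.is_file():
            return None
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def add_to_gguf(self, gguf_writer) -> None:
        # gguf_writer is expected to expose add_bos_token_id(), add_eos_token_id(),
        # etc., matching the writer calls the scripts previously made directly.
        for typ, token_id in self.special_token_ids.items():
            getattr(gguf_writer, f'add_{typ}_token_id')(token_id)

Centralizing the lookup in one helper is the point of the patch: the tokenizer_config.json / config.json special-token handling only has to be maintained in convert.py instead of being duplicated (inconsistently) in every conversion script.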