From 9b4d63ae53055c03f22649e88811c39f22c33532 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 26 Apr 2024 19:21:55 +0300
Subject: [PATCH] convert : add "tokenizer.ggml.pre" GGUF KV (wip)

---
 convert-hf-to-gguf.py         | 60 ++++++++++++++++++++++++++++++++++----
 convert-llama-ggml-to-gguf.py |  1 +
 convert-persimmon-to-gguf.py  |  1 +
 gguf-py/gguf/constants.py     |  2 ++
 gguf-py/gguf/gguf_reader.py   |  2 +-
 gguf-py/gguf/gguf_writer.py   |  3 ++
 6 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index c7d939a40..1aab4d2fe 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -346,7 +346,7 @@ class Model(ABC):
         raise NotImplementedError(f'Architecture "{arch}" not supported!')
 
     # used for GPT-2 BPE and WordPiece vocabs
-    def get_basic_vocab(self) -> tuple[list[str], list[int]]:
+    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
         toktypes: list[int] = []
 
@@ -355,6 +355,8 @@ class Model(ABC):
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
 
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
 
@@ -372,11 +374,45 @@ class Model(ABC):
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
 
-        return tokens, toktypes
+        return tokens, toktypes, tokpre
 
-    def _set_vocab_gpt2(self, tokenizer_model:str = "gpt2") -> None:
-        tokens, toktypes = self.get_basic_vocab()
-        self.gguf_writer.add_tokenizer_model(tokenizer_model)
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
+        # is specific to the BPE pre-tokenizer used by the model
+        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
+        # use in llama.cpp to implement the same pre-tokenizer
+
+        chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български what's ''''''```````\"\"\"\"......!!!!!!??????"
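+        # the probe text mixes whitespace runs, emoji, CJK, Khmer, Cyrillic, digits and
+        # quote/punctuation runs, so different pre-tokenizers are likely to split it differently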
+
+        chktok = tokenizer.encode(chktxt)
+        chkhsh = hash(tuple(chktok))
+
+        print(f"chktok: {chktok}")
+        print(f"chkhsh: {chkhsh}")
+
+        res = None
+
+        # NOTE: if you get an error here, you need to add the model to the if-elif chain below
+        # observe the stdout for the chkhsh value and add it to the chain
+        if self.model_arch == gguf.MODEL_ARCH.LLAMA:
+            if chkhsh == -3290901550109860290:
+                # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer.json
+                res = "llama3"
+            if chkhsh == 4190561703949727616:
+                # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct/blob/main/tokenizer.json
+                res = "deepseek-coder"
+
+        if res is None:
+            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+        return res
+
+    def _set_vocab_gpt2(self) -> None:
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
@@ -394,6 +430,8 @@ class Model(ABC):
         vocab_size = hparams["vocab_size"]
         assert max(tokenizer.get_vocab().values()) < vocab_size
 
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
         merges = []
         vocab = {}
         mergeable_ranks = tokenizer.mergeable_ranks
@@ -421,6 +459,7 @@ class Model(ABC):
             toktypes.append(gguf.TokenType.NORMAL)
 
         self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
@@ -493,6 +532,7 @@ class Model(ABC):
         assert len(tokens) == vocab_size
 
         self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
@@ -514,6 +554,7 @@ class Model(ABC):
         assert len(tokens) == vocab.vocab_size
 
         self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
@@ -957,6 +998,7 @@ class XverseModel(Model):
             toktypes.append(toktype)
 
         self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
@@ -2174,6 +2216,7 @@ class Phi3MiniModel(Model):
             toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
         self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
@@ -2416,6 +2459,7 @@ class InternLM2Model(Model):
             toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
 
         self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
@@ -2565,7 +2609,7 @@ class BertModel(Model):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        tokens, toktypes = self.get_basic_vocab()
+        tokens, toktypes, tokpre = self.get_vocab_base()
         self.vocab_size = len(tokens)
 
         # we need this to validate the size of the token_type embeddings
@@ -2583,6 +2627,7 @@ class BertModel(Model):
 
         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
@@ -2760,6 +2805,9 @@ class MambaModel(Model):
         field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
         self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
 
+        field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
+        self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]))
+
         field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
         self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
 
diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py
index cd9644fcb..5354b748b 100755
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -281,6 +281,7 @@ class GGMLToGGUF:
     def add_vocab(self, gguf_writer):
         hp = self.model.hyperparameters
         gguf_writer.add_tokenizer_model('llama')
+        gguf_writer.add_tokenizer_pre('default')
         tokens = []
        scores = []
         toktypes = []
diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py
index 69be17f94..aba575426 100755
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -99,6 +99,7 @@ def main():
     tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
 
     gguf_writer.add_tokenizer_model('llama')
+    gguf_writer.add_tokenizer_pre('default')
     gguf_writer.add_token_list(tokens)
     gguf_writer.add_token_scores(scores)
     gguf_writer.add_token_types(toktypes)
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index d2f1de198..6d597bfd9 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -72,6 +72,7 @@ class Keys:
 
     class Tokenizer:
         MODEL            = "tokenizer.ggml.model"
+        PRE              = "tokenizer.ggml.pre"
         LIST             = "tokenizer.ggml.tokens"
         TOKEN_TYPE       = "tokenizer.ggml.token_type"
         TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
@@ -940,6 +941,7 @@ KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
 
 # tokenization
 KEY_TOKENIZER_MODEL      = Keys.Tokenizer.MODEL
+KEY_TOKENIZER_PRE        = Keys.Tokenizer.PRE
 KEY_TOKENIZER_LIST       = Keys.Tokenizer.LIST
 KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
 KEY_TOKENIZER_SCORES     = Keys.Tokenizer.SCORES
diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index dc2d65163..cc2abab34 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -140,7 +140,7 @@ class GGUFReader:
     def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
         if field.name in self.fields:
             # TODO: add option to generate error on duplicate keys
-            #raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
+            # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
 
             print(f'Warning: Duplicate key {field.name} at offset {field.offset}')
             self.fields[field.name + '_{}'.format(field.offset)] = field
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index e3dbca454..30a55819f 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -422,6 +422,9 @@ class GGUFWriter:
     def add_tokenizer_model(self, model: str) -> None:
         self.add_string(Keys.Tokenizer.MODEL, model)
 
+    def add_tokenizer_pre(self, pre: str) -> None:
+        self.add_string(Keys.Tokenizer.PRE, pre)
+
     def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
         self.add_array(Keys.Tokenizer.LIST, tokens)
 
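
Note (not part of the patch): once a converter writes "tokenizer.ggml.pre", llama.cpp
can read it back at load time and apply the same pre-tokenizer, per the comment in
get_vocab_base_pre(). Below is a minimal sketch of reading the new KV with gguf-py;
it mirrors the GGUFReader idiom used in the MambaModel hunk above, assumes the
gguf-py package from this repo is importable, and "model.gguf" is a hypothetical path:

    from gguf import GGUFReader, Keys

    reader = GGUFReader("model.gguf")  # hypothetical file produced by one of the converters
    field = reader.get_field(Keys.Tokenizer.PRE)
    if field is not None:
        # string KVs expose their payload as raw bytes in field.parts; the value is the last part
        print(bytes(field.parts[-1]).decode("utf-8"))  # e.g. "llama3", "deepseek-coder" or "default"
    else:
        # GGUF files written before this change will not carry the key
        print("tokenizer.ggml.pre not set")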