From 9b464b4e81dce67371de0b33537c7af3b9960c45 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 16 Jan 2024 13:38:54 +0200
Subject: [PATCH 1/4] py : fix missing added_tokens_dict for SPM vocab

---
 convert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/convert.py b/convert.py
index 3b613eefc..316028592 100755
--- a/convert.py
+++ b/convert.py
@@ -466,6 +466,7 @@ class SentencePieceVocab:  # LlaMa
         )

         # Token pieces that were added to the base vocabulary.
+        self.added_tokens_dict = added_tokens
         self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
         self.vocab_size_base = vocab_size
         self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)

From a1372737e0893490864e202bb8e48d14ab97fd2b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 16 Jan 2024 14:03:57 +0200
Subject: [PATCH 2/4] py : pad with unknown tokens when data is missing

ggml-ci
---
 convert.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/convert.py b/convert.py
index 316028592..dfd4c4621 100755
--- a/convert.py
+++ b/convert.py
@@ -1098,6 +1098,15 @@ class OutputFile:
             scores.append(score)
             toktypes.append(toktype)

+        # pad with unknown tokens and print warnings
+        # ref: https://github.com/ggerganov/llama.cpp/issues/4958
+        if len(tokens) < vocab.vocab_size:
+            for i in range(len(tokens), vocab.vocab_size):
+                tokens.append(f"".encode("utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(gguf.TokenType.UNKNOWN)
+                print(f"Warning: token {i} not found in vocab - padding with {tokens[-1]}")
+
         return tokens, scores, toktypes

     def add_meta_vocab(self, vocab: Vocab) -> None:

From d92351e23deef2bb9cf944a41a9a9e363d2e6de0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 16 Jan 2024 14:47:07 +0200
Subject: [PATCH 3/4] py : fix BPE vocab conversion

ggml-ci
---
 convert.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/convert.py b/convert.py
index dfd4c4621..49cf5c379 100755
--- a/convert.py
+++ b/convert.py
@@ -387,6 +387,7 @@ class BpeVocab:  # GPT
         self.bpe_tokenizer = json.loads(
             open(str(fname_tokenizer), encoding="utf-8").read()
         )
+        self.vocab = self.bpe_tokenizer["model"]["vocab"]
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
@@ -405,7 +406,7 @@ class BpeVocab:  # GPT
                 if item["content"] not in self.bpe_tokenizer
             )

-        vocab_size: int = len(self.bpe_tokenizer)
+        vocab_size: int = len(self.vocab)
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
@@ -415,6 +416,7 @@ class BpeVocab:  # GPT
             )

         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_dict = added_tokens
         self.added_tokens_list = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size
         self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
@@ -422,10 +424,9 @@ class BpeVocab:  # GPT
         self.fname_added_tokens = fname_added_tokens

     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.bpe_tokenizer
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}

-        for i, _ in enumerate(tokenizer):
+        for i, _ in enumerate(self.vocab):
             yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL

     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -1383,15 +1384,14 @@ class VocabFactory:
                 self.files[file] = file_path
             elif parent_file_path.exists():
                 self.files[file] = parent_file_path
+        print(f"Found vocab files: {self.files}")

     def _select_file(self, vocabtype: Optional[str]) -> Path:
         if vocabtype in ["spm", "bpe"]:
-            # For SentencePiece and BPE, return specific files as before
-            file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json"
-            if self.files[file_key]:
-                return self.files[file_key]
-            else:
-                raise FileNotFoundError(f"{vocabtype} {file_key} not found.")
+            for file_key in self.files.keys():
+                if self.files[file_key]:
+                    return self.files[file_key]
+            raise FileNotFoundError(f"{vocabtype} vocab not found.")
         elif vocabtype == "hfft":
             # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
             return self.path

From 23742deb5beb0e320bfb171545f595b8e60bccd5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 17 Jan 2024 15:44:22 +0200
Subject: [PATCH 4/4] py : fix padded dummy tokens (I hope)

---
 convert.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/convert.py b/convert.py
index 49cf5c379..b47bb6185 100755
--- a/convert.py
+++ b/convert.py
@@ -1008,6 +1008,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
         )
         for i in range(1, pad_count + 1):
             vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
+            vocab.added_tokens_list.append(f"<dummy{i:05}>")
         vocab.vocab_size = params.n_vocab
         return

@@ -1099,14 +1100,7 @@ class OutputFile:
             scores.append(score)
             toktypes.append(toktype)

-        # pad with unknown tokens and print warnings
-        # ref: https://github.com/ggerganov/llama.cpp/issues/4958
-        if len(tokens) < vocab.vocab_size:
-            for i in range(len(tokens), vocab.vocab_size):
-                tokens.append(f"".encode("utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(gguf.TokenType.UNKNOWN)
-                print(f"Warning: token {i} not found in vocab - padding with {tokens[-1]}")
+        assert(len(tokens) == vocab.vocab_size)

         return tokens, scores, toktypes
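
Note (editorial addition, not part of the patch series): the sketch below is a minimal, self-contained illustration of the behavior patches 2 and 4 converge on. Vocab padding happens once in check_vocab_size(), recorded in both added_tokens_dict and added_tokens_list, so extract_vocabulary_from_model() can simply assert that the token count matches vocab_size instead of padding on the fly. The Params and Vocab dataclasses here are simplified stand-ins for the real classes in convert.py, and the dummy-token naming is an assumption.

    from dataclasses import dataclass, field


    @dataclass
    class Params:
        n_vocab: int


    @dataclass
    class Vocab:
        vocab_size: int
        added_tokens_dict: dict[str, int] = field(default_factory=dict)
        added_tokens_list: list[str] = field(default_factory=list)


    def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
        # Nothing to do when the model header and the tokenizer already agree.
        if params.n_vocab == vocab.vocab_size:
            return
        if pad_vocab and params.n_vocab > vocab.vocab_size:
            pad_count = params.n_vocab - vocab.vocab_size
            for i in range(1, pad_count + 1):
                # Keep dict and list in sync so later per-token iteration
                # covers exactly vocab_size entries.
                name = f"<dummy{i:05}>"
                vocab.added_tokens_dict[name] = -1
                vocab.added_tokens_list.append(name)
            vocab.vocab_size = params.n_vocab
            return
        raise ValueError("vocab size mismatch; rerun with pad_vocab=True to pad")


    if __name__ == "__main__":
        params, vocab = Params(n_vocab=32), Vocab(vocab_size=30)
        check_vocab_size(params, vocab, pad_vocab=True)
        # Downstream code can now assert instead of padding tokens ad hoc (patch 4).
        assert vocab.vocab_size == params.n_vocab
        print(vocab.added_tokens_list)  # ['<dummy00001>', '<dummy00002>']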