Support BPE tokenizer in convert; fix vocab argument passed to SentencePieceVocab

Signed-off-by: ldwang <ftgreat@gmail.com>
This commit is contained in:
ldwang 2023-07-18 11:18:12 +08:00
parent ee6bc1426e
commit 64b8aafce1

View file

@ -1189,6 +1189,7 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
print(f"vocabtype: {vocabtype}")
# Be extra-friendly and accept either a file or a directory. Also, if it's
# a directory, it might be the model directory, and tokenizer.model might
# be in the parent of that.
@ -1210,7 +1211,7 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
added_tokens_path = path.parent / "added_tokens.json"
print(f"Loading vocab file {path}")
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
vocab_file)
vocabtype)
def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path: