From a04bdfb4fa65255771f719857cc63308968d54ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 22 Mar 2024 22:10:16 +0100 Subject: [PATCH] Fallback to tokenizer.json if vocab.json does not exist --- convert.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/convert.py b/convert.py index 817cb6612..15095cc47 100755 --- a/convert.py +++ b/convert.py @@ -1293,7 +1293,7 @@ def load_some_model(path: Path) -> ModelPlus: class VocabFactory: - _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"} + _FILES = {"spm": ["tokenizer.model"], "bpe": ["vocab.json", "tokenizer.json"], "hfft": ["tokenizer.json"]} def __init__(self, path: Path): self.path = path @@ -1301,11 +1301,12 @@ class VocabFactory: print(f"Found vocab files: {self.file_paths}") def _detect_files(self) -> dict[str, Path | None]: - def locate(file: str) -> Path | None: - if (path := self.path / file).exists(): - return path - if (path := self.path.parent / file).exists(): - return path + def locate(files: list[str]) -> Path | None: + for file in files: + if (path := self.path / file).exists(): + return path + if (path := self.path.parent / file).exists(): + return path return None return {vt: locate(f) for vt, f in self._FILES.items()}