gguf-py: Try to fix SpecialVocab giving up too easily for the Nth time

This commit is contained in:
KerfuffleV2 2023-11-13 18:38:17 -07:00
parent eef5ae3898
commit a169862c51
2 changed files with 16 additions and 11 deletions

View file

@ -117,17 +117,18 @@ class SpecialVocab:
def _try_load_from_tokenizer_json(self, path: Path) -> bool: def _try_load_from_tokenizer_json(self, path: Path) -> bool:
tokenizer_file = path / 'tokenizer.json' tokenizer_file = path / 'tokenizer.json'
if not tokenizer_file.is_file(): if tokenizer_file.is_file():
return False with open(tokenizer_file, encoding = 'utf-8') as f:
with open(tokenizer_file, encoding = 'utf-8') as f: tokenizer = json.load(f)
tokenizer = json.load(f) if self.load_merges:
if self.load_merges: merges = tokenizer.get('model', {}).get('merges')
merges = tokenizer.get('model', {}).get('merges') if isinstance(merges, list) and merges and isinstance(merges[0], str):
if isinstance(merges, list) and merges and isinstance(merges[0], str): self.merges = merges
self.merges = merges added_tokens = tokenizer.get('added_tokens', {})
else:
added_tokens = {}
tokenizer_config_file = path / 'tokenizer_config.json' tokenizer_config_file = path / 'tokenizer_config.json'
added_tokens = tokenizer.get('added_tokens') if not tokenizer_config_file.is_file():
if added_tokens is None or not tokenizer_config_file.is_file():
return True return True
with open(tokenizer_config_file, encoding = 'utf-8') as f: with open(tokenizer_config_file, encoding = 'utf-8') as f:
tokenizer_config = json.load(f) tokenizer_config = json.load(f)
@ -135,6 +136,10 @@ class SpecialVocab:
add_entry = tokenizer_config.get(f'add_{typ}_token') add_entry = tokenizer_config.get(f'add_{typ}_token')
if isinstance(add_entry, bool): if isinstance(add_entry, bool):
self.add_special_token[typ] = add_entry self.add_special_token[typ] = add_entry
if not added_tokens:
# We will need this to get the content for the token, so if it's empty
# may as well just give up.
continue
entry = tokenizer_config.get(f'{typ}_token') entry = tokenizer_config.get(f'{typ}_token')
if isinstance(entry, str): if isinstance(entry, str):
tc_content = entry tc_content = entry

View file

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "gguf" name = "gguf"
version = "0.5.2" version = "0.5.3"
description = "Read and write ML models in GGUF for GGML" description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"] authors = ["GGML <ggml@ggml.ai>"]
packages = [ packages = [