gguf-py: Try to fix SpecialVocab giving up too easily for the Nth time
parent eef5ae3898
commit a169862c51

2 changed files with 16 additions and 11 deletions
gguf-py/gguf/vocab.py

@@ -117,17 +117,18 @@ class SpecialVocab:
 
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer_file = path / 'tokenizer.json'
-        if not tokenizer_file.is_file():
-            return False
-        with open(tokenizer_file, encoding = 'utf-8') as f:
-            tokenizer = json.load(f)
-        if self.load_merges:
-            merges = tokenizer.get('model', {}).get('merges')
-            if isinstance(merges, list) and merges and isinstance(merges[0], str):
-                self.merges = merges
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, encoding = 'utf-8') as f:
+                tokenizer = json.load(f)
+            if self.load_merges:
+                merges = tokenizer.get('model', {}).get('merges')
+                if isinstance(merges, list) and merges and isinstance(merges[0], str):
+                    self.merges = merges
+            added_tokens = tokenizer.get('added_tokens', {})
+        else:
+            added_tokens = {}
         tokenizer_config_file = path / 'tokenizer_config.json'
-        added_tokens = tokenizer.get('added_tokens')
-        if added_tokens is None or not tokenizer_config_file.is_file():
+        if not tokenizer_config_file.is_file():
             return True
         with open(tokenizer_config_file, encoding = 'utf-8') as f:
             tokenizer_config = json.load(f)
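The first hunk removes the early `return False` when tokenizer.json is absent: the method now falls through with an empty added_tokens so that tokenizer_config.json, which carries the add_*_token flags, is still consulted. A hedged repro sketch of the recovered behavior, assuming the gguf package exports SpecialVocab with this constructor as gguf-py 0.5.x does:

# Minimal sketch: the directory deliberately has tokenizer_config.json but
# no tokenizer.json, which previously made _try_load_from_tokenizer_json()
# return False before the add_*_token flags could be read.
import json
import tempfile
from pathlib import Path

from gguf import SpecialVocab  # assumes gguf-py 0.5.x is installed

with tempfile.TemporaryDirectory() as tmp:
    model_dir = Path(tmp)
    (model_dir / 'tokenizer_config.json').write_text(
        json.dumps({'add_bos_token': True, 'add_eos_token': False}),
        encoding = 'utf-8',
    )
    sv = SpecialVocab(model_dir, load_merges = False)
    print(sv.add_special_token)  # expected after this fix: {'bos': True, 'eos': False}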
@@ -135,6 +136,10 @@ class SpecialVocab:
             add_entry = tokenizer_config.get(f'add_{typ}_token')
             if isinstance(add_entry, bool):
                 self.add_special_token[typ] = add_entry
+            if not added_tokens:
+                # We will need this to get the content for the token, so if it's empty
+                # may as well just give up.
+                continue
             entry = tokenizer_config.get(f'{typ}_token')
             if isinstance(entry, str):
                 tc_content = entry
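The second hunk makes the per-token-type loop `continue` instead of bailing out when added_tokens is empty: the `{typ}_token` text from tokenizer_config.json can only be resolved to a numeric id by matching it against tokenizer.json's added_tokens entries, but the boolean add_{typ}_token flags above it need no such lookup and have already been recorded. A schematic of that id lookup, with abbreviated JSON shapes (the real method also accepts dict-valued entries carrying a 'content' field):

# Schematic of why added_tokens is required to resolve token ids.
added_tokens = [
    {'id': 1, 'content': '<s>'},
    {'id': 2, 'content': '</s>'},
]
tokenizer_config = {'bos_token': '<s>', 'eos_token': '</s>'}

for typ in ('bos', 'eos'):
    entry = tokenizer_config.get(f'{typ}_token')
    if not isinstance(entry, str):
        continue
    # With an empty added_tokens list there is nothing to match against,
    # which is why the patch skips the lookup instead of giving up entirely.
    token_id = next(
        (tok.get('id') for tok in added_tokens if tok.get('content') == entry),
        None,
    )
    print(typ, '->', token_id)  # bos -> 1, eos -> 2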
gguf-py/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.5.2"
+version = "0.5.3"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
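The second file bumps the package version from 0.5.2 to 0.5.3 so the fix can ship. One standard-library way to confirm which gguf-py is installed after upgrading (assumes the package was installed via pip or poetry under the name 'gguf', per the pyproject above):

from importlib.metadata import version

print(version('gguf'))  # expected after upgrading: 0.5.3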