Ignore unusable JSON values

This commit is contained in:
Igor Pissolati 2023-06-20 19:20:53 -03:00
parent ca1fc20508
commit 41a2ed03e7

View file

@ -283,10 +283,12 @@ class SentencePieceVocab:
else: else:
tokenizer_config = {} tokenizer_config = {}
for key, value in tokenizer_config.items(): for key, value in tokenizer_config.items():
assert isinstance(value, dict) or isinstance(value, str) if not isinstance(value, dict) or not isinstance(value, str):
if key not in TOKEN_NAME_TO_ID or TOKEN_NAME_TO_ID[key] == -1:
continue continue
self.special_tokens_map[TOKEN_NAME_TO_ID[key]] = value["content"] if isinstance(value, dict) else value token_id = TOKEN_NAME_TO_ID.get(key, -1)
if token_id == -1:
continue
self.special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value
special_tokens: Dict[str, Any] special_tokens: Dict[str, Any]
if fname_special_tokens is not None: if fname_special_tokens is not None:
@ -294,10 +296,9 @@ class SentencePieceVocab:
else: else:
special_tokens = {} special_tokens = {}
for key, value in special_tokens.items(): for key, value in special_tokens.items():
assert isinstance(value, dict) or isinstance(value, str) if not isinstance(value, dict) or not isinstance(value, str):
if key not in TOKEN_NAME_TO_ID:
continue continue
token_id = TOKEN_NAME_TO_ID[key] token_id = TOKEN_NAME_TO_ID.get(key, -1)
if token_id == -1 or token_id in self.special_tokens_map: if token_id == -1 or token_id in self.special_tokens_map:
continue continue
self.special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value self.special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value