gguf : clean up SpecialVocab
This commit is contained in:
parent
7fa5cbf8cc
commit
84f7cea2f9
1 changed file with 6 additions and 6 deletions
|
@@ -842,7 +842,7 @@ class SpecialVocab:
|
||||||
tokenizer_file = path / 'tokenizer.json'
|
tokenizer_file = path / 'tokenizer.json'
|
||||||
if not tokenizer_file.is_file():
|
if not tokenizer_file.is_file():
|
||||||
return False
|
return False
|
||||||
with open(tokenizer_file, 'r', encoding = 'utf-8') as f:
|
with open(tokenizer_file, encoding = 'utf-8') as f:
|
||||||
tokenizer = json.load(f)
|
tokenizer = json.load(f)
|
||||||
if self.load_merges:
|
if self.load_merges:
|
||||||
merges = tokenizer.get('model', {}).get('merges')
|
merges = tokenizer.get('model', {}).get('merges')
|
||||||
|
@@ -852,7 +852,7 @@ class SpecialVocab:
|
||||||
added_tokens = tokenizer.get('added_tokens')
|
added_tokens = tokenizer.get('added_tokens')
|
||||||
if added_tokens is None or not tokenizer_config_file.is_file():
|
if added_tokens is None or not tokenizer_config_file.is_file():
|
||||||
return True
|
return True
|
||||||
with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f:
|
with open(tokenizer_config_file, encoding = 'utf-8') as f:
|
||||||
tokenizer_config = json.load(f)
|
tokenizer_config = json.load(f)
|
||||||
for typ in self.special_token_types:
|
for typ in self.special_token_types:
|
||||||
entry = tokenizer_config.get(f'{typ}_token')
|
entry = tokenizer_config.get(f'{typ}_token')
|
||||||
|
@@ -875,7 +875,7 @@ class SpecialVocab:
|
||||||
config_file = path / 'config.json'
|
config_file = path / 'config.json'
|
||||||
if not config_file.is_file():
|
if not config_file.is_file():
|
||||||
return False
|
return False
|
||||||
with open(config_file, 'r', encoding = 'utf-8') as f:
|
with open(config_file, encoding = 'utf-8') as f:
|
||||||
config = json.load(f)
|
config = json.load(f)
|
||||||
for typ in self.special_token_types:
|
for typ in self.special_token_types:
|
||||||
maybe_token_id = config.get(f'{typ}_token_id')
|
maybe_token_id = config.get(f'{typ}_token_id')
|
||||||
|
@@ -883,7 +883,7 @@ class SpecialVocab:
|
||||||
self.special_token_ids[typ] = maybe_token_id
|
self.special_token_ids[typ] = maybe_token_id
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def add_to_gguf(self, gw: GGUFWriter):
|
def add_to_gguf(self, gw: GGUFWriter) -> None:
|
||||||
if len(self.merges) > 0:
|
if len(self.merges) > 0:
|
||||||
print(f'gguf: Adding {len(self.merges)} merge(s).')
|
print(f'gguf: Adding {len(self.merges)} merge(s).')
|
||||||
gw.add_token_merges(self.merges)
|
gw.add_token_merges(self.merges)
|
||||||
|
@@ -895,8 +895,8 @@ class SpecialVocab:
|
||||||
print(f'gguf: Setting special token type {typ} to {tokid}')
|
print(f'gguf: Setting special token type {typ} to {tokid}')
|
||||||
handler(tokid)
|
handler(tokid)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids if self.special_token_ids else "unset"}>'
|
return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids or "unset"}>'
|
||||||
|
|
||||||
|
|
||||||
# Example usage:
|
# Example usage:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue