Improve UNK, BOS, EOS token handling when converting without metadata.
Allow importing as a module. Remove some obsolete code and minor cleanups.
This commit is contained in:
parent
3b6cfe7c92
commit
14ed02e8d4
1 changed files with 22 additions and 11 deletions
|
@ -215,15 +215,10 @@ class GGMLToGGUF:
|
|||
if self.vocab_override is not None:
|
||||
vo = self.vocab_override
|
||||
print('* Adding vocab item(s)')
|
||||
for (idx, vitem) in enumerate(vo.all_tokens()):
|
||||
if len(vitem) == 3:
|
||||
tokens.append(vitem[0])
|
||||
scores.append(vitem[1])
|
||||
toktypes.append(vitem[2])
|
||||
else:
|
||||
# Maybe try to guess the token type here?
|
||||
tokens.append(vitem[0])
|
||||
scores.append(vitem[1])
|
||||
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
|
||||
tokens.append(vbytes)
|
||||
scores.append(score)
|
||||
toktypes.append(ttype)
|
||||
assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
|
@ -231,9 +226,21 @@ class GGMLToGGUF:
|
|||
gguf_writer.add_token_types(toktypes)
|
||||
return
|
||||
print(f'* Adding {hp.n_vocab} vocab item(s)')
|
||||
assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
|
||||
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
|
||||
tt = 1 # Normal
|
||||
if len(vbytes) == 0:
|
||||
# Special handling for UNK, BOS, EOS tokens.
|
||||
if tokid <= 2:
|
||||
if tokid == 0:
|
||||
vbytes = b'<unk>'
|
||||
tt = 2
|
||||
elif tokid == 1:
|
||||
vbytes = b'<s>'
|
||||
tt = 3
|
||||
else:
|
||||
vbytes = b'</s>'
|
||||
tt = 3
|
||||
elif len(vbytes) == 0:
|
||||
tt = 3 # Control
|
||||
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
|
||||
vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
|
||||
|
@ -246,6 +253,9 @@ class GGMLToGGUF:
|
|||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
gguf_writer.add_unk_token_id(0)
|
||||
gguf_writer.add_bos_token_id(1)
|
||||
gguf_writer.add_eos_token_id(2)
|
||||
|
||||
def add_tensors(self, gguf_writer):
|
||||
nm = self.name_map
|
||||
|
@ -330,4 +340,5 @@ def main():
|
|||
converter.save()
|
||||
print(f'* Successful completion. Output saved to: {cfg.output}')
|
||||
|
||||
main()
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue