More vocab conversion fixes

This commit is contained in:
KerfuffleV2 2023-08-20 11:23:13 -06:00
parent 08959c88c2
commit 8083e20d19

View file

@@ -181,15 +181,15 @@ class GGMLToGGUF:
print(f'* Adding {hp.n_vocab} vocab item(s)') print(f'* Adding {hp.n_vocab} vocab item(s)')
toktypes = [] toktypes = []
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items): for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
tt = 1 tt = 1 # Normal
if len(vbytes) > 0 and vbytes[0] == 32: if len(vbytes) == 0:
vbytes = vbytes.replace(b' ', b'\xe2\x96\x81') tt = 3 # Control
elif len(vbytes) == 0:
tt = 3
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1: elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
hv = hex(vbytes[0])[2:].upper() hv = hex(vbytes[0])[2:].upper()
vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8') vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8')
tt = 6 tt = 6 # Byte
else:
vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
toktypes.append(tt) toktypes.append(tt)
tokens.append(vbytes) tokens.append(vbytes)
scores.append(vscore) scores.append(vscore)