More vocab conversion fixes
This commit is contained in:
parent
08959c88c2
commit
8083e20d19
1 changed file with 6 additions and 6 deletions
|
@@ -181,15 +181,15 @@ class GGMLToGGUF:
|
||||||
print(f'* Adding {hp.n_vocab} vocab item(s)')
|
print(f'* Adding {hp.n_vocab} vocab item(s)')
|
||||||
toktypes = []
|
toktypes = []
|
||||||
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
|
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
|
||||||
tt = 1
|
tt = 1 # Normal
|
||||||
if len(vbytes) > 0 and vbytes[0] == 32:
|
if len(vbytes) == 0:
|
||||||
vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
|
tt = 3 # Control
|
||||||
elif len(vbytes) == 0:
|
|
||||||
tt = 3
|
|
||||||
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
|
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
|
||||||
hv = hex(vbytes[0])[2:].upper()
|
hv = hex(vbytes[0])[2:].upper()
|
||||||
vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8')
|
vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8')
|
||||||
tt = 6
|
tt = 6 # Byte
|
||||||
|
else:
|
||||||
|
vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
|
||||||
toktypes.append(tt)
|
toktypes.append(tt)
|
||||||
tokens.append(vbytes)
|
tokens.append(vbytes)
|
||||||
scores.append(vscore)
|
scores.append(vscore)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue