More vocab conversion fixes

2023-08-20 11:23:13 -06:00 · 2023-08-20 11:23:13 -06:00 · 8083e20d19
commit 8083e20d19
parent 08959c88c2
1 changed file with 6 additions and 6 deletions
--- a/convert-llama-ggmlv3-to-gguf.py
+++ b/convert-llama-ggmlv3-to-gguf.py
@@ -181,15 +181,15 @@ class GGMLToGGUF:
        print(f'* Adding {hp.n_vocab} vocab item(s)')
        toktypes = []
        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
-            tt = 1
-            if len(vbytes) > 0 and vbytes[0] == 32:
-                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
-            elif len(vbytes) == 0:
-                tt = 3
+            tt = 1 # Normal
+            if len(vbytes) == 0:
+                tt = 3 # Control
            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
                hv = hex(vbytes[0])[2:].upper()
                vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8')
-                tt = 6
+                tt = 6 # Byte
+            else:
+                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
            toktypes.append(tt)
            tokens.append(vbytes)
            scores.append(vscore)