From 8083e20d1948844cdf66abead57dd30a822afd04 Mon Sep 17 00:00:00 2001
From: KerfuffleV2 <kerfliffle@keemail.me>
Date: Sun, 20 Aug 2023 11:23:13 -0600
Subject: [PATCH] More vocab conversion fixes

---
 convert-llama-ggmlv3-to-gguf.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py
index 58fd3a09a..e8ecdf6da 100644
--- a/convert-llama-ggmlv3-to-gguf.py
+++ b/convert-llama-ggmlv3-to-gguf.py
@@ -181,15 +181,15 @@ class GGMLToGGUF:
         print(f'* Adding {hp.n_vocab} vocab item(s)')
         toktypes = []
         for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
-            tt = 1
-            if len(vbytes) > 0 and vbytes[0] == 32:
-                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
-            elif len(vbytes) == 0:
-                tt = 3
+            tt = 1 # Normal
+            if len(vbytes) == 0:
+                tt = 3 # Control
             elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
                 hv = hex(vbytes[0])[2:].upper()
                 vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8')
-                tt = 6
+                tt = 6 # Byte
+            else:
+                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
             toktypes.append(tt)
             tokens.append(vbytes)
             scores.append(vscore)