From 9da4e667103414ef237c5b388ede36f44f16de44 Mon Sep 17 00:00:00 2001
From: Ronsor <ronsor@ronsor.pw>
Date: Tue, 14 Mar 2023 13:33:41 -0700
Subject: [PATCH] Use `tokenizer.vocab_size()` instead of hardcoding 32000 in
 convert-pth-to-ggml.py

Special tokens or other new tokens can be added to the tokenizer, so it is probably best not to assume the vocabulary is exactly 32000 tokens; ask the tokenizer for its vocabulary size instead.
---
 convert-pth-to-ggml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index d2557500a..5c36e9c09 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -99,7 +99,7 @@ for p in range(n_parts):
     fout.write(struct.pack("i", ftype))
 
     # Is this correct??
-    for i in range(32000):
+    for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
             # "<unk>" token (translated as ??)
             text = " \u2047 ".encode("utf-8")