fix: only do pre tokenization and normalization

2024-05-02 16:39:58 +02:00 · 2024-05-02 16:39:58 +02:00 · 0f94ff7155
commit 0f94ff7155
parent 0672cd8f42
1 changed files with 3 additions and 5 deletions
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -271,12 +271,10 @@ class Model(ABC):
        chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
-        token_ids = tokenizer.encode(chktxt)
+        pre_out = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(tokenizer.backend_tokenizer.normalizer.normalize_str(chktxt))
-        token_list = tokenizer.convert_ids_to_tokens(token_ids)
+        chkhsh = sha256(str(pre_out).encode()).hexdigest()
-        chkhsh = sha256(str(token_list).encode()).hexdigest()
-        print(f"token_ids: {token_ids}")
+        print(f"pre_out: {pre_out}")
-        print(f"token_list: {token_list}")
        print(f"chkhsh: {chkhsh}")
        res = None