From 49c25cce19ce24c4ec90352a81fc01c84462301a Mon Sep 17 00:00:00 2001
From: goerch
Date: Mon, 21 Aug 2023 19:11:14 +0200
Subject: [PATCH] tests : use new tokenizer type API (#2692)

* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies

* Adapt convert-new.py (and fix a clang-cl compiler error on Windows)

* Improved tokenizer test

But does it work on macOS?

* Improve token type support

- Added @klosax code to convert.py
- Improved token type support in vocabulary

* Exclude platform-dependent tests

* More sentencepiece compatibility by eliminating magic numbers

* Restored accidentally removed comment

* Improve commentary

* Use token type API in test-tokenizer-1.cpp
---
 convert.py                 | 4 ++--
 tests/test-tokenizer-1.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
 mode change 100755 => 100644 convert.py

diff --git a/convert.py b/convert.py
old mode 100755
new mode 100644
index 4ba36f280..0428c229f
--- a/convert.py
+++ b/convert.py
@@ -741,6 +741,8 @@ class OutputFile:
         tokens = []
         scores = []
         toktypes = []
+        # NOTE: `all_tokens` returns the base vocabulary and added tokens
+        # TODO: add special tokens?
         for text, score, toktype in vocab.all_tokens():
             tokens.append(text)
             scores.append(score)
@@ -751,8 +753,6 @@
         self.gguf.add_token_scores(scores)
         self.gguf.add_token_types(toktypes)
 
-        # TODO: added / special tokens
-
     def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
         n_elements = 1
         for dim in tensor.shape:
diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp
index a8a7e8898..d8db7cd96 100644
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@@ -87,8 +87,8 @@ int main(int argc, char **argv) {
                 return 2;
             }
         } else {
-            // TODO: needs access to token types
-            if (0 <= i && i < 259) {
+            llama_token_type type = llama_token_get_type(ctx, i);
+            if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
                 fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
                     __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
             } else {
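
Editor's note on the test-tokenizer-1.cpp hunk: the patch replaces the hard-coded
range check `0 <= i && i < 259` with a lookup through the new token type API. The
magic number presumably encoded the fixed layout of the LLaMA sentencepiece
vocabulary (a few special tokens followed by 256 raw-byte tokens); the per-token
type metadata written by convert.py makes that classification explicit. Below is
a minimal sketch of the resulting idiom, assuming a loaded llama_context `ctx`
and the enum values visible in the diff; the helper name `is_round_trip_exempt`
is hypothetical and not part of the patch:

    #include "llama.h"

    // Tokens of these types have no stable printable form, so a
    // detokenize/tokenize round trip is not expected to reproduce the
    // original token id; the test only logs them as informational.
    static bool is_round_trip_exempt(llama_context * ctx, llama_token id) {
        const llama_token_type type = llama_token_get_type(ctx, id);
        return type == LLAMA_TOKEN_TYPE_UNKNOWN
            || type == LLAMA_TOKEN_TYPE_CONTROL
            || type == LLAMA_TOKEN_TYPE_BYTE;
    }

This is what the commit message means by "eliminating magic numbers": the test
now asks the model's vocabulary for each token's type instead of assuming a
fixed sentencepiece layout, so it keeps working for vocabularies with a
different number or ordering of special and byte tokens.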