From 49c25cce19ce24c4ec90352a81fc01c84462301a Mon Sep 17 00:00:00 2001
From: goerch
Date: Mon, 21 Aug 2023 19:11:14 +0200
Subject: [PATCH] tests : use new tokenizer type API (#2692)

* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies

* Adapt convert-new.py (and fix a clang-cl compiler error on Windows)

* Improved tokenizer test

But does it work on macOS?

* Improve token type support

- Added @klosax code to convert.py
- Improved token type support in vocabulary

* Exclude platform-dependent tests

* More sentencepiece compatibility by eliminating magic numbers

* Restored accidentally removed comment

* Improve commentary

* Use token type API in test-tokenizer-1.cpp
---
 convert.py                 | 4 ++--
 tests/test-tokenizer-1.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
 mode change 100755 => 100644 convert.py

diff --git a/convert.py b/convert.py
old mode 100755
new mode 100644
index 4ba36f280..0428c229f
--- a/convert.py
+++ b/convert.py
@@ -741,6 +741,8 @@ class OutputFile:
         tokens = []
         scores = []
         toktypes = []
+        # NOTE: `all_tokens` returns the base vocabulary and added tokens
+        # TODO: add special tokens?
         for text, score, toktype in vocab.all_tokens():
             tokens.append(text)
             scores.append(score)
@@ -751,8 +753,6 @@
         self.gguf.add_token_scores(scores)
         self.gguf.add_token_types(toktypes)
 
-        # TODO: added / special tokens
-
     def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
         n_elements = 1
         for dim in tensor.shape:
diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp
index a8a7e8898..d8db7cd96 100644
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@@ -87,8 +87,8 @@ int main(int argc, char **argv) {
                 return 2;
             }
         } else {
-            // TODO: needs access to token types
-            if (0 <= i && i < 259) {
+            llama_token_type type = llama_token_get_type(ctx, i);
+            if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
                 fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
                     __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
             } else {
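
Editor's note on the test-tokenizer-1.cpp hunk: the patch replaces the hard-coded
range check `0 <= i && i < 259` with a lookup through the new token type API. The
magic number presumably encoded the fixed layout of the LLaMA sentencepiece
vocabulary (a few special tokens followed by 256 raw-byte tokens); the per-token
type metadata written by convert.py makes that classification explicit. Below is
a minimal sketch of the resulting idiom, assuming a loaded llama_context `ctx`
and the enum values visible in the diff; the helper name `is_round_trip_exempt`
is hypothetical and not part of the patch:

    #include "llama.h"

    // Tokens of these types have no stable printable form, so a
    // detokenize/tokenize round trip is not expected to reproduce the
    // original token id; the test only logs them as informational.
    static bool is_round_trip_exempt(llama_context * ctx, llama_token id) {
        const llama_token_type type = llama_token_get_type(ctx, id);
        return type == LLAMA_TOKEN_TYPE_UNKNOWN
            || type == LLAMA_TOKEN_TYPE_CONTROL
            || type == LLAMA_TOKEN_TYPE_BYTE;
    }

This is what the commit message means by "eliminating magic numbers": the test
now asks the model's vocabulary for each token's type instead of assuming a
fixed sentencepiece layout, so it keeps working for vocabularies with a
different number or ordering of special and byte tokens.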