From 4abbfb51f91ce705d3ce91f1f546b0b3816ab475 Mon Sep 17 00:00:00 2001
From: goerch
Date: Tue, 19 Sep 2023 19:37:25 +0200
Subject: [PATCH] Adding a comment

---
 tests/test-tokenizer-1-bpe.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 863848a82..a2b74dadf 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -77,6 +77,7 @@ int main(int argc, char **argv) {
     }
 
     for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
+        // NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters
         if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) {
             std::string str = " " + codepoint_to_utf8(cp);
             std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
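
For reference, here is a minimal standalone sketch of the filtering logic this loop applies. The codepoint_to_utf8 re-implementation, the should_test helper name, and the main scaffolding are illustrative assumptions for this sketch, not the llama.cpp code; in the real test the string is tokenized and detokenized via the model and compared against the input.

// Standalone sketch: encode a codepoint to UTF-8 (mirroring what a helper
// like codepoint_to_utf8 presumably does) and apply the same exception
// ranges as the patched test. Illustration only, not the llama.cpp sources.
#include <cstdint>
#include <cstdio>
#include <string>

// Hypothetical re-implementation for illustration; the real helper lives
// in the llama.cpp test sources.
static std::string codepoint_to_utf8(uint32_t cp) {
    std::string s;
    if (cp < 0x80) {
        s += static_cast<char>(cp);
    } else if (cp < 0x800) {
        s += static_cast<char>(0xc0 | (cp >> 6));
        s += static_cast<char>(0x80 | (cp & 0x3f));
    } else { // cp < 0x10000; surrogates are filtered out by the caller
        s += static_cast<char>(0xe0 | (cp >> 12));
        s += static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
        s += static_cast<char>(0x80 | (cp & 0x3f));
    }
    return s;
}

// The exception predicate from the patch: true if the codepoint should be
// round-trip tested, false for the skipped ASCII control characters and
// the UTF-16 surrogate range (which is not valid as a UTF-8 codepoint).
static bool should_test(uint32_t cp) {
    return (cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 &&
           (cp < 0x13 || cp > 0x17) && cp != 0x19 &&
           (cp < 0x1c || cp > 0x1e) &&
           (cp < 0xd800 || cp > 0xdfff);
}

int main() {
    int skipped = 0;
    for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
        if (!should_test(cp)) {
            ++skipped;
            continue;
        }
        std::string str = " " + codepoint_to_utf8(cp);
        // In the real test, str would now be tokenized, detokenized,
        // and compared against the original string.
        (void)str;
    }
    std::printf("skipped %d codepoints\n", skipped);
    return 0;
}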