Clean old known problematic codepoints

2024-06-20 19:25:32 +02:00 · 2024-06-20 19:25:32 +02:00 · 071bf42f23
commit 071bf42f23
parent 03dbcc89f6
2 changed files with 14 additions and 14 deletions
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -110,16 +110,16 @@ int main(int argc, char **argv) {
        for (int i = 0; i < nthread; ++i) {
            threads[i] = std::thread([i, nthread, ctx]() {
-                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
+                for (uint32_t cp = i; cp < 0x00110000; cp += nthread) {
-                    if (!( // NOLINT
+                    //if (!( // NOLINT
-                                (cp < 0x03       || cp >  0x05)   && cp != 0x0b && cp != 0x11 &&
+                    //            (cp < 0x03       || cp >  0x05)   && cp != 0x0b && cp != 0x11 &&
-                                (cp < 0x13       || cp >  0x17)   && cp != 0x19 &&
+                    //            (cp < 0x13       || cp >  0x17)   && cp != 0x19 &&
-                                (cp < 0x1c       || cp >  0x1e)   &&
+                    //            (cp < 0x1c       || cp >  0x1e)   &&
-                                (cp < 0xd800     || cp >  0xdfff) &&
+                    //            (cp < 0xd800     || cp >  0xdfff) &&
-                                (cp < 0x00040000 || cp >= 0x000e0000)
+                    //            (cp < 0x00040000 || cp >= 0x000e0000)
-                        )) {
+                    //    )) {
-                        continue;
+                    //    continue;
-                    }
+                    //}
                    std::string str = unicode_cpt_to_utf8(cp);
                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
--- a/tests/test-tokenizer-1-spm.cpp
+++ b/tests/test-tokenizer-1-spm.cpp
@@ -80,10 +80,10 @@ int main(int argc, char ** argv) {
        for (int i = 0; i < nthread; ++i) {
            threads[i] = std::thread([i, nthread, ctx]() {
-                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
+                for (uint32_t cp = i; cp < 0x00110000; cp += nthread) {
-                    if (cp >= 0xd800 && cp <= 0xdfff) {
+                    //if (cp >= 0xd800 && cp <= 0xdfff) {
-                        continue;
+                    //    continue;
-                    }
+                    //}
                    std::string str = unicode_cpt_to_utf8(cp);
                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);