From ce5485aee0ffe5c7b326289037e864a129ec31d5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 27 Apr 2024 17:11:34 +0300
Subject: [PATCH] unicode : always use std::wregex

---
 tests/test-tokenizer-0-deepseek-llm.cpp |  6 ++---
 unicode-data.cpp                        |  2 +-
 unicode.cpp                             | 33 +++----------------------
 3 files changed, 6 insertions(+), 35 deletions(-)
diff --git a/tests/test-tokenizer-0-deepseek-llm.cpp b/tests/test-tokenizer-0-deepseek-llm.cpp
index c621e02d9..e21d16c88 100644
--- a/tests/test-tokenizer-0-deepseek-llm.cpp
+++ b/tests/test-tokenizer-0-deepseek-llm.cpp
@@ -130,16 +130,14 @@ int main(int argc, char **argv) {
                 llama_detokenize_bpe(ctx, test_kv.second).c_str());
             fprintf(stderr, "%s : expected tokens: ", __func__);
             for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d, ", t);
+                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
             }
             fprintf(stderr, "\n");
             fprintf(stderr, "%s : got tokens:      ", __func__);
             for (const auto & t : res) {
-                fprintf(stderr, "%6d, ", t);
+                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
             }
             fprintf(stderr, "\n");
-
-            success = false;
         }
     }
 
diff --git a/unicode-data.cpp b/unicode-data.cpp
index 526b69865..d36983601 100644
--- a/unicode-data.cpp
+++ b/unicode-data.cpp
@@ -1,4 +1,4 @@
-﻿#include "unicode-data.h"
+#include "unicode-data.h"
 
 #include <cstdint>
 #include <map>
diff --git a/unicode.cpp b/unicode.cpp
index 2e59c0722..388e92379 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -391,35 +391,6 @@ static std::vector<size_t> unicode_regex_preprocess(const std::wstring & text, c
     return bpe_offsets;
 }
 
-static std::vector<size_t> unicode_regex_preprocess_fallback(const std::string & text, const std::vector<size_t> & offsets, const std::string & regex_expr) {
-    std::regex expr(regex_expr);
-    std::vector<size_t> bpe_offsets; // store the offset of each word
-    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
-    size_t start = 0;
-    for (auto offset : offsets) {
-        std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr);
-        std::cregex_iterator end;
-
-        int64_t start_idx = 0;
-        while (it != end) {
-            std::cmatch match = *it;
-            if (match.position() > start_idx) {
-                bpe_offsets.emplace_back(match.position() - start_idx);
-            }
-            bpe_offsets.emplace_back(match.length());
-            start_idx = match.position() + match.length();
-            ++it;
-        }
-
-        if (start_idx < (int64_t) offset) {
-            bpe_offsets.emplace_back(offset - start_idx);
-        }
-        start += offset;
-    }
-
-    return bpe_offsets;
-}
-
 static bool unicode_regex_equivalent_wregex_exists(const std::string & regex) {
     return unicode_regex_equivalent_wregex.find(regex) != unicode_regex_equivalent_wregex.end();
 }
@@ -532,8 +503,10 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             const std::wstring & wregex_expr = unicode_regex_equivalent_wregex.at(regex_expr);
             bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, wregex_expr);
         } else {
+            // fallback
             try {
-                bpe_offsets = unicode_regex_preprocess_fallback(text, bpe_offsets, regex_expr);
+                const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
+                bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, wregex_expr);
             } catch (std::regex_error & e) {
                 fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());
                 fprintf(stderr, "Regex error: %s\n", e.what());