From 1545550ec262b866f9647dae3cb1041ef89ce390 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 Apr 2024 21:40:36 +0300 Subject: [PATCH] unicode : normalize signatures --- unicode.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/unicode.cpp b/unicode.cpp index 214f78658..b47e87d46 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -363,7 +363,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t } // use std::wregex to split the text -static std::vector unicode_regex_split_stl(const std::wstring & wtext, const std::vector & offsets, const std::wstring & regex_expr) { +static std::vector unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector & offsets) { std::wregex expr(regex_expr); std::vector bpe_offsets; // store the offset of each word bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size @@ -393,7 +393,7 @@ static std::vector unicode_regex_split_stl(const std::wstring & wtext, c } // use std::regex to split the text -static std::vector unicode_regex_split_stl(const std::string & text, const std::vector & offsets, const std::string & regex_expr) { +static std::vector unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { std::regex expr(regex_expr); std::vector bpe_offsets; // store the offset of each word bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size @@ -422,10 +422,10 @@ static std::vector unicode_regex_split_stl(const std::string & text, con return bpe_offsets; } -static std::vector unicode_regex_split_custom(const std::string & regex, const std::string & text, const std::vector & offsets) { +static std::vector unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { std::vector bpe_offsets; - if (regex == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { + if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets); } @@ -548,7 +548,7 @@ std::vector unicode_regex_split(const std::string & text, const std const auto cpts = unicode_cpts_from_utf8(text); - // generated a "collapsed" representation of the text, where all codepoints are replaced by a single byte + // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935 std::string text_collapsed; if (need_collapse) { @@ -576,7 +576,7 @@ std::vector unicode_regex_split(const std::string & text, const std for (auto & regex_expr : regex_exprs) { // first, see if we have an efficient custom regex implementation - auto tmp = unicode_regex_split_custom(regex_expr, text, bpe_offsets); + auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets); if (!tmp.empty()) { bpe_offsets = std::move(tmp); @@ -644,7 +644,7 @@ std::vector unicode_regex_split(const std::string & text, const std //printf("text_collapsed: %s\n", text_collapsed.c_str()); //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str()); - bpe_offsets = unicode_regex_split_stl(text_collapsed, bpe_offsets, regex_expr_collapsed); + bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets); } else { // no unicode category used, we can use std::wregex directly const std::wstring wtext = unicode_wstring_from_utf8(text); @@ -652,7 +652,7 @@ std::vector unicode_regex_split(const std::string & text, const std //printf("text: %s\n", text.c_str()); //printf("regex_expr: %s\n", regex_expr.c_str()); - bpe_offsets = unicode_regex_split_stl(wtext, bpe_offsets, wregex_expr); + bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets); } } catch (std::regex_error & e) { fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());