fix: fix preprocessing jina v2 zh

Joan Martinez 2024-06-06 10:15:07 +02:00
parent d86efa6c9a
commit a8a64fd073


@@ -13038,13 +13038,17 @@ struct llm_tokenizer_bpe {
                 });
                 break;
             case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
-                //TODO: Apply GPT2 + lowercasing
+                //TODO: Apply lowercase + whitespace pretokenization
                 {
                     std::string lowercase_text = text;
                     std::transform(lowercase_text.begin(), lowercase_text.end(), lowercase_text.begin(), [](unsigned char c){ return std::tolower(c); });
-                    word_collection = unicode_regex_split(lowercase_text, {
-                        "",
-                    });
+                    std::regex regexPattern("\\w+|[^\\w\\s]+");
+                    std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
+                    std::sregex_token_iterator end;
+                    while (it != end) {
+                        word_collection.push_back(*it++);
+                    }
                 }
                 break;
             default:
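For context, here is a minimal standalone sketch (not part of the commit) of what the new JINA_V2_ZH pretokenization does: lowercase the text, then split it into word runs (\w+) and punctuation runs ([^\w\s]+), discarding whitespace. Note that std::regex operates byte-wise, so multi-byte UTF-8 sequences such as Chinese characters are not matched by \w under the default locale; those bytes fall through to the byte-level handling changed in the next hunk.

#include <algorithm>
#include <cctype>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
    std::string text = "Hello, World! 123";
    // lowercase first, as in the tokenizer case above
    std::transform(text.begin(), text.end(), text.begin(),
                   [](unsigned char c) { return std::tolower(c); });

    // split into word runs and punctuation runs, dropping whitespace
    std::regex pattern("\\w+|[^\\w\\s]+");
    std::vector<std::string> word_collection;
    for (std::sregex_token_iterator it(text.begin(), text.end(), pattern), end; it != end; ++it) {
        word_collection.push_back(*it);
    }

    for (const auto & w : word_collection) {
        std::cout << "'" << w << "'\n"; // 'hello' ',' 'world' '!' '123'
    }
    return 0;
}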
@@ -13153,10 +13157,9 @@ struct llm_tokenizer_bpe {
                 for (auto j = str.begin(); j != str.end(); ++j) {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
-                    if (token_multibyte == vocab.token_to_id.end()) {
-                        throw std::runtime_error("ERROR: byte not found in vocab");
+                    if (token_multibyte != vocab.token_to_id.end()) {
+                        output.push_back((*token_multibyte).second);
                     }
-                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
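The hunk above turns the single-byte fallback from a hard failure into a skip: a byte that is missing from the vocabulary no longer throws a runtime_error, it is simply not emitted. A hedged sketch of that behavior, where the names toy_vocab and toy_encode are hypothetical stand-ins for the real vocab structures:

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in for the byte-fallback loop in the hunk above.
static std::vector<int> toy_encode(const std::string & str, const std::map<std::string, int> & toy_vocab) {
    std::vector<int> output;
    for (auto j = str.begin(); j != str.end(); ++j) {
        std::string byte_str(1, *j);
        auto token_multibyte = toy_vocab.find(byte_str);
        if (token_multibyte != toy_vocab.end()) { // before the fix: a miss threw a runtime_error
            output.push_back(token_multibyte->second);
        } // after the fix: a miss is silently dropped
    }
    return output;
}

int main() {
    const std::map<std::string, int> toy_vocab = {{"a", 1}, {"b", 2}};
    for (int id : toy_encode("abc", toy_vocab)) { // 'c' is not in the vocab
        std::cout << id << ' '; // prints: 1 2
    }
    std::cout << '\n';
    return 0;
}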