llama : optimize long word tokenization with WPM (#8034)

ggml-ci
2024-06-21 08:51:28 +03:00 · 2024-06-21 08:51:28 +03:00 · a927b0f3dd
commit a927b0f3dd
parent 80ea089d77
2 changed files with 13 additions and 5 deletions
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -596,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c

 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
    std::vector<uint32_t> result;
+    result.reserve(utf8.size());
    size_t offset = 0;
    while (offset < utf8.size()) {
        result.push_back(unicode_cpt_from_utf8(utf8, offset));