llama : optimize long word tokenization with WPM (#8034)

ggml-ci
This commit is contained in:
Georgi Gerganov 2024-06-21 08:51:28 +03:00 committed by GitHub
parent 80ea089d77
commit a927b0f3dd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 13 additions and 5 deletions

View file

@ -596,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
std::vector<uint32_t> result;
result.reserve(utf8.size());
size_t offset = 0;
while (offset < utf8.size()) {
result.push_back(unicode_cpt_from_utf8(utf8, offset));