Fix SPM whitespace handling (#2806)

* llama.cpp : fix spm whitespace escaping + clean up

* main.cpp : spm - add whitespace in front of prompt

* test-tokenizer-0.cpp : spm - add whitespace in front of prompt
This commit is contained in:
klosax 2023-08-26 13:45:53 +02:00 committed by GitHub
parent bae5c5f679
commit 2ba83c8685
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 27 additions and 41 deletions

View file

@ -100,7 +100,8 @@ int main(int argc, char **argv) {
bool success = true;
for (const auto & test_kv : k_tests()) {
std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
// Add a space in front of the first character to match OG llama tokenizer behavior
std::vector<llama_token> res = llama_tokenize(ctx, " " + test_kv.first, true);
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());