Fix SPM whitespace handling (#2806)

* llama.cpp : fix spm whitespace escaping + clean up

* main.cpp : spm - add whitespace in front of prompt

* test-tokenizer-0.cpp : spm - add whitespace in front of prompt
This commit is contained in:
klosax 2023-08-26 13:45:53 +02:00 committed by GitHub
parent bae5c5f679
commit 2ba83c8685
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 27 additions and 41 deletions

View file

@ -100,7 +100,8 @@ int main(int argc, char **argv) {
bool success = true;
for (const auto & test_kv : k_tests()) {
std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
// Add a space in front of the first character to match OG llama tokenizer behavior
std::vector<llama_token> res = llama_tokenize(ctx, " " + test_kv.first, true);
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());