From c50b1ae6b8efc405f3603e54b3f64bcefe5ff510 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Sat, 26 Aug 2023 13:13:05 +0200
Subject: [PATCH] test-tokenizer-0.cpp : spm - add whitespace in front of prompt

---
 tests/test-tokenizer-0.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index f3ee851a3..7e9ac9188 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -100,7 +100,8 @@ int main(int argc, char **argv) {
     bool success = true;
 
     for (const auto & test_kv : k_tests()) {
-        std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        std::vector<llama_token> res = llama_tokenize(ctx, " " + test_kv.first, true);
         fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
             __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
 