test-tokenizer-0.cpp : spm - add whitespace in front of prompt

This commit is contained in:
klosax 2023-08-26 13:13:05 +02:00 committed by GitHub
parent 43f7c16ad0
commit c50b1ae6b8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -100,7 +100,8 @@ int main(int argc, char **argv) {
     bool success = true;
     for (const auto & test_kv : k_tests()) {
-        std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        std::vector<llama_token> res = llama_tokenize(ctx, " " + test_kv.first, true);
         fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
             __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());