Minor improvements in GPT2 tokenizer (#3567)

* Fixing minor bugs in bpe_gpt2_preprocess
* Don't add bos token in test
2023-10-10 18:59:52 +02:00 · 2023-10-10 18:59:52 +02:00 · 233fc1c69f
commit 233fc1c69f
parent c5b49360d0
5 changed files with 17 additions and 20 deletions
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -174,10 +174,8 @@ int main(int argc, char **argv) {
            }

            for (const auto & tok : res) {
-                ofs << tok << " ";
+                ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
            }
-
-            ofs << "\n";
        }

        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());