llama : tokenizer fixes (#2549)

* Merge tokenizer fixes into the gguf branch.
* Add test vocabularies
2023-08-14 18:30:28 +02:00 · 2023-08-14 18:30:28 +02:00 · ec1b100720
commit ec1b100720
parent 8af3a99ff1
17 changed files with 612 additions and 147 deletions
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
        }
        fprintf(stderr, "\n");
    }