common : temporary separate llama_detokenize calls for SPM and BPE

2023-08-27 13:04:04 +03:00 · 2023-08-27 13:04:04 +03:00 · 841983fe47
commit 841983fe47
parent 3bb0f84932
4 changed files with 33 additions and 16 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -747,7 +747,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
    return std::string(result.data(), result.size());
 }
-std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens) {
+std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
    const llama_token bos_id = llama_token_bos(ctx);
    std::string piece;
@ -767,3 +767,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
    return result;
 }
 std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
    std::string piece;
    std::string result;
    for (size_t i = 0; i < tokens.size(); ++i) {
        piece = llama_token_to_piece(ctx, tokens[i]);
        result += piece;
    }
    return result;
 }
--- a/common/common.h
+++ b/common/common.h
@ -129,9 +129,18 @@ std::string llama_token_to_piece(
        const struct llama_context * ctx,
                       llama_token   token);
 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 //       that takes into account the tokenizer type and decides how to handle the leading space
 //
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // removes the leading space from the first non-BOS token
-std::string llama_detokenize(
+std::string llama_detokenize_spm(
                         llama_context * ctx,
        const std::vector<llama_token> & tokens);
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 std::string llama_detokenize_bpe(
                         llama_context * ctx,
        const std::vector<llama_token> & tokens);
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@ -82,10 +82,8 @@ int main(int argc, char **argv) {
        }
    }
-    const int n_vocab = llama_n_vocab(ctx);
+    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_BPE) {
-
+        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
    if (n_vocab != 65024) {
        fprintf(stderr, "%s : expected 65024 tokens, got %d\n", __func__, n_vocab);
        llama_free_model(model);
        llama_free(ctx);
        return 2;
@ -98,7 +96,7 @@ int main(int argc, char **argv) {
        printf("\n");
        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
+        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
        printf("tok: ");
        for (const auto & tok : res) {
            printf("%d ", tok);
@ -116,8 +114,8 @@ int main(int argc, char **argv) {
        if (!correct) {
            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize(ctx, res).c_str(),
+                llama_detokenize_bpe(ctx, res).c_str(),
-                llama_detokenize(ctx, test_kv.second).c_str());
+                llama_detokenize_bpe(ctx, test_kv.second).c_str());
            fprintf(stderr, "%s : expected tokens: ", __func__);
            for (const auto & t : test_kv.second) {
                fprintf(stderr, "%6d, ", t);
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@ -82,10 +82,8 @@ int main(int argc, char **argv) {
        }
    }
-    const int n_vocab = llama_n_vocab(ctx);
+    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_SPM) {
-
+        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
    if (n_vocab != 32000) {
        fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
        llama_free_model(model);
        llama_free(ctx);
        return 2;
@ -99,7 +97,7 @@ int main(int argc, char **argv) {
        printf("\n");
        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize(ctx, res_bos).c_str());
+        printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str());
        printf("tok: ");
        for (const auto & tok : res_bos) {
            printf("%d ", tok);
@ -120,8 +118,8 @@ int main(int argc, char **argv) {
        if (!correct) {
            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize(ctx, res_nobos).c_str(),
+                llama_detokenize_spm(ctx, res_nobos).c_str(),
-                llama_detokenize(ctx, test_kv.second).c_str());
+                llama_detokenize_spm(ctx, test_kv.second).c_str());
            fprintf(stderr, "%s : expected tokens: ", __func__);
            for (const auto & t : test_kv.second) {
                fprintf(stderr, "%6d, ", t);