Using llama_tokenize() in tests

jaime-m-p 2024-06-20 18:20:16 +02:00
parent eea8dfab6b
commit d779bab49c
2 changed files with 25 additions and 49 deletions


@@ -2906,51 +2906,35 @@ std::vector<llama_token> llama_tokenize(
 }
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
+    std::string piece;
+    piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
+    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
     }
 
-    return std::string(result.data(), result.size());
+    return piece;
 }
 
-std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
-
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        // remove the leading space of the first non-BOS token
-        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
-            piece = piece.substr(1);
-        }
-
-        result += piece;
-    }
-
-    return result;
-}
-
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-        result += piece;
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
     }
 
+    text.resize(n_chars);
+
     // NOTE: the original tokenizer decodes bytes after collecting the pieces.
-    return result;
+    return text;
 }
 
 bool llama_should_add_bos_token(const llama_model * model) {
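
Both new wrappers follow the same two-pass pattern: call the C API once with whatever storage the std::string already has; if the text does not fit, the call reports the required size as a negative number, so the string is resized and the call repeated, and finally the string is trimmed to the bytes actually written. A minimal standalone sketch of that pattern follows; fill_buffer() is a hypothetical stand-in for the underlying C API call, not part of llama.cpp.

#include <cassert>
#include <cstring>
#include <string>

// Hypothetical stand-in for a llama.cpp C API call: writes up to `size` bytes
// into `buf` and returns the number written, or the negative of the required
// size when the buffer is too small (the convention used by
// llama_token_to_piece() and llama_detokenize()).
static int fill_buffer(char * buf, int size) {
    static const char msg[] = "hello detokenized world";
    const int needed = (int) std::strlen(msg);
    if (size < needed) {
        return -needed;
    }
    std::memcpy(buf, msg, needed);
    return needed;
}

// Two-pass pattern used by both wrappers: try with the string's existing
// capacity first, reallocate only if the first call reports a larger size.
std::string two_pass_call() {
    std::string out;
    out.resize(out.capacity());  // reuse whatever storage the string starts with
    int n = fill_buffer(&out[0], (int) out.size());
    if (n < 0) {
        out.resize(-n);          // grow to the size reported by the first call
        n = fill_buffer(&out[0], (int) out.size());
        assert(n >= 0);
    }
    out.resize(n);               // trim to the bytes actually written
    return out;
}

The real wrappers use GGML_ASSERT instead of assert and pass the model handle and the `special` flag through to the C API.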


@@ -336,21 +336,13 @@ std::string llama_token_to_piece(
         llama_token token,
         bool special = true);
 
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-//       that takes into account the tokenizer type and decides how to handle the leading space
-//
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
+// optionally renders special/control tokens
+std::string llama_detokenize(
         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
-        llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+        const std::vector<llama_token> & tokens,
+        bool special = true);
 
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
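
With the single llama_detokenize() declared above, a test can round-trip text through the tokenizer without choosing between the old SPM and BPE variants. A rough sketch of such a check, assuming common.h is included, `ctx` was created from a loaded model, and the common llama_tokenize(ctx, text, add_special, parse_special) overload keeps the signature it has at this revision:

#include <string>
#include <vector>

#include "common.h"  // llama_tokenize(), llama_detokenize() declared above

// Sketch of a round-trip check a test might perform (assumes `ctx` points to
// an initialized llama_context created from a loaded model).
static bool roundtrip_ok(llama_context * ctx, const std::string & input) {
    // tokenize through the common helper, parsing special/control tokens
    std::vector<llama_token> tokens = llama_tokenize(ctx, input, /*add_special*/ false, /*parse_special*/ true);
    // detokenize back through the single tokenizer-agnostic helper
    std::string detok = llama_detokenize(ctx, tokens, /*special*/ true);
    return detok == input;
}

Depending on the tokenizer, leading-whitespace handling can make strict equality too strict; the point of the single helper is that the test no longer has to pick between the SPM and BPE code paths.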