common : simplify

This commit is contained in:
Georgi Gerganov 2024-04-24 13:14:14 +03:00 committed by GitHub
parent 6c081e501c
commit 411137a608
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 2 additions and 23 deletions

View file

@ -2326,21 +2326,6 @@ std::vector<llama_token> llama_tokenize(
return result;
}
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
return std::string(result.data(), result.size());
}
// duplicate with ability to specify whether to use special token
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);

View file

@ -237,18 +237,12 @@ std::vector<llama_token> llama_tokenize(
bool add_special,
bool parse_special = false);
// tokenizes a token into a piece
// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
const struct llama_context * ctx,
llama_token token
);
std::string llama_token_to_piece(
const struct llama_context * ctx,
llama_token token,
bool special
);
bool special = true);
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
// that takes into account the tokenizer type and decides how to handle the leading space