llama : more tokenizer fixes (#2810)
* tests : write a Python tokenizer test (wip) * llama : prefix input text for tokenization with whitespace * llama : distinguish pieces from decoded text + fix detokenization * common : add comments * examples : no longer manually add leading space when tokenizing * tests : use Python to generate tokenizer tests for C++ * tests : add option to tokenize text files ggml-ci * tests : add test-tokenizer-1.py * llama.cpp : fix LF token * hellaswag : move the concat space for clarity * tests : add falcon tests (py + cpp, currently do not pass Unicode) ggml-ci * common : temporary separate llama_detokenize calls for SPM and BPE --------- Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
This commit is contained in:
parent
1591e2e590
commit
edd4c14817
20 changed files with 671 additions and 224 deletions
|
@ -733,12 +733,12 @@ std::vector<llama_token> llama_tokenize(
|
|||
return result;
|
||||
}
|
||||
|
||||
std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
|
||||
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
|
||||
std::vector<char> result(8, 0);
|
||||
const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
|
||||
const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_token_to_str(ctx, token, result.data(), result.size());
|
||||
int check = llama_token_to_piece(ctx, token, result.data(), result.size());
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
|
@ -746,3 +746,36 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok
|
|||
|
||||
return std::string(result.data(), result.size());
|
||||
}
|
||||
|
||||
std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||
const llama_token bos_id = llama_token_bos(ctx);
|
||||
|
||||
std::string piece;
|
||||
std::string result;
|
||||
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
piece = llama_token_to_piece(ctx, tokens[i]);
|
||||
|
||||
// remove the leading space of the first non-BOS token
|
||||
if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
|
||||
piece = piece.substr(1);
|
||||
}
|
||||
|
||||
result += piece;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||
std::string piece;
|
||||
std::string result;
|
||||
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
piece = llama_token_to_piece(ctx, tokens[i]);
|
||||
|
||||
result += piece;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue