Fixing the last deviations from sentencepiece indicated by test-tokenizer-1

goerch 2023-09-14 17:05:04 +02:00
parent 16bf5f26ea
commit 64b0b7453e
4 changed files with 13 additions and 9 deletions

common/common.cpp

@@ -791,10 +791,10 @@ std::vector<llama_token> llama_tokenize(
     // upper limit for the number of tokens
     int n_tokens = text.length() + add_bos;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    n_tokens = llama_tokenize(ctx, text.c_str(), text.length(), result.data(), result.size(), add_bos);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        int check = llama_tokenize(ctx, text.c_str(), text.length(), result.data(), result.size(), add_bos);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
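
The hunk preserves the helper's two-pass idiom: when the output buffer is too small, llama_tokenize returns the negative of the required token count, so the caller resizes and calls again. A minimal sketch of the same pattern against the updated C API (ctx and add_bos assumed to be in scope; error handling elided):

    std::string text = "Hello world";
    std::vector<llama_token> result(text.length() + add_bos);
    int n = llama_tokenize(ctx, text.c_str(), text.length(),
                           result.data(), result.size(), add_bos);
    if (n < 0) {
        // -n is the exact number of tokens required
        result.resize(-n);
        n = llama_tokenize(ctx, text.c_str(), text.length(),
                           result.data(), result.size(), add_bos);
    }
    result.resize(n);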

llama.cpp

@@ -6202,19 +6202,21 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
 int llama_tokenize(
         struct llama_context * ctx,
                   const char * text,
+                          int   text_len,
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+    return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
 }
 
 int llama_tokenize_with_model(
     const struct llama_model * model,
                  const char * text,
+                         int   text_len,
                 llama_token * tokens,
                         int   n_max_tokens,
                        bool   add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
     if (n_max_tokens < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
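
The substantive change is the std::string(text, text_len) construction: it copies exactly text_len bytes instead of scanning for a terminator, so the input no longer has to be NUL-terminated and an implicit strlen pass is avoided. A small standalone illustration of that constructor's behavior (not code from this commit):

    #include <cassert>
    #include <string>

    int main() {
        const char buf[] = {'a', '\0', 'b'};
        std::string s(buf, 3);     // copies exactly 3 bytes
        assert(s.size() == 3);     // the embedded NUL byte is preserved
        return 0;
    }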

llama.h

@@ -374,6 +374,7 @@ extern "C" {
     LLAMA_API int llama_tokenize(
             struct llama_context * ctx,
                      const char * text,
+                              int   text_len,
                     llama_token * tokens,
                             int   n_max_tokens,
                            bool   add_bos);
@@ -381,6 +382,7 @@ extern "C" {
     LLAMA_API int llama_tokenize_with_model(
         const struct llama_model * model,
                      const char * text,
+                              int   text_len,
                     llama_token * tokens,
                             int   n_max_tokens,
                            bool   add_bos);
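
With an explicit length in the public API, a caller can tokenize a slice of a larger buffer without first copying it into a NUL-terminated temporary. A hedged sketch of such a call (the offsets and the ctx/tokens variables are hypothetical; model and buffer setup elided):

    // Tokenize only the second sentence of a larger document in place.
    const char * doc = "first sentence. second sentence.";
    int n = llama_tokenize(ctx, doc + 16, 16,
                           tokens.data(), tokens.size(), /*add_bos=*/false);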

tests/test-tokenizer-1.cpp

@@ -89,8 +89,7 @@ int main(int argc, char **argv) {
         if (check != str) {
             fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%llu) but tokenization of this detokenizes to >%s<(%llu)\n",
                 __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
-            if(i != 3)
-                return 2;
+            return 2;
         }
     }
@@ -100,10 +99,11 @@ int main(int argc, char **argv) {
         std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
         std::string check = llama_detokenize_spm(ctx, tokens);
         if (str != check) {
-            fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
-                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-            if(cp != 0 && cp != 9601)
+            if(cp != 9601) {
+                fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
+                    __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
                 return 3;
+            }
         }
     }
 }
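
After this change the round-trip test tolerates exactly one codepoint: 9601, i.e. U+2581 (LOWER ONE EIGHTH BLOCK, "▁"), the metasymbol sentencepiece substitutes for whitespace, so tokenizing that bare character and detokenizing legitimately does not reproduce the input byte for byte. A sketch of the per-codepoint encoding step the loop relies on, with a hand-rolled UTF-8 encoder standing in for the test's own helper (an assumption, not this file's actual code; surrogate handling omitted):

    #include <cstdint>
    #include <string>

    // Encode a Unicode codepoint as UTF-8.
    static std::string cp_to_utf8(uint32_t cp) {
        std::string out;
        if (cp <= 0x7f) {
            out.push_back(static_cast<char>(cp));
        } else if (cp <= 0x7ff) {
            out.push_back(static_cast<char>(0xc0 | (cp >> 6)));
            out.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
        } else if (cp <= 0xffff) {
            out.push_back(static_cast<char>(0xe0 | (cp >> 12)));
            out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
            out.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
        } else {
            out.push_back(static_cast<char>(0xf0 | (cp >> 18)));
            out.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3f)));
            out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
            out.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
        }
        return out;
    }

    // 9601 == 0x2581 encodes as the bytes E2 96 81, i.e. "▁".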