llama : remove token functions with context args in favor of model (#3720)

* added `llama_model_token_*` variants of all the `llama_token_*` functions.

* added `LLAMA_API`

* formatting

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* removed the old `llama_token_*` functions that took a context argument

* changed 3 more functions to take a model instead of a context:

- `llama_token_get_text`
- `llama_token_get_score`
- `llama_token_get_type`

* added back docs

* fixed main.cpp

* changed token functions to use new model variants

* changed token functions to use new model variants

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Marcus Dunn 2023-10-23 12:40:03 -07:00 committed by GitHub
parent 6336701c93
commit 5be6c803fa
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 81 additions and 79 deletions

View file

@ -726,7 +726,7 @@ struct llama_server_context
if (json_value(data, "ignore_eos", false))
{
slot->sparams.logit_bias[llama_token_eos(ctx)] = -INFINITY;
slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
}
const auto &logit_bias = data.find("logit_bias");
@ -1056,7 +1056,7 @@ struct llama_server_context
slot.has_next_token = false;
}
if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(ctx))
if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
{
slot.stopped_eos = true;
slot.has_next_token = false;
@ -1130,7 +1130,7 @@ struct llama_server_context
json get_formated_generation(llama_client_slot &slot)
{
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(ctx));
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
return json {
@ -1555,11 +1555,11 @@ struct llama_server_context
suffix_tokens.erase(suffix_tokens.begin());
}
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
prefix_tokens.push_back(llama_token_middle(ctx));
prefix_tokens.push_back(llama_token_middle(model));
prompt_tokens = prefix_tokens;
}
else