Added special token support to llama_tokenize() calls in server.cpp
commit 09ff755ecc
parent e3932593d4
1 changed file with 6 additions and 6 deletions
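Each call site below gains a fourth argument to the ::llama_tokenize() wrapper. For context, here is a sketch of the wrapper's declaration as it stood around this change; the parameter name "special" and its default value are assumptions based on the upstream common.h of this era, not something shown in this diff:

    // common.h (sketch, assumed signature): when special is true,
    // special/control tokens appearing in text (e.g. BOS/EOS or chat
    // markers such as <|im_start|>) are parsed into their token ids
    // instead of being tokenized as ordinary text.
    std::vector<llama_token> llama_tokenize(
        const struct llama_context * ctx,
        const std::string          & text,
                              bool   add_bos,
                              bool   special = false);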
server.cpp

@@ -620,12 +620,12 @@ struct llama_server_context
                     std::vector<llama_token> p;
                     if (first)
                     {
-                        p = ::llama_tokenize(ctx, s, add_bos);
+                        p = ::llama_tokenize(ctx, s, add_bos, true);
                         first = false;
                     }
                     else
                     {
-                        p = ::llama_tokenize(ctx, s, false);
+                        p = ::llama_tokenize(ctx, s, false, true);
                     }
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
                 }
@@ -642,7 +642,7 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, true);
         }

         return prompt_tokens;
@@ -861,7 +861,7 @@ struct llama_server_context
     }

     void update_system_prompt() {
-        system_tokens = ::llama_tokenize(ctx, system_prompt, true);
+        system_tokens = ::llama_tokenize(ctx, system_prompt, true, true);

         llama_batch_clear(batch);

@@ -1180,7 +1180,7 @@ struct llama_server_context
             if (slot.sparams.n_probs > 0)
             {
                 std::vector<completion_token_output> probs_output = {};
-                const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
+                const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false, true);
                 size_t probs_pos      = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
                 size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
                 if (probs_pos < probs_stop_pos)
@@ -1226,7 +1226,7 @@ struct llama_server_context
         std::vector<completion_token_output> probs = {};
         if (!slot.params.stream && slot.stopped_word)
         {
-            const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
+            const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false, true);
             probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
         }
         else
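As a rough illustration of what passing true changes, under the same assumption that the fourth parameter enables special-token parsing (hypothetical snippet, not part of this commit):

    // assumes ctx is an initialized llama_context whose model vocabulary
    // defines <|im_start|> as a special token
    const std::string s = "<|im_start|>user";

    // previous behavior: the marker is split into ordinary text tokens
    const std::vector<llama_token> as_text    = ::llama_tokenize(ctx, s, false);

    // behavior after this commit: the marker maps to its single token id
    const std::vector<llama_token> as_special = ::llama_tokenize(ctx, s, false, true);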