server: allow json array in prompt or content
In addition to the current string-valued `prompt` or `content`, the server now accepts an array of strings and numbers representing tokens. This allows direct token input: special tokens can be handled on the frontend while the JSON data is constructed, before it is sent to the server, so the server does not need to recognize or parse special tokens in textual input. With this, the BOS and EOS tokens used by llama-2-chat models can be passed explicitly.
parent e782c9e735
commit 1a61c1a5e1
2 changed files with 54 additions and 8 deletions
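For example, a frontend implementing the llama-2-chat template can now send the prompt as a mixed array and inject BOS/EOS itself. A minimal sketch of such a request body, assuming the LLaMA vocabulary where token 1 is BOS and token 2 is EOS (the chat markup and values shown are illustrative, not prescribed by this change):

```json
{
  "prompt": [
    1, "[INST] Hello, llama! [/INST]",
    " Hi! How can I help?", 2,
    1, "[INST] Write a haiku about servers. [/INST]"
  ]
}
```

Because the first array element here is a number, the server adds neither the leading space nor an extra BOS; the frontend stays in full control of the special tokens.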
@@ -126,7 +126,7 @@ node .
 
     `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
 
-    `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluated. A space is inserted in the front like main.cpp does.
+    `prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluated. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.
 
     `stop`: Specify a JSON array of stopping strings.
     These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
@@ -188,6 +188,7 @@ struct llama_server_context
     size_t n_past = 0;
     size_t n_remain = 0;
 
+    json prompt;
     std::vector<llama_token> embd;
     std::vector<llama_token> last_n_tokens;
@@ -257,10 +258,55 @@ struct llama_server_context
         return true;
     }
 
+    std::vector<llama_token> tokenizePrompt(void)
+    {
+        std::vector<llama_token> prompt_tokens;
+
+        if (prompt.is_array())
+        {
+            bool first = true;
+            for (const auto& p : prompt)
+            {
+                if (p.is_string())
+                {
+                    auto s = p.template get<std::string>();
+                    std::vector<llama_token> p;
+                    if (first)
+                    {
+                        s.insert(0, 1, ' '); // add a space if it's the first
+                        p = ::llama_tokenize(ctx, s, true); // also add BOS
+                        first = false;
+                    }
+                    else
+                    {
+                        p = ::llama_tokenize(ctx, s, false);
+                    }
+                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+                }
+                else
+                {
+                    if (first)
+                    {
+                        first = false;
+                    }
+                    prompt_tokens.push_back(p.template get<llama_token>());
+                }
+            }
+        }
+        else
+        {
+            auto s = prompt.template get<std::string>();
+            s.insert(0, 1, ' '); // always add a first space
+            prompt_tokens = ::llama_tokenize(ctx, s, true);
+        }
+
+        return prompt_tokens;
+    }
+
     void loadPrompt()
     {
-        params.prompt.insert(0, 1, ' '); // always add a first space
-        std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
+        auto prompt_tokens = tokenizePrompt();
         num_prompt_tokens = prompt_tokens.size();
 
         if (params.n_keep < 0)
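To summarize the behavior of the new tokenizePrompt(): a plain string prompt is handled exactly as before, while in the array form only the first element, and only if it is a string, receives the leading space and the BOS token; bare numbers are appended as token ids unchanged. A small sketch, where 123 is a placeholder token id rather than a real one:

```json
{ "prompt": "Hello world" }
```

is tokenized like in main.cpp, with a space prepended and BOS added, whereas

```json
{ "prompt": ["Hello", " world", 123] }
```

prepends the space and BOS only for "Hello", tokenizes " world" without BOS, and appends 123 verbatim.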
@@ -954,7 +1000,7 @@ static json format_final_response(llama_server_context &llama, const std::string
         {"tokens_predicted", llama.num_tokens_predicted},
         {"tokens_evaluated", llama.num_prompt_tokens},
         {"generation_settings", format_generation_settings(llama)},
-        {"prompt", llama.params.prompt},
+        {"prompt", llama.prompt},
         {"truncated", llama.truncated},
         {"stopped_eos", llama.stopped_eos},
         {"stopped_word", llama.stopped_word},
@@ -1015,8 +1061,8 @@ static void parse_options_completion(const json &body, llama_server_context &lla
     llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
     llama.params.n_keep = body.value("n_keep", default_params.n_keep);
     llama.params.seed = body.value("seed", default_params.seed);
-    llama.params.prompt = body.value("prompt", default_params.prompt);
     llama.params.n_probs = body.value("n_probs", default_params.n_probs);
+    llama.prompt = body["prompt"];
 
     llama.params.logit_bias.clear();
     if (body.value("ignore_eos", false))
@@ -1258,8 +1304,8 @@ int main(int argc, char **argv)
         auto lock = llama.lock();
 
         const json body = json::parse(req.body);
-        const std::string content = body.value("content", "");
-        const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
+        llama.prompt = body["content"];
+        const std::vector<llama_token> tokens = llama.tokenizePrompt();
         const json data = format_tokenizer_response(tokens);
         return res.set_content(data.dump(), "application/json"); });
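The tokenizer endpoint goes through the same path now, so `content` may also be a string or an array. A hedged example request (the response shape comes from format_tokenizer_response, which is unchanged and not shown in this diff):

```json
{ "content": ["Hello", " world", 2] }
```

The response should carry the resulting token ids, as before.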
@@ -1271,7 +1317,7 @@ int main(int argc, char **argv)
 
         llama.rewind();
         llama_reset_timings(llama.ctx);
-        llama.params.prompt = body.value("content", "");
+        llama.prompt = body["content"];
         llama.params.n_predict = 0;
         llama.loadPrompt();
         llama.beginCompletion();