server support for system, prefix, and suffix prompts with special tokens

Wile E. Coyote 2023-10-22 21:45:30 -04:00
parent 96981f37b1
commit 5872e4f4da
2 changed files with 1813 additions and 1686 deletions


@@ -200,6 +200,24 @@ node index.js

`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
`system`: Set the system prompt added before the text prompt (arrays). It is independent of `system_prompt` above and should not be used together with it.

`input_prefix`: Set the prefix added to each user input line of the text prompt.

`input_suffix`: Set the suffix added to each user input line of the text prompt.

The `system`, `input_prefix`, and `input_suffix` values are tokenized with the special tokens that some models require to work correctly. Together, these three prompts let the server API consume a complete, externally accumulated chat history: the prompt array alternates between user inputs and generated outputs line by line, with the desired system header, `input_prefix`, and `input_suffix` delineating user and generated lines, so the server does not need to keep any context memory of its own. For this to work, input prompts must not contain hard line feeds, so that the prompt array really does alternate between user input and generated output on every line; replace any hard line feeds inside an input prompt with the ASCII `\n` sequence or a space.
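A minimal sketch of a `/completion` request using these fields (assuming the server listens on the default `localhost:8080`; the ChatML-style markers are placeholders only, use whatever special tokens your model expects):

```bash
curl --request POST \
    --url http://localhost:8080/completion \
    --header "Content-Type: application/json" \
    --data '{
        "system": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
        "input_prefix": "<|im_start|>user\n",
        "input_suffix": "<|im_end|>\n<|im_start|>assistant\n",
        "prompt": [
            "What is the capital of France?",
            "The capital of France is Paris.<|im_end|>",
            "And of Italy?"
        ],
        "n_predict": 64
    }'
```

For a request like this the server tokenizes, in order: `system`, then `input_prefix` + user line + `input_suffix` for each user entry, with the intervening generated entries tokenized as-is in between. The prompt therefore ends in an open assistant turn and generation continues from there.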
- **POST** `/tokenize`: Tokenize a given text.

*Options:*

@@ -208,6 +226,14 @@ node index.js

Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
- **POST** `/tokenizes`: Tokenize a given text with special tokens.
*Options:*
`content`: Set the text to tokenize with special tokens.
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
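As a sketch (again assuming the default `localhost:8080`, with a ChatML marker as a placeholder), the difference from `/tokenize` is that marker text such as `<|im_start|>` is mapped to its special token IDs, if the model defines them, instead of being tokenized as plain text:

```bash
curl --request POST \
    --url http://localhost:8080/tokenizes \
    --header "Content-Type: application/json" \
    --data '{"content": "<|im_start|>user"}'
```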
- **POST** `/detokenize`: Convert tokens to text.

*Options:*


@@ -183,6 +183,7 @@ struct slot_params
    std::vector<std::string> antiprompt;
    json system;
    json input_prefix;
    json input_suffix;
};
@@ -603,38 +604,83 @@ struct llama_server_context
        system_tokens.clear();
    }

    std::vector<llama_token> tokenize(const json & json_prompt,
                                      bool add_bos, bool special=false,
                                      const json & json_system=NULL,
                                      const json & json_prefix=NULL,
                                      const json & json_suffix=NULL) const
    {
        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
        // or the first element of the json_prompt array is a string.
        std::vector<llama_token> prompt_tokens;

        // to support short term learning from chat context,
        // first line in array is query, next line
        // is generated text, next line is next query ...
        bool user_input = true;
        // don't add sys/prefix/suffix if not a normal tokenize
        bool add_params = add_bos;
        std::string params_system = "", params_input_prefix = "", params_input_suffix = "";
        if (json_system != NULL)
            if (json_system.is_string())
                params_system = json_system.template get<std::string>();
        if (json_prefix != NULL)
            if (json_prefix.is_string())
                params_input_prefix = json_prefix.template get<std::string>();
        if (json_suffix != NULL)
            if (json_suffix.is_string())
                params_input_suffix = json_suffix.template get<std::string>();

        if (add_params && (params_system.size() > 1)) {
            // add the system prompt before the conversation input
            LOG("system: '%s'\n", params_system.c_str());
            std::vector<llama_token> system;
            system = ::llama_tokenize(ctx, params_system, false, true);
            prompt_tokens.insert(prompt_tokens.end(), system.begin(), system.end());
            LOG("prompt: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, system).c_str());
        }
        if (json_prompt.is_array())
        {
            for (const auto& p : json_prompt)
            {
                if (p.is_string())
                {
                    auto s = p.template get<std::string>();
                    std::vector<llama_token> p;

                    if (add_params && user_input && (params_input_prefix.size() > 1)) {
                        LOG("input prefix: '%s'\n", params_input_prefix.c_str());
                        std::vector<llama_token> line_pfx;
                        line_pfx = ::llama_tokenize(ctx, params_input_prefix, add_bos, true);
                        prompt_tokens.insert(prompt_tokens.end(), line_pfx.begin(), line_pfx.end());
                        LOG("prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_pfx).c_str());
                        // bos has been added
                        add_bos = false;
                    }

                    p = ::llama_tokenize(ctx, s, add_bos, special);
                    // bos has been added
                    add_bos = false;
                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, p).c_str());
                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());

                    if (add_params && user_input && (params_input_suffix.size() > 1)) {
                        LOG("input suffix: '%s'\n", params_input_suffix.c_str());
                        std::vector<llama_token> line_sfx;
                        line_sfx = ::llama_tokenize(ctx, params_input_suffix, false, true);
                        LOG("suffix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_sfx).c_str());
                        prompt_tokens.insert(prompt_tokens.end(), line_sfx.begin(), line_sfx.end());
                    }
                    user_input = !user_input;
                }
                else
                {
                    prompt_tokens.push_back(p.template get<llama_token>());
                }
            }
@@ -642,7 +688,33 @@ struct llama_server_context
        else
        {
            auto s = json_prompt.template get<std::string>();
            std::vector<llama_token> p;

            if (add_params && (params_input_prefix.size() > 1)) {
                LOG("input prefix: '%s'\n", params_input_prefix.c_str());
                std::vector<llama_token> line_pfx;
                line_pfx = ::llama_tokenize(ctx, params_input_prefix, add_bos, true);
                prompt_tokens.insert(prompt_tokens.end(), line_pfx.begin(), line_pfx.end());
                LOG("prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_pfx).c_str());
                // bos has been added
                add_bos = false;
            }

            p = ::llama_tokenize(ctx, s, add_bos, special);
            add_bos = false;
            LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, p).c_str());
            prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());

            // Add the suffix if defined
            if (add_params && (params_input_suffix.size() > 1)) {
                LOG("input suffix: '%s'\n", params_input_suffix.c_str());
                std::vector<llama_token> line_sfx;
                line_sfx = ::llama_tokenize(ctx, params_input_suffix, false, true);
                LOG("suffix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_sfx).c_str());
                prompt_tokens.insert(prompt_tokens.end(), line_sfx.begin(), line_sfx.end());
            }
        }

        return prompt_tokens;
@@ -694,7 +766,17 @@ struct llama_server_context
        slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
        slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);

        // system prompt
        if (data.count("system") != 0)
        {
            slot->params.system = data["system"];
        }
        else
        {
            slot->params.system = "";
        }

        // infill, prompt prefix/suffix
        if (data.count("input_prefix") != 0)
        {
            slot->params.input_prefix = data["input_prefix"];
@@ -1150,6 +1232,9 @@ struct llama_server_context
            {"mirostat_tau", slot.sparams.mirostat_tau},
            {"mirostat_eta", slot.sparams.mirostat_eta},
            {"penalize_nl", slot.sparams.penalize_nl},
            {"system", slot.params.system},
            {"input_prefix", slot.params.input_prefix},
            {"input_suffix", slot.params.input_suffix},
            {"stop", slot.params.antiprompt},
            {"n_predict", slot.params.n_predict},
            {"n_keep", params.n_keep},
@@ -1564,7 +1649,11 @@ struct llama_server_context
            }
            else
            {
                prompt_tokens = tokenize(slot.prompt, system_prompt.empty(),
                                         false,
                                         slot.params.system,
                                         slot.params.input_prefix,
                                         slot.params.input_suffix); // add BOS if there isn't system prompt
            }

            slot.num_prompt_tokens = prompt_tokens.size();
@@ -2414,6 +2503,18 @@ int main(int argc, char **argv)
                return res.set_content(data.dump(), "application/json");
            });

    svr.Post("/tokenizes", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                const json body = json::parse(req.body);
                std::vector<llama_token> tokens;
                if (body.count("content") != 0)
                {
                    tokens = llama.tokenize(body["content"], false, true);
                }
                const json data = format_tokenizer_response(tokens);
                return res.set_content(data.dump(), "application/json");
            });

    svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                const json body = json::parse(req.body);