server support for system, prefix, and suffix prompts with special tokens
parent 96981f37b1
commit 5872e4f4da
2 changed files with 1813 additions and 1686 deletions
@@ -200,6 +200,24 @@ node index.js
 
     `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 
+    `system`: Set the system prompt added before the text prompt (arrays). It is independent of `system_prompt` above
+    and should not be used together with it.
+
+    `input_prefix`: Set the prefix added to input text prompt lines.
+
+    `input_suffix`: Set the suffix added to input text prompt lines.
+
+    The system, input_prefix, and input_suffix prompts are tokenized with the special
+    tokens that some models require to work correctly. Using these three prompts
+    enables the server API to support a fully externally accumulated chat history
+    that alternates between user inputs and generated outputs line by line, with
+    the desired system header, input_prefix, and input_suffix used to delineate
+    user and generated lines, without relying on any context memory in the server.
+    For this to work correctly, input prompts must not contain any hard line feeds,
+    so that the prompt array alternates between user input and generated output on
+    every line. Hard line feeds in input prompts need to be replaced with the ASCII
+    sequence `\n` or a space (see the request sketch after this hunk).
+
 - **POST** `/tokenize`: Tokenize a given text.
 
     *Options:*
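For illustration, a minimal client sketch exercising the three new fields against `/completion` (an editorial example, not part of the commit). It assumes the same cpp-httplib and nlohmann/json libraries that the server code below uses; the host, port, prompt strings, and prefix/suffix templates are hypothetical placeholders:

```cpp
// Build a /completion request whose prompt array carries an externally
// accumulated chat history, alternating user and generated lines.
#include <cstdio>
#include <httplib.h>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    json body = {
        {"system",       "You are a helpful assistant."},  // sent once, before the history
        {"input_prefix", "### Instruction:\n"},            // wrapped around user lines only
        {"input_suffix", "\n### Response:\n"},
        // even entries are user inputs, odd entries are previously generated
        // outputs; no hard line feeds inside any entry
        {"prompt", {"Hello, who are you?",
                    "I am an assistant.",
                    "What can you do?"}},
        {"n_predict", 128}
    };

    httplib::Client cli("localhost", 8080);                // hypothetical endpoint
    auto res = cli.Post("/completion", body.dump(), "application/json");
    if (res && res->status == 200) {
        printf("%s\n", json::parse(res->body)["content"].get<std::string>().c_str());
    }
    return 0;
}
```

Because `system`, `input_prefix`, and `input_suffix` are tokenized with special tokens enabled, template markers inside them map to the model's special tokens instead of being tokenized as plain text.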
@@ -208,6 +226,14 @@ node index.js
 
     Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
 
+- **POST** `/tokenizes`: Tokenize a given text with special tokens.
+
+    *Options:*
+
+    `content`: Set the text to tokenize with special tokens.
+
+    Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
+
 - **POST** `/detokenize`: Convert tokens to text.
 
     *Options:*
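A matching usage sketch for the new route (again an editorial example, not part of the commit; host, port, and the content string are placeholders):

```cpp
// POST a string to /tokenizes and print the raw JSON response,
// which carries the token ids (e.g. {"tokens": [...]}).
#include <cstdio>
#include <httplib.h>
#include <nlohmann/json.hpp>

int main() {
    httplib::Client cli("localhost", 8080);
    nlohmann::json body = {{"content", "<s>[INST] Hello [/INST]"}};
    auto res = cli.Post("/tokenizes", body.dump(), "application/json");
    if (res && res->status == 200) {
        printf("%s\n", res->body.c_str());
    }
    return 0;
}
```

Unlike `/tokenize`, the content is tokenized with special tokens enabled, so a marker such as `<s>` is mapped to the model's special token rather than split into plain-text pieces.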
@@ -183,6 +183,7 @@ struct slot_params
 
     std::vector<std::string> antiprompt;
 
+    json system;
     json input_prefix;
     json input_suffix;
 };
@@ -603,38 +604,83 @@ struct llama_server_context
         system_tokens.clear();
     }
 
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
+    std::vector<llama_token> tokenize(const json & json_prompt,
+                                      bool add_bos, bool special=false,
+                                      const json & json_system=NULL,
+                                      const json & json_prefix=NULL,
+                                      const json & json_suffix=NULL) const
     {
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
 
+        // to support short-term learning from chat context,
+        // the first line in the array is a query, the next line
+        // is generated text, the next line is the next query, ...
+        bool user_input = true;
+
+        // don't add system/prefix/suffix if this is not a normal tokenize call
+        bool add_params = add_bos;
+
+        std::string params_system = "", params_input_prefix = "", params_input_suffix = "";
+        if (json_system != NULL)
+            if (json_system.is_string())
+                params_system = json_system.template get<std::string>();
+        if (json_prefix != NULL)
+            if (json_prefix.is_string())
+                params_input_prefix = json_prefix.template get<std::string>();
+        if (json_suffix != NULL)
+            if (json_suffix.is_string())
+                params_input_suffix = json_suffix.template get<std::string>();
+
+        if (add_params && (params_system.size() > 1)) {
+            // add the system prompt before the conversation input
+            LOG("system: '%s'\n", params_system.c_str());
+            std::vector<llama_token> system;
+            system = ::llama_tokenize(ctx, params_system, false, true);
+            prompt_tokens.insert(prompt_tokens.end(), system.begin(), system.end());
+            LOG("prompt: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, system).c_str());
+        }
+
         if (json_prompt.is_array())
         {
             bool first = true;
             for (const auto& p : json_prompt)
             {
                 if (p.is_string())
                 {
                     auto s = p.template get<std::string>();
                     std::vector<llama_token> p;
-                    if (first)
-                    {
-                        p = ::llama_tokenize(ctx, s, add_bos);
-                        first = false;
-                    }
-                    else
-                    {
-                        p = ::llama_tokenize(ctx, s, false);
-                    }
+
+                    if (add_params && user_input && (params_input_prefix.size() > 1)) {
+                        LOG("input prefix: '%s'\n", params_input_prefix.c_str());
+                        std::vector<llama_token> line_pfx;
+                        line_pfx = ::llama_tokenize(ctx, params_input_prefix, add_bos, true);
+                        prompt_tokens.insert(prompt_tokens.end(), line_pfx.begin(), line_pfx.end());
+                        LOG("prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_pfx).c_str());
+
+                        // bos has been added
+                        add_bos = false;
+                    }
+
+                    p = ::llama_tokenize(ctx, s, add_bos, special);
+                    // bos has been added
+                    add_bos = false;
+
+                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, p).c_str());
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+
+                    if (add_params && user_input && (params_input_suffix.size() > 1)) {
+                        LOG("input suffix: '%s'\n", params_input_suffix.c_str());
+                        std::vector<llama_token> line_sfx;
+                        line_sfx = ::llama_tokenize(ctx, params_input_suffix, false, true);
+                        LOG("suffix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_sfx).c_str());
+                        prompt_tokens.insert(prompt_tokens.end(), line_sfx.begin(), line_sfx.end());
+                    }
+
+                    user_input = !user_input;
                 }
                 else
                 {
                     if (first)
                     {
                         first = false;
                     }
                     prompt_tokens.push_back(p.template get<llama_token>());
                 }
             }
@@ -642,7 +688,33 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+
+            std::vector<llama_token> p;
+
+            if (add_params && (params_input_prefix.size() > 1)) {
+                LOG("input prefix: '%s'\n", params_input_prefix.c_str());
+                std::vector<llama_token> line_pfx;
+                line_pfx = ::llama_tokenize(ctx, params_input_prefix, add_bos, true);
+                prompt_tokens.insert(prompt_tokens.end(), line_pfx.begin(), line_pfx.end());
+                LOG("prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_pfx).c_str());
+                // bos has been added
+                add_bos = false;
+            }
+
+            p = ::llama_tokenize(ctx, s, add_bos, special);
+            add_bos = false;
+
+            LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, p).c_str());
+            prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+
+            // Add the suffix if defined
+            if (add_params && (params_input_suffix.size() > 1)) {
+                LOG("input suffix: '%s'\n", params_input_suffix.c_str());
+                std::vector<llama_token> line_sfx;
+                line_sfx = ::llama_tokenize(ctx, params_input_suffix, false, true);
+                LOG("suffix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_sfx).c_str());
+                prompt_tokens.insert(prompt_tokens.end(), line_sfx.begin(), line_sfx.end());
+            }
         }
 
         return prompt_tokens;
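To summarize what the reworked tokenize() assembles, here is a string-level analogue (an editorial sketch, not part of the commit). It mirrors the alternation logic above while omitting token-level details such as BOS handling, the size() > 1 guards, and special-token tokenization:

```cpp
// String-level picture of the prompt layout produced by tokenize():
// the system prompt first, then prefix/suffix wrapped around user lines
// only, with array entries alternating user / generated.
#include <cstdio>
#include <string>
#include <vector>

std::string layout(const std::vector<std::string> & lines,
                   const std::string & system,
                   const std::string & prefix,
                   const std::string & suffix) {
    std::string out = system;      // system prompt goes first
    bool user_input = true;        // the first array entry is a user line
    for (const auto & s : lines) {
        if (user_input) { out += prefix; }
        out += s;
        if (user_input) { out += suffix; }
        user_input = !user_input;  // toggle user/generated every line
    }
    return out;
}

int main() {
    // hypothetical markers and history
    printf("%s\n", layout({"u1", "g1", "u2"}, "<sys>", "<user>", "</user>").c_str());
    // prints: <sys><user>u1</user>g1<user>u2</user>
    return 0;
}
```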
@@ -694,7 +766,17 @@ struct llama_server_context
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
 
-        // infill
+        // system prompt
+        if (data.count("system") != 0)
+        {
+            slot->params.system = data["system"];
+        }
+        else
+        {
+            slot->params.system = "";
+        }
+
+        // infill, prompt prefix/suffix
         if (data.count("input_prefix") != 0)
         {
             slot->params.input_prefix = data["input_prefix"];
@@ -1150,6 +1232,9 @@ struct llama_server_context
             {"mirostat_tau", slot.sparams.mirostat_tau},
             {"mirostat_eta", slot.sparams.mirostat_eta},
             {"penalize_nl",  slot.sparams.penalize_nl},
+            {"system",       slot.params.system},
+            {"input_prefix", slot.params.input_prefix},
+            {"input_suffix", slot.params.input_suffix},
             {"stop",         slot.params.antiprompt},
             {"n_predict",    slot.params.n_predict},
             {"n_keep",       params.n_keep},
@@ -1564,7 +1649,11 @@ struct llama_server_context
             }
             else
             {
-                prompt_tokens = tokenize(slot.prompt, system_prompt.empty());  // add BOS if there isn't system prompt
+                prompt_tokens = tokenize(slot.prompt, system_prompt.empty(),
+                                         false,
+                                         slot.params.system,
+                                         slot.params.input_prefix,
+                                         slot.params.input_suffix);  // add BOS if there isn't system prompt
             }
 
             slot.num_prompt_tokens = prompt_tokens.size();
@@ -2414,6 +2503,18 @@ int main(int argc, char **argv)
                 return res.set_content(data.dump(), "application/json");
             });
 
+    svr.Post("/tokenizes", [&llama](const httplib::Request &req, httplib::Response &res)
+            {
+                const json body = json::parse(req.body);
+                std::vector<llama_token> tokens;
+                if (body.count("content") != 0)
+                {
+                    tokens = llama.tokenize(body["content"], false, true);
+                }
+                const json data = format_tokenizer_response(tokens);
+                return res.set_content(data.dump(), "application/json");
+            });
+
     svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
             {
                 const json body = json::parse(req.body);