server support for system, prefix, and suffix prompts with special tokens
This commit is contained in:
parent 96981f37b1
commit 5872e4f4da

2 changed files with 1813 additions and 1686 deletions
@@ -200,6 +200,24 @@ node index.js

     `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

+    `system`: Set the system prompt added before the text prompt (arrays). It is independent of `system_prompt` above and should not be used together with it.
+
+    `input_prefix`: Set the prefix added to input text prompt lines.
+
+    `input_suffix`: Set the suffix added to input text prompt lines.
+
+    The `system`, `input_prefix`, and `input_suffix` strings are tokenized with the special tokens that some models require to work correctly. Using these three prompts enables the server API to support a fully externally accumulated chat history that toggles between user inputs and generated outputs line by line, with the desired system header, `input_prefix`, and `input_suffix` delineating user and generated lines, without relying on any context memory in the server. For this to work correctly, input prompts must not contain hard line feeds, so that the prompt array alternates between user input and generated output on every line; any hard line feed in an input prompt must be replaced with the ASCII `\n` escape sequence or a space.
+
 - **POST** `/tokenize`: Tokenize a given text.

     *Options:*
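To make the new fields concrete, here is a minimal sketch of a `/completion` request body built with nlohmann/json (the same JSON library the server uses). The ChatML-style markers and prompt strings are illustrative assumptions, not values mandated by the server:

```cpp
// Sketch: building a /completion request body with the new fields.
// The alternating "prompt" array carries the whole chat history:
// even entries are user inputs, odd entries are previously generated
// outputs. Markers and prompt strings below are illustrative only;
// use whatever your model expects.
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    json body = {
        {"system",       "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"},
        {"input_prefix", "<|im_start|>user\n"},
        {"input_suffix", "<|im_end|>\n<|im_start|>assistant\n"},
        // history alternates user / generated / user ... with no hard line feeds
        {"prompt", json::array({
            "Hello, who are you?",                 // user input (gets prefix/suffix)
            "I am an assistant. How can I help?",  // generated output (inserted verbatim)
            "Summarize our chat so far."           // next user input
        })},
        {"n_predict", 128}
    };
    std::cout << body.dump(2) << std::endl; // POST this to /completion
}
```

Each even-indexed `prompt` entry is wrapped in `input_prefix`/`input_suffix` as a user line; each odd-indexed entry is inserted verbatim as previously generated output.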
@@ -208,6 +226,14 @@ node index.js

     Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.

+- **POST** `/tokenizes`: Tokenize a given text with special tokens.
+
+    *Options:*
+
+    `content`: Set the text to tokenize with special tokens.
+
+    Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
+
 - **POST** `/detokenize`: Convert tokens to text.

     *Options:*
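A hedged client-side sketch of calling the new endpoint with cpp-httplib and nlohmann/json (the libraries the server itself builds on); the host, port, and the `tokens` response field are assumptions carried over from the existing `/tokenize` endpoint:

```cpp
// Sketch: POSTing to /tokenizes, which parses special tokens in "content".
// Assumes the server listens on localhost:8080 and mirrors /tokenize's
// {"tokens": [...]} response shape (see format_tokenizer_response).
#include <iostream>
#include <httplib.h>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    httplib::Client cli("localhost", 8080);
    const json body = {{"content", "<|im_start|>user\nHello<|im_end|>"}};
    auto res = cli.Post("/tokenizes", body.dump(), "application/json");
    if (res && res->status == 200) {
        for (const auto & tok : json::parse(res->body)["tokens"]) {
            std::cout << tok << ' '; // special markers may come back as single ids (model-dependent)
        }
        std::cout << '\n';
    }
}
```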
@@ -183,6 +183,7 @@ struct slot_params

     std::vector<std::string> antiprompt;

+    json system;
     json input_prefix;
     json input_suffix;
 };
@@ -603,38 +604,83 @@ struct llama_server_context
         system_tokens.clear();
     }

-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
+    std::vector<llama_token> tokenize(const json & json_prompt,
+                                      bool add_bos, bool special = false,
+                                      const json & json_system = nullptr,
+                                      const json & json_prefix = nullptr,
+                                      const json & json_suffix = nullptr) const
     {
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
+
+        // to support short-term learning from the chat context: the first
+        // line in the array is a query, the next line is generated text,
+        // the next line is the next query, and so on
+        bool user_input = true;
+
+        // don't add system/prefix/suffix unless this is a normal tokenize call
+        bool add_params = add_bos;
+
+        std::string params_system, params_input_prefix, params_input_suffix;
+        if (json_system != nullptr && json_system.is_string()) {
+            params_system = json_system.template get<std::string>();
+        }
+        if (json_prefix != nullptr && json_prefix.is_string()) {
+            params_input_prefix = json_prefix.template get<std::string>();
+        }
+        if (json_suffix != nullptr && json_suffix.is_string()) {
+            params_input_suffix = json_suffix.template get<std::string>();
+        }
+
+        if (add_params && (params_system.size() > 1)) {
+            // add the system prompt before the conversation input
+            LOG("system: '%s'\n", params_system.c_str());
+            std::vector<llama_token> system = ::llama_tokenize(ctx, params_system, false, true);
+            prompt_tokens.insert(prompt_tokens.end(), system.begin(), system.end());
+            LOG("system tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, system).c_str());
+        }

         if (json_prompt.is_array())
         {
-            bool first = true;
             for (const auto& p : json_prompt)
             {
                 if (p.is_string())
                 {
                     auto s = p.template get<std::string>();
                     std::vector<llama_token> p;
-                    if (first)
-                    {
-                        p = ::llama_tokenize(ctx, s, add_bos);
-                        first = false;
-                    }
-                    else
-                    {
-                        p = ::llama_tokenize(ctx, s, false);
-                    }
+
+                    if (add_params && user_input && (params_input_prefix.size() > 1)) {
+                        LOG("input prefix: '%s'\n", params_input_prefix.c_str());
+                        std::vector<llama_token> line_pfx = ::llama_tokenize(ctx, params_input_prefix, add_bos, true);
+                        prompt_tokens.insert(prompt_tokens.end(), line_pfx.begin(), line_pfx.end());
+                        LOG("prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_pfx).c_str());
+                        add_bos = false; // bos has been added
+                    }
+
+                    p = ::llama_tokenize(ctx, s, add_bos, special);
+                    add_bos = false; // bos has been added
+
+                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, p).c_str());
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+
+                    if (add_params && user_input && (params_input_suffix.size() > 1)) {
+                        LOG("input suffix: '%s'\n", params_input_suffix.c_str());
+                        std::vector<llama_token> line_sfx = ::llama_tokenize(ctx, params_input_suffix, false, true);
+                        LOG("suffix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_sfx).c_str());
+                        prompt_tokens.insert(prompt_tokens.end(), line_sfx.begin(), line_sfx.end());
+                    }
+
+                    user_input = !user_input;
                 }
                 else
                 {
-                    if (first)
-                    {
-                        first = false;
-                    }
                     prompt_tokens.push_back(p.template get<llama_token>());
                 }
             }
@@ -642,7 +688,33 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+            std::vector<llama_token> p;
+
+            if (add_params && (params_input_prefix.size() > 1)) {
+                LOG("input prefix: '%s'\n", params_input_prefix.c_str());
+                std::vector<llama_token> line_pfx = ::llama_tokenize(ctx, params_input_prefix, add_bos, true);
+                prompt_tokens.insert(prompt_tokens.end(), line_pfx.begin(), line_pfx.end());
+                LOG("prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_pfx).c_str());
+                add_bos = false; // bos has been added
+            }
+
+            p = ::llama_tokenize(ctx, s, add_bos, special);
+            add_bos = false;
+
+            LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, p).c_str());
+            prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+
+            // add the suffix if defined
+            if (add_params && (params_input_suffix.size() > 1)) {
+                LOG("input suffix: '%s'\n", params_input_suffix.c_str());
+                std::vector<llama_token> line_sfx = ::llama_tokenize(ctx, params_input_suffix, false, true);
+                LOG("suffix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_sfx).c_str());
+                prompt_tokens.insert(prompt_tokens.end(), line_sfx.begin(), line_sfx.end());
+            }
         }

         return prompt_tokens;
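To see what the reworked function produces for an array prompt, here is a string-level sketch of the concatenation order (placeholder strings stand in for token sequences; it assumes a system prompt and both prefix and suffix are set):

```cpp
// Sketch of the token layout tokenize() builds for an array prompt,
// with strings instead of token ids for readability. The bracketed
// markers are placeholders, not real model tokens.
#include <iostream>
#include <string>
#include <vector>

int main() {
    const std::string sys = "[SYS]", pfx = "[PFX]", sfx = "[SFX]";
    const std::vector<std::string> prompt = {"user0", "gen0", "user1"};

    std::string out = sys;      // system prompt goes first
    bool user_input = true;     // toggles on every array entry
    for (const auto & line : prompt) {
        if (user_input) out += pfx + line + sfx; // user line: wrapped
        else            out += line;             // generated line: verbatim
        user_input = !user_input;
    }
    std::cout << out << '\n';
    // prints: [SYS][PFX]user0[SFX]gen0[PFX]user1[SFX]
}
```

Generated lines pass through unwrapped, which is what lets a client replay an entire chat history without the server holding any state.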
@@ -694,7 +766,17 @@ struct llama_server_context
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);

-        // infill
+        // system prompt
+        if (data.count("system") != 0)
+        {
+            slot->params.system = data["system"];
+        }
+        else
+        {
+            slot->params.system = "";
+        }
+
+        // infill, prompt prefix/suffix
         if (data.count("input_prefix") != 0)
         {
             slot->params.input_prefix = data["input_prefix"];
@@ -1150,6 +1232,9 @@ struct llama_server_context
             {"mirostat_tau", slot.sparams.mirostat_tau},
             {"mirostat_eta", slot.sparams.mirostat_eta},
             {"penalize_nl", slot.sparams.penalize_nl},
+            {"system", slot.params.system},
+            {"input_prefix", slot.params.input_prefix},
+            {"input_suffix", slot.params.input_suffix},
             {"stop", slot.params.antiprompt},
             {"n_predict", slot.params.n_predict},
             {"n_keep", params.n_keep},
@@ -1564,7 +1649,11 @@ struct llama_server_context
             }
             else
             {
-                prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+                prompt_tokens = tokenize(slot.prompt, system_prompt.empty(),
+                                         false,
+                                         slot.params.system,
+                                         slot.params.input_prefix,
+                                         slot.params.input_suffix); // add BOS if there isn't a system prompt
             }

             slot.num_prompt_tokens = prompt_tokens.size();
@@ -2414,6 +2503,18 @@ int main(int argc, char **argv)
                 return res.set_content(data.dump(), "application/json");
             });

+    svr.Post("/tokenizes", [&llama](const httplib::Request &req, httplib::Response &res)
+            {
+                const json body = json::parse(req.body);
+                std::vector<llama_token> tokens;
+                if (body.count("content") != 0)
+                {
+                    tokens = llama.tokenize(body["content"], false, true);
+                }
+                const json data = format_tokenizer_response(tokens);
+                return res.set_content(data.dump(), "application/json");
+            });
+
     svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
             {
                 const json body = json::parse(req.body);
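The only difference from `/tokenize` is the `special` flag forwarded to `::llama_tokenize`. A hedged sketch of the distinction, reusing the overload and logging helpers visible in this diff (the marker string is illustrative, and whether it maps to a single special token depends on the model):

```cpp
// Sketch only: assumes a loaded llama_context * ctx, as inside
// llama_server_context, plus the LOG helpers from server.cpp.
static void compare_tokenize(llama_context * ctx)
{
    const std::string s = "<|im_start|>user";
    // /tokenize path: special-token parsing off, marker spelled out as plain text
    const std::vector<llama_token> plain = ::llama_tokenize(ctx, s, false);
    // /tokenizes path: special-token parsing on, marker may collapse to one id
    const std::vector<llama_token> special = ::llama_tokenize(ctx, s, false, true);
    LOG("plain:   %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, plain).c_str());
    LOG("special: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, special).c_str());
}
```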