server support for system, prefix, and suffix prompts with special tokens

Wile E. Coyote 2023-10-22 21:45:30 -04:00
parent 96981f37b1
commit 5872e4f4da
2 changed files with 1813 additions and 1686 deletions


@@ -200,6 +200,24 @@ node index.js

`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
`system`: Set the system prompt added before the text prompt (arrays). It is independent of `system_prompt` above and should not be used together with it.

`input_prefix`: Set the prefix added to each user input line of the text prompt.

`input_suffix`: Set the suffix added to each user input line of the text prompt.

The `system`, `input_prefix`, and `input_suffix` values are tokenized with the special tokens that some models require to work correctly. Together, these three prompts let the server API consume a complete, externally accumulated chat history: the prompt array alternates between user inputs and generated outputs line by line, with the desired system header, `input_prefix`, and `input_suffix` delineating user and generated lines, so the server does not need to keep any context memory of its own. For this to work, input prompts must not contain hard line feeds, so that the prompt array really does alternate between user input and generated output on every line; replace any hard line feeds inside an input prompt with the ASCII `\n` sequence or a space.
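A minimal sketch of a `/completion` request using these fields (assuming the server listens on the default `localhost:8080`; the ChatML-style markers are placeholders only, use whatever special tokens your model expects):

```bash
curl --request POST \
    --url http://localhost:8080/completion \
    --header "Content-Type: application/json" \
    --data '{
        "system": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
        "input_prefix": "<|im_start|>user\n",
        "input_suffix": "<|im_end|>\n<|im_start|>assistant\n",
        "prompt": [
            "What is the capital of France?",
            "The capital of France is Paris.<|im_end|>",
            "And of Italy?"
        ],
        "n_predict": 64
    }'
```

For a request like this the server tokenizes, in order: `system`, then `input_prefix` + user line + `input_suffix` for each user entry, with the intervening generated entries tokenized as-is in between. The prompt therefore ends in an open assistant turn and generation continues from there.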
- **POST** `/tokenize`: Tokenize a given text.

*Options:*

@@ -208,6 +226,14 @@ node index.js

Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
- **POST** `/tokenizes`: Tokenize a given text with special tokens.
*Options:*
`content`: Set the text to tokenize with special tokens.
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
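As a sketch (again assuming the default `localhost:8080`, with a ChatML marker as a placeholder), the difference from `/tokenize` is that marker text such as `<|im_start|>` is mapped to its special token IDs, if the model defines them, instead of being tokenized as plain text:

```bash
curl --request POST \
    --url http://localhost:8080/tokenizes \
    --header "Content-Type: application/json" \
    --data '{"content": "<|im_start|>user"}'
```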
- **POST** `/detokenize`: Convert tokens to text.

*Options:*


@@ -183,6 +183,7 @@ struct slot_params
    std::vector<std::string> antiprompt;
    json system;
    json input_prefix;
    json input_suffix;
};
@@ -603,38 +604,83 @@ struct llama_server_context
        system_tokens.clear();
    }

    std::vector<llama_token> tokenize(const json & json_prompt,
                                      bool add_bos, bool special=false,
                                      const json & json_system=NULL,
                                      const json & json_prefix=NULL,
                                      const json & json_suffix=NULL) const
    {
        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
        // or the first element of the json_prompt array is a string.
        std::vector<llama_token> prompt_tokens;

        // to support short term learning from chat context,
        // first line in array is query, next line
        // is generated text, next line is next query ...
        bool user_input = true;
        // don't add sys/prefix/suffix if not a normal tokenize
        bool add_params = add_bos;
        std::string params_system = "", params_input_prefix = "", params_input_suffix = "";
        if (json_system != NULL)
            if (json_system.is_string())
                params_system = json_system.template get<std::string>();
        if (json_prefix != NULL)
            if (json_prefix.is_string())
                params_input_prefix = json_prefix.template get<std::string>();
        if (json_suffix != NULL)
            if (json_suffix.is_string())
                params_input_suffix = json_suffix.template get<std::string>();

        if (add_params && (params_system.size() > 1)) {
            // add the system prompt before the conversation input
            LOG("system: '%s'\n", params_system.c_str());
            std::vector<llama_token> system;
            system = ::llama_tokenize(ctx, params_system, false, true);
            prompt_tokens.insert(prompt_tokens.end(), system.begin(), system.end());
            LOG("prompt: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, system).c_str());
        }
        if (json_prompt.is_array())
        {
            for (const auto& p : json_prompt)
            {
                if (p.is_string())
                {
                    auto s = p.template get<std::string>();
                    std::vector<llama_token> p;

                    if (add_params && user_input && (params_input_prefix.size() > 1)) {
                        LOG("input prefix: '%s'\n", params_input_prefix.c_str());
                        std::vector<llama_token> line_pfx;
                        line_pfx = ::llama_tokenize(ctx, params_input_prefix, add_bos, true);
                        prompt_tokens.insert(prompt_tokens.end(), line_pfx.begin(), line_pfx.end());
                        LOG("prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_pfx).c_str());
                        // bos has been added
                        add_bos = false;
                    }

                    p = ::llama_tokenize(ctx, s, add_bos, special);
                    // bos has been added
                    add_bos = false;
                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, p).c_str());
                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());

                    if (add_params && user_input && (params_input_suffix.size() > 1)) {
                        LOG("input suffix: '%s'\n", params_input_suffix.c_str());
                        std::vector<llama_token> line_sfx;
                        line_sfx = ::llama_tokenize(ctx, params_input_suffix, false, true);
                        LOG("suffix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_sfx).c_str());
                        prompt_tokens.insert(prompt_tokens.end(), line_sfx.begin(), line_sfx.end());
                    }
                    user_input = !user_input;
                }
                else
                {
                    prompt_tokens.push_back(p.template get<llama_token>());
                }
            }
@@ -642,7 +688,33 @@ struct llama_server_context
        else
        {
            auto s = json_prompt.template get<std::string>();
            std::vector<llama_token> p;

            if (add_params && (params_input_prefix.size() > 1)) {
                LOG("input prefix: '%s'\n", params_input_prefix.c_str());
                std::vector<llama_token> line_pfx;
                line_pfx = ::llama_tokenize(ctx, params_input_prefix, add_bos, true);
                prompt_tokens.insert(prompt_tokens.end(), line_pfx.begin(), line_pfx.end());
                LOG("prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_pfx).c_str());
                // bos has been added
                add_bos = false;
            }

            p = ::llama_tokenize(ctx, s, add_bos, special);
            add_bos = false;
            LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, p).c_str());
            prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());

            // Add the suffix if defined
            if (add_params && (params_input_suffix.size() > 1)) {
                LOG("input suffix: '%s'\n", params_input_suffix.c_str());
                std::vector<llama_token> line_sfx;
                line_sfx = ::llama_tokenize(ctx, params_input_suffix, false, true);
                LOG("suffix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_sfx).c_str());
                prompt_tokens.insert(prompt_tokens.end(), line_sfx.begin(), line_sfx.end());
            }
        }

        return prompt_tokens;
@@ -694,7 +766,17 @@ struct llama_server_context
        slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
        slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);

        // system prompt
        if (data.count("system") != 0)
        {
            slot->params.system = data["system"];
        }
        else
        {
            slot->params.system = "";
        }

        // infill, prompt prefix/suffix
        if (data.count("input_prefix") != 0)
        {
            slot->params.input_prefix = data["input_prefix"];
@@ -1150,6 +1232,9 @@ struct llama_server_context
            {"mirostat_tau", slot.sparams.mirostat_tau},
            {"mirostat_eta", slot.sparams.mirostat_eta},
            {"penalize_nl", slot.sparams.penalize_nl},
            {"system", slot.params.system},
            {"input_prefix", slot.params.input_prefix},
            {"input_suffix", slot.params.input_suffix},
            {"stop", slot.params.antiprompt},
            {"n_predict", slot.params.n_predict},
            {"n_keep", params.n_keep},
@@ -1564,7 +1649,11 @@ struct llama_server_context
            }
            else
            {
                prompt_tokens = tokenize(slot.prompt, system_prompt.empty(),
                                         false,
                                         slot.params.system,
                                         slot.params.input_prefix,
                                         slot.params.input_suffix); // add BOS if there isn't system prompt
            }

            slot.num_prompt_tokens = prompt_tokens.size();
@@ -2414,6 +2503,18 @@ int main(int argc, char **argv)
                return res.set_content(data.dump(), "application/json");
            });

    svr.Post("/tokenizes", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                const json body = json::parse(req.body);
                std::vector<llama_token> tokens;
                if (body.count("content") != 0)
                {
                    tokens = llama.tokenize(body["content"], false, true);
                }
                const json data = format_tokenizer_response(tokens);
                return res.set_content(data.dump(), "application/json");
            });

    svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                const json body = json::parse(req.body);