server support for system, prefix, and suffix prompts with special tokens
parent 96981f37b1
commit 5872e4f4da
2 changed files with 1813 additions and 1686 deletions
@@ -200,6 +200,24 @@ node index.js
 
     `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 
+    `system`: Set the system prompt added before the text prompt (arrays). It is independent of `system_prompt` above
+    and should not be used together with it.
+
+    `input_prefix`: Set the prefix added to input text prompt lines.
+
+    `input_suffix`: Set the suffix added to input text prompt lines.
+
+    The system, input_prefix, and input_suffix prompts are tokenized with the special
+    tokens that some models require to work correctly. Using these three prompts
+    enables the server API to support a fully externally accumulated chat history
+    that alternates between user inputs and generated outputs line by line, with
+    the desired system header, input_prefix, and input_suffix used to delineate
+    user and generated lines, without relying on any context memory in the server.
+    For this to work correctly, input prompts must not contain any hard line feeds,
+    so that the prompt array alternates between user input and generated output on
+    every line. Hard line feeds in input prompts need to be replaced with the ASCII
+    sequence `\n` or a space (see the request sketch after this hunk).
+
 - **POST** `/tokenize`: Tokenize a given text.
 
     *Options:*
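For illustration, a minimal client sketch exercising the three new fields against `/completion` (an editorial example, not part of the commit). It assumes the same cpp-httplib and nlohmann/json libraries that the server code below uses; the host, port, prompt strings, and prefix/suffix templates are hypothetical placeholders:

```cpp
// Build a /completion request whose prompt array carries an externally
// accumulated chat history, alternating user and generated lines.
#include <cstdio>
#include <httplib.h>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    json body = {
        {"system",       "You are a helpful assistant."},  // sent once, before the history
        {"input_prefix", "### Instruction:\n"},            // wrapped around user lines only
        {"input_suffix", "\n### Response:\n"},
        // even entries are user inputs, odd entries are previously generated
        // outputs; no hard line feeds inside any entry
        {"prompt", {"Hello, who are you?",
                    "I am an assistant.",
                    "What can you do?"}},
        {"n_predict", 128}
    };

    httplib::Client cli("localhost", 8080);                // hypothetical endpoint
    auto res = cli.Post("/completion", body.dump(), "application/json");
    if (res && res->status == 200) {
        printf("%s\n", json::parse(res->body)["content"].get<std::string>().c_str());
    }
    return 0;
}
```

Because `system`, `input_prefix`, and `input_suffix` are tokenized with special tokens enabled, template markers inside them map to the model's special tokens instead of being tokenized as plain text.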
@@ -208,6 +226,14 @@ node index.js
 
     Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
 
+- **POST** `/tokenizes`: Tokenize a given text with special tokens.
+
+    *Options:*
+
+    `content`: Set the text to tokenize with special tokens.
+
+    Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
+
 - **POST** `/detokenize`: Convert tokens to text.
 
     *Options:*
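A matching usage sketch for the new route (again an editorial example, not part of the commit; host, port, and the content string are placeholders):

```cpp
// POST a string to /tokenizes and print the raw JSON response,
// which carries the token ids (e.g. {"tokens": [...]}).
#include <cstdio>
#include <httplib.h>
#include <nlohmann/json.hpp>

int main() {
    httplib::Client cli("localhost", 8080);
    nlohmann::json body = {{"content", "<s>[INST] Hello [/INST]"}};
    auto res = cli.Post("/tokenizes", body.dump(), "application/json");
    if (res && res->status == 200) {
        printf("%s\n", res->body.c_str());
    }
    return 0;
}
```

Unlike `/tokenize`, the content is tokenized with special tokens enabled, so a marker such as `<s>` is mapped to the model's special token rather than split into plain-text pieces.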
@@ -183,6 +183,7 @@ struct slot_params
 
     std::vector<std::string> antiprompt;
 
+    json system;
     json input_prefix;
     json input_suffix;
 };
@@ -603,38 +604,83 @@ struct llama_server_context
         system_tokens.clear();
     }
 
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
+    std::vector<llama_token> tokenize(const json & json_prompt,
+                                      bool add_bos, bool special=false,
+                                      const json & json_system=NULL,
+                                      const json & json_prefix=NULL,
+                                      const json & json_suffix=NULL) const
     {
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
 
+        // to support short-term learning from chat context,
+        // the first line in the array is a query, the next line
+        // is generated text, the next line is the next query, ...
+        bool user_input = true;
+
+        // don't add system/prefix/suffix if this is not a normal tokenize call
+        bool add_params = add_bos;
+
+        std::string params_system = "", params_input_prefix = "", params_input_suffix = "";
+        if (json_system != NULL)
+            if (json_system.is_string())
+                params_system = json_system.template get<std::string>();
+        if (json_prefix != NULL)
+            if (json_prefix.is_string())
+                params_input_prefix = json_prefix.template get<std::string>();
+        if (json_suffix != NULL)
+            if (json_suffix.is_string())
+                params_input_suffix = json_suffix.template get<std::string>();
+
+        if (add_params && (params_system.size() > 1)) {
+            // add the system prompt before the conversation input
+            LOG("system: '%s'\n", params_system.c_str());
+            std::vector<llama_token> system;
+            system = ::llama_tokenize(ctx, params_system, false, true);
+            prompt_tokens.insert(prompt_tokens.end(), system.begin(), system.end());
+            LOG("prompt: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, system).c_str());
+        }
+
         if (json_prompt.is_array())
         {
             bool first = true;
             for (const auto& p : json_prompt)
             {
                 if (p.is_string())
                 {
                     auto s = p.template get<std::string>();
                     std::vector<llama_token> p;
-                    if (first)
-                    {
-                        p = ::llama_tokenize(ctx, s, add_bos);
-                        first = false;
-                    }
-                    else
-                    {
-                        p = ::llama_tokenize(ctx, s, false);
-                    }
+
+                    if (add_params && user_input && (params_input_prefix.size() > 1)) {
+                        LOG("input prefix: '%s'\n", params_input_prefix.c_str());
+                        std::vector<llama_token> line_pfx;
+                        line_pfx = ::llama_tokenize(ctx, params_input_prefix, add_bos, true);
+                        prompt_tokens.insert(prompt_tokens.end(), line_pfx.begin(), line_pfx.end());
+                        LOG("prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_pfx).c_str());
+
+                        // bos has been added
+                        add_bos = false;
+                    }
+
+                    p = ::llama_tokenize(ctx, s, add_bos, special);
+                    // bos has been added
+                    add_bos = false;
+
+                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, p).c_str());
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+
+                    if (add_params && user_input && (params_input_suffix.size() > 1)) {
+                        LOG("input suffix: '%s'\n", params_input_suffix.c_str());
+                        std::vector<llama_token> line_sfx;
+                        line_sfx = ::llama_tokenize(ctx, params_input_suffix, false, true);
+                        LOG("suffix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_sfx).c_str());
+                        prompt_tokens.insert(prompt_tokens.end(), line_sfx.begin(), line_sfx.end());
+                    }
+
+                    user_input = !user_input;
                 }
                 else
                 {
                     if (first)
                     {
                         first = false;
                     }
                     prompt_tokens.push_back(p.template get<llama_token>());
                 }
             }
@@ -642,7 +688,33 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+
+            std::vector<llama_token> p;
+
+            if (add_params && (params_input_prefix.size() > 1)) {
+                LOG("input prefix: '%s'\n", params_input_prefix.c_str());
+                std::vector<llama_token> line_pfx;
+                line_pfx = ::llama_tokenize(ctx, params_input_prefix, add_bos, true);
+                prompt_tokens.insert(prompt_tokens.end(), line_pfx.begin(), line_pfx.end());
+                LOG("prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_pfx).c_str());
+                // bos has been added
+                add_bos = false;
+            }
+
+            p = ::llama_tokenize(ctx, s, add_bos, special);
+            add_bos = false;
+
+            LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, p).c_str());
+            prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+
+            // Add the suffix if defined
+            if (add_params && (params_input_suffix.size() > 1)) {
+                LOG("input suffix: '%s'\n", params_input_suffix.c_str());
+                std::vector<llama_token> line_sfx;
+                line_sfx = ::llama_tokenize(ctx, params_input_suffix, false, true);
+                LOG("suffix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_sfx).c_str());
+                prompt_tokens.insert(prompt_tokens.end(), line_sfx.begin(), line_sfx.end());
+            }
         }
 
         return prompt_tokens;
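To summarize what the reworked tokenize() assembles, here is a string-level analogue (an editorial sketch, not part of the commit). It mirrors the alternation logic above while omitting token-level details such as BOS handling, the size() > 1 guards, and special-token tokenization:

```cpp
// String-level picture of the prompt layout produced by tokenize():
// the system prompt first, then prefix/suffix wrapped around user lines
// only, with array entries alternating user / generated.
#include <cstdio>
#include <string>
#include <vector>

std::string layout(const std::vector<std::string> & lines,
                   const std::string & system,
                   const std::string & prefix,
                   const std::string & suffix) {
    std::string out = system;      // system prompt goes first
    bool user_input = true;        // the first array entry is a user line
    for (const auto & s : lines) {
        if (user_input) { out += prefix; }
        out += s;
        if (user_input) { out += suffix; }
        user_input = !user_input;  // toggle user/generated every line
    }
    return out;
}

int main() {
    // hypothetical markers and history
    printf("%s\n", layout({"u1", "g1", "u2"}, "<sys>", "<user>", "</user>").c_str());
    // prints: <sys><user>u1</user>g1<user>u2</user>
    return 0;
}
```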
@@ -694,7 +766,17 @@ struct llama_server_context
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
 
-        // infill
+        // system prompt
+        if (data.count("system") != 0)
+        {
+            slot->params.system = data["system"];
+        }
+        else
+        {
+            slot->params.system = "";
+        }
+
+        // infill, prompt prefix/suffix
         if (data.count("input_prefix") != 0)
         {
             slot->params.input_prefix = data["input_prefix"];
@@ -1150,6 +1232,9 @@ struct llama_server_context
             {"mirostat_tau", slot.sparams.mirostat_tau},
             {"mirostat_eta", slot.sparams.mirostat_eta},
             {"penalize_nl",  slot.sparams.penalize_nl},
+            {"system",       slot.params.system},
+            {"input_prefix", slot.params.input_prefix},
+            {"input_suffix", slot.params.input_suffix},
             {"stop",         slot.params.antiprompt},
             {"n_predict",    slot.params.n_predict},
             {"n_keep",       params.n_keep},
@@ -1564,7 +1649,11 @@ struct llama_server_context
             }
             else
             {
-                prompt_tokens = tokenize(slot.prompt, system_prompt.empty());  // add BOS if there isn't system prompt
+                prompt_tokens = tokenize(slot.prompt, system_prompt.empty(),
+                                         false,
+                                         slot.params.system,
+                                         slot.params.input_prefix,
+                                         slot.params.input_suffix);  // add BOS if there isn't system prompt
             }
 
             slot.num_prompt_tokens = prompt_tokens.size();
@@ -2414,6 +2503,18 @@ int main(int argc, char **argv)
                 return res.set_content(data.dump(), "application/json");
             });
 
+    svr.Post("/tokenizes", [&llama](const httplib::Request &req, httplib::Response &res)
+            {
+                const json body = json::parse(req.body);
+                std::vector<llama_token> tokens;
+                if (body.count("content") != 0)
+                {
+                    tokens = llama.tokenize(body["content"], false, true);
+                }
+                const json data = format_tokenizer_response(tokens);
+                return res.set_content(data.dump(), "application/json");
+            });
+
     svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
             {
                 const json body = json::parse(req.body);