diff --git a/examples/server/README.md b/examples/server/README.md
index e5ca8269b..0b562b3ac 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -126,7 +126,7 @@ node .
 
 `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
 
-`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does.
+`prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared against the previous one, and only the part that has not yet been evaluated is evaluated. If the prompt is a string, or an array whose first element is a string, a space is inserted at the front, as main.cpp does.
 
 `stop`: Specify a JSON array of stopping strings.
 These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f442f2b56..567dc4d2d 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -188,6 +188,7 @@ struct llama_server_context
     size_t n_past = 0;
     size_t n_remain = 0;
 
+    json prompt;
     std::vector<llama_token> embd;
     std::vector<llama_token> last_n_tokens;
@@ -257,10 +258,55 @@ struct llama_server_context
         return true;
     }
 
+    std::vector<llama_token> tokenizePrompt(void)
+    {
+        std::vector<llama_token> prompt_tokens;
+
+        if (prompt.is_array())
+        {
+            bool first = true;
+            for (const auto& p : prompt)
+            {
+                if (p.is_string())
+                {
+                    auto s = p.template get<std::string>();
+                    std::vector<llama_token> p;
+                    if (first)
+                    {
+                        s.insert(0, 1, ' '); // add a space if it's the first
+                        p = ::llama_tokenize(ctx, s, true); // also add BOS
+                        first = false;
+                    }
+                    else
+                    {
+                        p = ::llama_tokenize(ctx, s, false);
+                    }
+                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+                }
+                else
+                {
+                    if (first)
+                    {
+                        first = false;
+                    }
+                    prompt_tokens.push_back(p.template get<llama_token>());
+                }
+            }
+        }
+        else
+        {
+            auto s = prompt.template get<std::string>();
+            s.insert(0, 1, ' '); // always add a first space
+            prompt_tokens = ::llama_tokenize(ctx, s, true);
+        }
+
+        return prompt_tokens;
+    }
+
     void loadPrompt()
     {
-        params.prompt.insert(0, 1, ' '); // always add a first space
-        std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
+        auto prompt_tokens = tokenizePrompt();
+
         num_prompt_tokens = prompt_tokens.size();
 
         if (params.n_keep < 0)
@@ -954,7 +1000,7 @@ static json format_final_response(llama_server_context &llama, const std::string
         {"tokens_predicted", llama.num_tokens_predicted},
         {"tokens_evaluated", llama.num_prompt_tokens},
         {"generation_settings", format_generation_settings(llama)},
-        {"prompt", llama.params.prompt},
+        {"prompt", llama.prompt},
         {"truncated", llama.truncated},
         {"stopped_eos", llama.stopped_eos},
         {"stopped_word", llama.stopped_word},
@@ -1015,8 +1061,8 @@ static void parse_options_completion(const json &body, llama_server_context &llama)
     llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
     llama.params.n_keep = body.value("n_keep", default_params.n_keep);
     llama.params.seed = body.value("seed", default_params.seed);
-    llama.params.prompt = body.value("prompt", default_params.prompt);
     llama.params.n_probs = body.value("n_probs", default_params.n_probs);
+    llama.prompt = body["prompt"];
 
     llama.params.logit_bias.clear();
     if (body.value("ignore_eos", false))
@@ -1258,8 +1304,8 @@ int main(int argc, char **argv)
         auto lock = llama.lock();
 
         const json body = json::parse(req.body);
-        const std::string content = body.value("content", "");
-        const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
+        llama.prompt = body["content"];
+        const std::vector<llama_token> tokens = llama.tokenizePrompt();
         const json data = format_tokenizer_response(tokens);
         return res.set_content(data.dump(), "application/json"); });
@@ -1271,7 +1317,7 @@ int main(int argc, char **argv)
         llama.rewind();
 
         llama_reset_timings(llama.ctx);
-        llama.params.prompt = body.value("content", "");
+        llama.prompt = body["content"];
         llama.params.n_predict = 0;
         llama.loadPrompt();
         llama.beginCompletion();
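
For illustration, here is a minimal sketch of a `/completion` request that uses the new mixed prompt form, written as a small TypeScript client. It assumes a server running locally on the default `127.0.0.1:8080`; the numeric entries are placeholder token ids rather than real vocabulary entries, and the response is assumed to carry the generated text in its `content` field.

```typescript
// Minimal sketch (not part of the patch): POST a prompt that mixes plain text
// with raw token ids to the server's /completion endpoint.
// Assumptions: local server on the default port 8080; 1234 and 5678 are
// placeholder token ids, not real vocabulary entries.
async function completeMixedPrompt(): Promise<void> {
    const response = await fetch("http://127.0.0.1:8080/completion", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
            // Strings are tokenized server-side; numbers are used as token ids as-is.
            prompt: ["Building a website can be done in ", 1234, 5678, " simple steps:"],
            n_predict: 64,
            stream: false,
        }),
    });

    const data = await response.json();
    console.log(data.content); // generated completion text
}

completeMixedPrompt().catch(console.error);
```

Because the first element of the array is a string, the server prepends a space and the BOS token to that element, matching the single-string behavior described in the README change above.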