server : code style

2024-03-05 15:36:14 +02:00 · 2024-03-05 15:36:14 +02:00 · fef64c587d
commit fef64c587d
parent ad1d746caa
3 changed files with 737 additions and 940 deletions
--- a/examples/server/oai.hpp
+++ b/examples/server/oai.hpp
@ -12,9 +12,8 @@ using json = nlohmann::json;

 inline static json oaicompat_completion_params_parse(
    const struct llama_model * model,
-    const json &body, /* openai api json semantics */
-    const std::string &chat_template)
-{
+    const json & body, /* openai api json semantics */
+    const std::string & chat_template) {
    json llama_params;

    llama_params["__oaicompat"] = true;
@ -27,26 +26,26 @@ inline static json oaicompat_completion_params_parse(
    //
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
-    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
-    llama_params["prompt"]            = format_chat(model, chat_template, body["messages"]);
-    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
-    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
-    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
-    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
-    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
-    llama_params["logit_bias"]        = json_value(body, "logit_bias",json::object());
-    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
-    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
-    llama_params["stream"]            = json_value(body, "stream", false);
-    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
-    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
-    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
-    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
-    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
-    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
-    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
-    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);
+    llama_params["model"]             = json_value(body,   "model",             std::string("unknown"));
+    llama_params["prompt"]            = format_chat(model, chat_template,       body["messages"]);
+    llama_params["cache_prompt"]      = json_value(body,   "cache_prompt",      false);
+    llama_params["temperature"]       = json_value(body,   "temperature",       0.0);
+    llama_params["top_k"]             = json_value(body,   "top_k",             default_sparams.top_k);
+    llama_params["top_p"]             = json_value(body,   "top_p",             1.0);
+    llama_params["n_predict"]         = json_value(body,   "max_tokens",        -1);
+    llama_params["logit_bias"]        = json_value(body,   "logit_bias",        json::object());
+    llama_params["frequency_penalty"] = json_value(body,   "frequency_penalty", 0.0);
+    llama_params["presence_penalty"]  = json_value(body,   "presence_penalty",  0.0);
+    llama_params["seed"]              = json_value(body,   "seed",              LLAMA_DEFAULT_SEED);
+    llama_params["stream"]            = json_value(body,   "stream",            false);
+    llama_params["mirostat"]          = json_value(body,   "mirostat",          default_sparams.mirostat);
+    llama_params["mirostat_tau"]      = json_value(body,   "mirostat_tau",      default_sparams.mirostat_tau);
+    llama_params["mirostat_eta"]      = json_value(body,   "mirostat_eta",      default_sparams.mirostat_eta);
+    llama_params["penalize_nl"]       = json_value(body,   "penalize_nl",       default_sparams.penalize_nl);
+    llama_params["typical_p"]         = json_value(body,   "typical_p",         default_sparams.typical_p);
+    llama_params["repeat_last_n"]     = json_value(body,   "repeat_last_n",     default_sparams.penalty_last_n);
+    llama_params["ignore_eos"]        = json_value(body,   "ignore_eos",        false);
+    llama_params["tfs_z"]             = json_value(body,   "tfs_z",             default_sparams.tfs_z);

    if (body.count("grammar") != 0) {
        llama_params["grammar"] = json_value(body, "grammar", json::object());
@ -65,8 +64,7 @@ inline static json oaicompat_completion_params_parse(
    return llama_params;
 }

-inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
-{
+inline static json format_final_response_oaicompat(const json & request, const task_result & response, bool streaming = false) {
    json result = response.result_json;

    bool stopped_word        = result.count("stopped_word") != 0;
@ -91,17 +89,19 @@ inline static json format_final_response_oaicompat(const json &request, const ta

    std::time_t t = std::time(0);

-    json res =
-        json{{"choices", choices},
-            {"created", t},
-            {"model",
-                json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-            {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
-            {"usage",
-                json{{"completion_tokens", num_tokens_predicted},
-                     {"prompt_tokens",     num_prompt_tokens},
-                     {"total_tokens",      num_tokens_predicted + num_prompt_tokens}}},
-            {"id", gen_chatcmplid()}};
+    json res = json {
+        {"choices", choices},
+        {"created", t},
+        {"model",
+            json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
+        {"usage", json {
+            {"completion_tokens", num_tokens_predicted},
+            {"prompt_tokens",     num_prompt_tokens},
+            {"total_tokens",      num_tokens_predicted + num_prompt_tokens}
+        }},
+        {"id", gen_chatcmplid()}
+    };

    if (server_verbose) {
        res["__verbose"] = result;
@ -125,10 +125,10 @@ inline static std::vector<json> format_partial_response_oaicompat(const task_res
    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));

-    bool stopped_word   = json_value(result, "stopped_word", false);
-    bool stopped_eos    = json_value(result, "stopped_eos", false);
+    bool stopped_word   = json_value(result, "stopped_word",  false);
+    bool stopped_eos    = json_value(result, "stopped_eos",   false);
    bool stopped_limit  = json_value(result, "stopped_limit", false);
-    std::string content = json_value(result, "content", std::string(""));
+    std::string content = json_value(result, "content",       std::string(""));

    std::string finish_reason;
    if (stopped_word || stopped_eos) {
@ -196,26 +196,28 @@ inline static std::vector<json> format_partial_response_oaicompat(const task_res
        }
    }

-    json ret = json{{"choices", choices},
-                    {"created", t},
-                    {"id", gen_chatcmplid()},
-                    {"model", modelname},
-                    {"object", "chat.completion.chunk"}};
+    json ret = json {
+        {"choices", choices},
+        {"created", t},
+        {"id",      gen_chatcmplid()},
+        {"model",   modelname},
+        {"object",  "chat.completion.chunk"}
+    };

    return std::vector<json>({ret});
 }

-inline static json format_embeddings_response_oaicompat(const json &request, const json &embeddings)
-{
-    json res =
-        json{
-            {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-            {"object", "list"},
-            {"usage",
-                json{{"prompt_tokens", 0},
-                     {"total_tokens", 0}}},
-            {"data", embeddings}
-        };
+inline static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
+    json res = json {
+        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"object", "list"},
+        {"usage", json {
+            {"prompt_tokens", 0},
+            {"total_tokens", 0}
+        }},
+        {"data", embeddings}
+    };
+
    return res;
 }

--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -58,8 +58,8 @@ struct task_server {
    task_type type;
    json data;

-    bool infill_mode    = false;
-    bool embedding_mode = false;
+    bool infill    = false;
+    bool embedding = false;
 };

 struct task_result {
@ -187,7 +187,8 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
    }

-    std::string formatted_chat(buf.data(), res);
+    const std::string formatted_chat(buf.data(), res);
+
    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});

    return formatted_chat;
@ -201,17 +202,18 @@ struct llama_server_queue {
    int id = 0;
    bool running;

-    std::mutex mutex_tasks;
-
    // queues
    std::vector<task_server> queue_tasks;
    std::vector<task_server> queue_tasks_deferred;
-    std::vector<task_multi> queue_multitasks;
+    std::vector<task_multi>  queue_multitasks;
+
+    std::mutex mutex_tasks;
    std::condition_variable condition_tasks;
+
    // callback functions
-    std::function<void(task_server&)> callback_new_task;
-    std::function<void(task_multi&)> callback_finish_multitask;
-    std::function<void(void)> callback_run_slots;
+    std::function<void(task_server &)> callback_new_task;
+    std::function<void(task_multi &)>  callback_finish_multitask;
+    std::function<void(void)>          callback_run_slots;

    // Add a new task to the end of the queue
    int post(task_server task) {
@ -265,10 +267,9 @@ struct llama_server_queue {
    }

    // end the start_loop routine
-    void terminate() { {
-            std::unique_lock<std::mutex> lock(mutex_tasks);
-            running = false;
-        }
+    void terminate() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        running = false;
        condition_tasks.notify_all();
    }

@ -350,14 +351,11 @@ struct llama_server_queue {
    }

    // update the remaining subtasks, while appending results to multitask
-    void update_multitask(int id_multi, int subtask_id, task_result& result)
-    {
+    void update_multitask(int id_multi, int id_sub, task_result& result) {
        std::lock_guard<std::mutex> lock(mutex_tasks);
-        for (auto& multitask : queue_multitasks)
-        {
-            if (multitask.id == id_multi)
-            {
-                multitask.subtasks_remaining.erase(subtask_id);
+        for (auto & multitask : queue_multitasks) {
+            if (multitask.id == id_multi) {
+                multitask.subtasks_remaining.erase(id_sub);
                multitask.results.push_back(result);
            }
        }
@ -468,13 +466,10 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str

    std::vector<uint8_t> ret;

-    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
-    {
+    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
        char_array_4[i++] = encoded_string[in_]; in_++;
-        if (i == 4)
-        {
-            for (i = 0; i <4; i++)
-            {
+        if (i == 4) {
+            for (i = 0; i < 4; i++) {
                char_array_4[i] = base64_chars.find(char_array_4[i]);
            }

@ -482,23 +477,20 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];

-            for (i = 0; (i < 3); i++)
-            {
+            for (i = 0; (i < 3); i++) {
                ret.push_back(char_array_3[i]);
            }
+
            i = 0;
        }
    }

-    if (i)
-    {
-        for (j = i; j <4; j++)
-        {
+    if (i) {
+        for (j = i; j < 4; j++) {
            char_array_4[j] = 0;
        }

-        for (j = 0; j <4; j++)
-        {
+        for (j = 0; j < 4; j++) {
            char_array_4[j] = base64_chars.find(char_array_4[j]);
        }

@ -506,8 +498,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];

-        for (j = 0; (j < i - 1); j++)
-        {
+        for (j = 0; j < i - 1; j++) {
            ret.push_back(char_array_3[j]);
        }
    }
@ -586,6 +577,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+
    // if the size is 1 and first bit is 1, meaning it's a partial character
    //   (size > 1 meaning it's already a known token)
    if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
@ -601,6 +593,7 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
 // convert a vector of completion_token_output to json
 static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
    json out = json::array();
+
    for (const auto & prob : probs) {
        json probs_for_token = json::array();