server : (refactoring) reduce usage of json internally
parent cc98896db8
commit b7d38eef0c
3 changed files with 245 additions and 242 deletions
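The refactor replaces the ad-hoc json blobs that used to travel through the task queue (server_task_result::data) with typed result structs (server_task_result_cmpl_partial, server_task_result_embd, server_task_result_error, and so on), so JSON is only built where a response is actually written out. The following is a minimal sketch of that pattern, not code from this commit: the struct, its fields and the to_json helper are illustrative, and it only assumes nlohmann's json.hpp, which the server already vendors.

// Illustrative sketch only (not from the commit): carry results as typed
// structs and convert to JSON once, at the HTTP boundary.
// Assumes nlohmann/json ("json.hpp") is available, as in the server sources.
#include <cstdio>
#include <string>

#include "json.hpp"

using json = nlohmann::ordered_json;

// typed result that worker code fills in; no JSON is built internally
struct cmpl_partial_result {
    int         id        = -1;
    int         id_slot   = -1;
    std::string content;
    int32_t     n_decoded = 0;
};

// single conversion point: only the response-writing layer produces JSON
static json to_json(const cmpl_partial_result & res) {
    return json {
        {"id",        res.id},
        {"id_slot",   res.id_slot},
        {"content",   res.content},
        {"n_decoded", res.n_decoded},
    };
}

int main() {
    cmpl_partial_result res;
    res.id        = 7;
    res.id_slot   = 0;
    res.content   = "hello";
    res.n_decoded = 1;

    // everything up to this point manipulated plain C++ fields;
    // serialization happens only when the response is emitted
    printf("%s\n", to_json(res).dump(2).c_str());
    return 0;
}

The practical gain of the typed form is that field names and types are checked by the compiler instead of at the point where a json key happens to be read.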
examples/server/server.cpp
@@ -1,4 +1,5 @@
 #include "utils.hpp"
+#include "server.hpp"

 #include "arg.h"
 #include "common.h"
@@ -32,90 +33,6 @@

 using json = nlohmann::ordered_json;

-enum stop_type {
-    STOP_TYPE_FULL,
-    STOP_TYPE_PARTIAL,
-};
-
-// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
-enum slot_state {
-    SLOT_STATE_IDLE,
-    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
-    SLOT_STATE_PROCESSING_PROMPT,
-    SLOT_STATE_DONE_PROMPT,
-    SLOT_STATE_GENERATING,
-};
-
-enum server_state {
-    SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
-    SERVER_STATE_READY, // Server is ready and model is loaded
-};
-
-enum server_task_type {
-    SERVER_TASK_TYPE_INFERENCE,
-    SERVER_TASK_TYPE_CANCEL,
-    SERVER_TASK_TYPE_NEXT_RESPONSE,
-    SERVER_TASK_TYPE_METRICS,
-    SERVER_TASK_TYPE_SLOT_SAVE,
-    SERVER_TASK_TYPE_SLOT_RESTORE,
-    SERVER_TASK_TYPE_SLOT_ERASE,
-    SERVER_TASK_TYPE_SET_LORA,
-};
-
-enum server_task_inf_type {
-    SERVER_TASK_INF_TYPE_COMPLETION,
-    SERVER_TASK_INF_TYPE_EMBEDDING,
-    SERVER_TASK_INF_TYPE_RERANK,
-    SERVER_TASK_INF_TYPE_INFILL,
-};
-
-struct server_task {
-    int id = -1; // to be filled by server_queue
-    int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
-
-    llama_tokens prompt_tokens;
-    server_task_type type;
-    json data;
-
-    server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
-
-    // utility function
-    static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
-        std::unordered_set<int> ids(tasks.size());
-        for (size_t i = 0; i < tasks.size(); i++) {
-            ids.insert(tasks[i].id);
-        }
-        return ids;
-    }
-};
-
-struct server_task_result {
-    int id = -1;
-
-    json data;
-
-    bool stop;
-    bool error;
-};
-
-struct slot_params {
-    bool stream = true;
-    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
-
-    int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
-    int32_t n_predict = -1; // new tokens to predict
-    int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters
-
-    int64_t t_max_prompt_ms = -1; // TODO: implement
-    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
-
-    std::vector<std::string> antiprompt;
-
-    struct common_params_sampling sampling;
-    struct common_params_speculative speculative;
-};
-
 struct server_slot {
     int id;
     int id_task = -1;
@@ -166,8 +83,6 @@ struct server_slot {
     bool stopped_word = false;
     bool stopped_limit = false;

-    bool timings_per_token = false;
-
     bool oaicompat = false;

     std::string oaicompat_model;
@@ -255,37 +170,39 @@ struct server_slot {
         }
     }

-    json get_formated_timings() const {
-        return json {
-            {"prompt_n", n_prompt_tokens_processed},
-            {"prompt_ms", t_prompt_processing},
-            {"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed},
-            {"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed},
-
-            {"predicted_n", n_decoded},
-            {"predicted_ms", t_token_generation},
-            {"predicted_per_token_ms", t_token_generation / n_decoded},
-            {"predicted_per_second", 1e3 / t_token_generation * n_decoded},
-        };
+    result_timings get_timings() const {
+        result_timings timings;
+        timings.prompt_n = n_prompt_tokens_processed;
+        timings.prompt_ms = t_prompt_processing;
+        timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed;
+        timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
+
+        timings.predicted_n = n_decoded;
+        timings.predicted_ms = t_token_generation;
+        timings.predicted_per_token_ms = t_token_generation / n_decoded;
+        timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
+
+        return timings;
     }

-    size_t find_stopping_strings(const std::string & text, const size_t last_token_size, const stop_type type) {
+    size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) {
         size_t stop_pos = std::string::npos;

         for (const std::string & word : params.antiprompt) {
             size_t pos;

-            if (type == STOP_TYPE_FULL) {
+            if (is_full_stop) {
                 const size_t tmp = word.size() + last_token_size;
                 const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;

                 pos = text.find(word, from_pos);
             } else {
+                // otherwise, partial stop
                 pos = find_partial_stop_string(word, text);
             }

             if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
-                if (type == STOP_TYPE_FULL) {
+                if (is_full_stop) {
                     stopped_word = true;
                     stopping_word = word;
                     has_next_token = false;
@@ -1108,14 +1025,14 @@ struct server_context {
             const std::string str_test = slot.generated_text.substr(pos);
             bool send_text = true;

-            size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
+            size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true);
             if (stop_pos != std::string::npos) {
                 slot.generated_text.erase(
                     slot.generated_text.begin() + pos + stop_pos,
                     slot.generated_text.end());
                 pos = std::min(slot.n_sent_text, slot.generated_text.size());
             } else if (slot.has_next_token) {
-                stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
+                stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false);
                 send_text = stop_pos == std::string::npos;
             }

@@ -1229,60 +1146,6 @@ struct server_context {
         return slot.has_next_token; // continue
     }

-    json get_formated_generation(const server_slot & slot) const {
-        std::vector<std::string> samplers;
-        samplers.reserve(slot.params.sampling.samplers.size());
-        for (const auto & sampler : slot.params.sampling.samplers) {
-            samplers.emplace_back(common_sampler_type_to_str(sampler));
-        }
-
-        return json {
-            {"n_ctx", slot.n_ctx},
-            {"n_predict", slot.n_predict}, // Server configured n_predict
-            {"model", params_base.model_alias},
-            {"seed", slot.params.sampling.seed},
-            {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
-            {"temperature", slot.params.sampling.temp},
-            {"dynatemp_range", slot.params.sampling.dynatemp_range},
-            {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent},
-            {"top_k", slot.params.sampling.top_k},
-            {"top_p", slot.params.sampling.top_p},
-            {"min_p", slot.params.sampling.min_p},
-            {"xtc_probability", slot.params.sampling.xtc_probability},
-            {"xtc_threshold", slot.params.sampling.xtc_threshold},
-            {"typical_p", slot.params.sampling.typ_p},
-            {"repeat_last_n", slot.params.sampling.penalty_last_n},
-            {"repeat_penalty", slot.params.sampling.penalty_repeat},
-            {"presence_penalty", slot.params.sampling.penalty_present},
-            {"frequency_penalty", slot.params.sampling.penalty_freq},
-            {"dry_multiplier", slot.params.sampling.dry_multiplier},
-            {"dry_base", slot.params.sampling.dry_base},
-            {"dry_allowed_length", slot.params.sampling.dry_allowed_length},
-            {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n},
-            {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers},
-            {"mirostat", slot.params.sampling.mirostat},
-            {"mirostat_tau", slot.params.sampling.mirostat_tau},
-            {"mirostat_eta", slot.params.sampling.mirostat_eta},
-            {"penalize_nl", slot.params.sampling.penalize_nl},
-            {"stop", slot.params.antiprompt},
-            {"max_tokens", slot.params.n_predict}, // User configured n_predict
-            {"n_keep", slot.params.n_keep},
-            {"n_discard", slot.params.n_discard},
-            {"ignore_eos", slot.params.sampling.ignore_eos},
-            {"stream", slot.params.stream},
-          //{"logit_bias", slot.params.sampling.logit_bias},
-            {"n_probs", slot.params.sampling.n_probs},
-            {"min_keep", slot.params.sampling.min_keep},
-            {"grammar", slot.params.sampling.grammar},
-            {"samplers", samplers},
-            {"speculative", slot.can_speculate()},
-            {"speculative.n_max", slot.params.speculative.n_max},
-            {"speculative.n_min", slot.params.speculative.n_min},
-            {"speculative.p_min", slot.params.speculative.p_min},
-            {"timings_per_token", slot.timings_per_token},
-        };
-    }
-
     void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
         send_error(task.id, error, type);
     }
@@ -1294,27 +1157,18 @@ struct server_context {
     void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
         SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str());

-        server_task_result res;
+        server_task_result_error res;
         res.id = id_task;
-        res.stop = false;
-        res.error = true;
-        res.data = format_error_response(error, type);
+        res.err_type = type;
+        res.err_msg = error;

         queue_results.send(res);
     }

     void send_partial_response(server_slot & slot, completion_token_output tkn) {
-        server_task_result res;
+        server_task_result_cmpl_partial res;
         res.id = slot.id_task;
-        res.error = false;
-        res.stop = false;
-        res.data = json {
-            {"content", tkn.text_to_send},
-            {"stop", false},
-            {"id_slot", slot.id},
-            {"multimodal", false},
-            {"index", slot.index},
-        };
+        res.content = tkn.text_to_send;

         if (slot.params.sampling.n_probs > 0) {
             const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
@@ -1323,30 +1177,35 @@ struct server_context {

             std::vector<completion_token_output> probs_output;
             if (probs_pos < probs_stop_pos) {
-                probs_output = std::vector<completion_token_output>(
+                res.probs_output = std::vector<completion_token_output>(
                     slot.generated_token_probs.begin() + probs_pos,
                     slot.generated_token_probs.begin() + probs_stop_pos);
             }
-            slot.n_sent_token_probs = probs_stop_pos;
-
-            res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
         }

-        if (slot.oaicompat) {
-            res.data["oaicompat_token_ctr"] = slot.n_decoded;
-            res.data["model"] = slot.oaicompat_model;
-        }
-
-        if (slot.timings_per_token) {
-            res.data["timings"] = slot.get_formated_timings();
+        if (slot.params.timings_per_token) {
+            res.timings = slot.get_timings();
         }

         queue_results.send(res);
     }

     void send_final_response(const server_slot & slot) {
-        server_task_result res;
+        server_task_result_cmpl_final res;
         res.id = slot.id_task;
+        res.id_slot = slot.id;
+        res.content = slot.generated_text;
+
+        res.n_decoded = slot.n_decoded;
+        res.n_prompt_tokens = slot.n_prompt_tokens;
+        res.has_new_line = slot.has_new_line;
+        res.n_tokens_cached = slot.n_past;
+        res.content = slot.generated_text;
+
+        res.params = slot.params; // copy the parameters

         res.error = false;
         res.stop = true;
         res.data = json {
@@ -1370,36 +1229,27 @@ struct server_context {
         };

         if (slot.params.sampling.n_probs > 0) {
-            std::vector<completion_token_output> probs;
             if (!slot.params.stream && slot.stopped_word) {
                 const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);

                 size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
-                probs = std::vector<completion_token_output>(
+                res.probs_output = std::vector<completion_token_output>(
                     slot.generated_token_probs.begin(),
                     slot.generated_token_probs.end() - safe_offset);
             } else {
-                probs = std::vector<completion_token_output>(
+                res.probs_output = std::vector<completion_token_output>(
                     slot.generated_token_probs.begin(),
                     slot.generated_token_probs.end());
             }
-
-            res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs);
-        }
-
-        if (slot.oaicompat) {
-            res.data["oaicompat_token_ctr"] = slot.n_decoded;
-            res.data["model"] = slot.oaicompat_model;
         }

         queue_results.send(res);
     }

     void send_embedding(const server_slot & slot, const llama_batch & batch) {
-        server_task_result res;
+        server_task_result_embd res;
         res.id = slot.id_task;
-        res.error = false;
-        res.stop = true;
+        res.index = slot.index;

         const int n_embd = llama_n_embd(model);

@@ -1418,20 +1268,12 @@ struct server_context {
            if (embd == NULL) {
                SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);

-                res.data = json {
-                    {"embedding", std::vector<float>(n_embd, 0.0f)},
-                    {"index", slot.index},
-                };
+                res.embedding = std::vector<float>(n_embd, 0.0f);

                continue;
            }

            common_embd_normalize(embd, embd_res.data(), n_embd);
-            res.data = json {
-                {"embedding", embd_res},
-                {"index", slot.index},
-            };
+            res.embedding = embd_res;
        }

        SLT_DBG(slot, "%s", "sending embeddings\n");
@@ -1440,10 +1282,9 @@ struct server_context {
    }

    void send_rerank(const server_slot & slot, const llama_batch & batch) {
-        server_task_result res;
+        server_task_result_rerank res;
        res.id = slot.id_task;
-        res.error = false;
-        res.stop = true;
+        res.index = slot.index;

        for (int i = 0; i < batch.n_tokens; ++i) {
            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
@@ -1458,21 +1299,14 @@ struct server_context {
            if (embd == NULL) {
                SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);

-                res.data = json {
-                    {"index", slot.index},
-                    {"score", -1e6},
-                };
+                res.score = -1e6;

                continue;
            }

-            res.data = json {
-                {"index", slot.index},
-                {"score", embd[0]},
-            };
+            res.score = embd[0];
        }

-        SLT_DBG(slot, "sending rerank result, res = '%s'\n", res.data.dump().c_str());
+        SLT_DBG(slot, "sending rerank result, res.score = %f\n", res.score);

        queue_results.send(res);
    }
examples/server/server.hpp (new file, 191 lines)
@@ -0,0 +1,191 @@
+#pragma once
+
+#include "common.h"
+#include "llama.h"
+#include "sampling.h"
+#include "speculative.h"
+
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
+#include "json.hpp"
+
+#include <string>
+#include <memory>
+#include <unordered_set>
+
+using json = nlohmann::ordered_json;
+
+enum stop_type {
+    STOP_TYPE_NONE,
+    STOP_TYPE_EOS,
+    STOP_TYPE_WORD,
+    STOP_TYPE_LIMIT,
+};
+
+// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
+enum slot_state {
+    SLOT_STATE_IDLE,
+    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
+    SLOT_STATE_PROCESSING_PROMPT,
+    SLOT_STATE_DONE_PROMPT,
+    SLOT_STATE_GENERATING,
+};
+
+enum server_state {
+    SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY, // Server is ready and model is loaded
+};
+
+enum server_task_type {
+    SERVER_TASK_TYPE_INFERENCE,
+    SERVER_TASK_TYPE_CANCEL,
+    SERVER_TASK_TYPE_NEXT_RESPONSE,
+    SERVER_TASK_TYPE_METRICS,
+    SERVER_TASK_TYPE_SLOT_SAVE,
+    SERVER_TASK_TYPE_SLOT_RESTORE,
+    SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_SET_LORA,
+};
+
+enum server_task_inf_type {
+    SERVER_TASK_INF_TYPE_COMPLETION,
+    SERVER_TASK_INF_TYPE_EMBEDDING,
+    SERVER_TASK_INF_TYPE_RERANK,
+    SERVER_TASK_INF_TYPE_INFILL,
+};
+
+// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
+enum error_type {
+    ERROR_TYPE_INVALID_REQUEST,
+    ERROR_TYPE_AUTHENTICATION,
+    ERROR_TYPE_SERVER,
+    ERROR_TYPE_NOT_FOUND,
+    ERROR_TYPE_PERMISSION,
+    ERROR_TYPE_UNAVAILABLE, // custom error
+    ERROR_TYPE_NOT_SUPPORTED, // custom error
+};
+
+struct server_task {
+    int id = -1; // to be filled by server_queue
+    int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
+
+    llama_tokens prompt_tokens;
+    server_task_type type;
+
+    // TODO @ngxson : we should get rid of json type here
+    json data;
+
+    server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
+
+    // utility function
+    static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
+        std::unordered_set<int> ids(tasks.size());
+        for (size_t i = 0; i < tasks.size(); i++) {
+            ids.insert(tasks[i].id);
+        }
+        return ids;
+    }
+};
+
+struct result_timings {
+    int32_t prompt_n;
+    double prompt_ms;
+    double prompt_per_token_ms;
+    double prompt_per_second;
+
+    int32_t predicted_n;
+    double predicted_ms;
+    double predicted_per_token_ms;
+    double predicted_per_second;
+};
+
+enum result_type {
+    RESULT_TYPE_CMPL_FINAL,
+    RESULT_TYPE_CMPL_PARTIAL,
+    RESULT_TYPE_EMBD,
+    RESULT_TYPE_RERANK,
+    RESULT_TYPE_ERROR,
+    RESULT_TYPE_UNKNOWN, // will throw an error
+};
+
+struct server_task_result {
+    result_type type = RESULT_TYPE_UNKNOWN;
+    int id = -1;
+    int id_slot = -1;
+};
+
+struct server_task_result_cmpl_final : server_task_result {
+    result_type type = RESULT_TYPE_CMPL_FINAL;
+    int index = 0;
+    std::string content;
+    bool stream;
+    bool timings_per_token;
+    result_timings timings;
+
+    int32_t n_decoded;
+    int32_t n_prompt_tokens;
+    int32_t has_new_line;
+    int32_t stopping_word;
+    int32_t n_tokens_cached;
+    stop_type stop = STOP_TYPE_NONE;
+    std::vector<completion_token_output> probs_output;
+
+    slot_params params;
+};
+
+struct completion_token_output {
+    llama_token tok;
+    std::string text_to_send;
+    struct token_prob {
+        llama_token tok;
+        float prob;
+    };
+    std::vector<token_prob> probs;
+};
+
+struct server_task_result_cmpl_partial : server_task_result {
+    result_type type = RESULT_TYPE_CMPL_PARTIAL;
+    int index = 0;
+    std::string content;
+    stop_type stop = STOP_TYPE_NONE;
+    std::vector<completion_token_output> probs_output;
+    result_timings timings;
+};
+
+struct server_task_result_embd : server_task_result {
+    result_type type = RESULT_TYPE_EMBD;
+    int index = 0;
+    std::vector<float> embedding;
+};
+
+struct server_task_result_rerank : server_task_result {
+    result_type type = RESULT_TYPE_RERANK;
+    int index = 0;
+    float score;
+};
+
+struct server_task_result_error : server_task_result {
+    result_type type = RESULT_TYPE_ERROR;
+    int index = 0;
+    error_type err_type;
+    std::string err_msg;
+};
+
+struct slot_params {
+    bool stream = true;
+    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
+
+    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+    int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
+    int32_t n_predict = -1; // new tokens to predict
+    int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters
+
+    int64_t t_max_prompt_ms = -1; // TODO: implement
+    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
+
+    std::vector<std::string> antiprompt;
+    bool timings_per_token = false;
+
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
+};
examples/server/utils.hpp
@@ -3,6 +3,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "server.hpp"

 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -40,17 +41,6 @@ using json = nlohmann::ordered_json;
 #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)

-// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
-enum error_type {
-    ERROR_TYPE_INVALID_REQUEST,
-    ERROR_TYPE_AUTHENTICATION,
-    ERROR_TYPE_SERVER,
-    ERROR_TYPE_NOT_FOUND,
-    ERROR_TYPE_PERMISSION,
-    ERROR_TYPE_UNAVAILABLE, // custom error
-    ERROR_TYPE_NOT_SUPPORTED, // custom error
-};
-
 template <typename T>
 static T json_value(const json & body, const std::string & key, const T & default_value) {
     // Fallback null to default value
@@ -485,18 +475,6 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
     return out;
 }

-struct completion_token_output {
-    llama_token tok;
-    std::string text_to_send;
-
-    struct token_prob {
-        llama_token tok;
-        float prob;
-    };
-
-    std::vector<token_prob> probs;
-};
-
 // convert a vector of completion_token_output to json
 static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
     json out = json::array();
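The result structs introduced in server.hpp above form a tagged family: a plain server_task_result base carrying the task id plus a result_type tag, with one derived struct per kind of result. Below is a rough sketch of how a consumer could branch on such a tag; it is illustrative only — the commit re-declares the type field inside each derived struct, whereas this sketch sets the base tag from a constructor, and handle_result and the shortened struct names are hypothetical, not taken from the commit.

// Illustrative sketch only: dispatching on a result-type tag so each concrete
// result can be turned into a response without carrying json through the queue.
#include <cstdio>
#include <string>

enum result_type {
    RESULT_TYPE_CMPL_PARTIAL,
    RESULT_TYPE_ERROR,
    RESULT_TYPE_UNKNOWN,
};

struct task_result {
    result_type type = RESULT_TYPE_UNKNOWN;
    int id = -1;
};

struct task_result_cmpl_partial : task_result {
    std::string content;
    task_result_cmpl_partial() { type = RESULT_TYPE_CMPL_PARTIAL; }
};

struct task_result_error : task_result {
    std::string err_msg;
    task_result_error() { type = RESULT_TYPE_ERROR; }
};

// branch on the tag; each branch knows the concrete type it may downcast to
static void handle_result(const task_result & res) {
    switch (res.type) {
        case RESULT_TYPE_CMPL_PARTIAL: {
            const auto & r = static_cast<const task_result_cmpl_partial &>(res);
            printf("partial: id=%d content=%s\n", r.id, r.content.c_str());
        } break;
        case RESULT_TYPE_ERROR: {
            const auto & r = static_cast<const task_result_error &>(res);
            printf("error: id=%d msg=%s\n", r.id, r.err_msg.c_str());
        } break;
        default:
            printf("unknown result type\n");
    }
}

int main() {
    task_result_cmpl_partial part;
    part.id      = 3;
    part.content = "tok";

    task_result_error err;
    err.id      = 4;
    err.err_msg = "slot unavailable";

    handle_result(part);
    handle_result(err);
    return 0;
}

A dispatch of this shape would naturally sit where responses are serialized, so the queue between the slots and the HTTP layer never has to carry json itself.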