remove server.hpp

2024-12-05 16:04:36 +01:00 · 2024-12-05 16:04:36 +01:00 · 1cf769be67
commit 1cf769be67
parent 8ab173c865
3 changed files with 675 additions and 695 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1,5 +1,4 @@
 #include "utils.hpp"
 #include "server.hpp"
 #include "arg.h"
 #include "common.h"
@ -33,9 +32,682 @@
 using json = nlohmann::ordered_json;
 // Reason why generation of a completion ended.
 // stop_type_to_str() converts these values to the "stop_type" string of the JSON results.
 enum stop_type {
    STOP_TYPE_NONE, // default of the result structs; serialized as "none"
    STOP_TYPE_EOS, // serialized as "eos"; reported as finish_reason "stop" in the OAI-compat output
    STOP_TYPE_WORD, // serialized as "word"; reported as finish_reason "stop" in the OAI-compat output
    STOP_TYPE_LIMIT, // serialized as "limit"; reported as finish_reason "length" in the OAI-compat output
 };
 // Processing state of a server slot.
 // state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
 // NOTE(review): the enumerator order looks like the order in which a slot moves through the states - confirm against the diagram
 enum slot_state {
    SLOT_STATE_IDLE,
    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
    SLOT_STATE_PROCESSING_PROMPT,
    SLOT_STATE_DONE_PROMPT,
    SLOT_STATE_GENERATING,
 };
 // Readiness of the server as a whole: still loading the model, or ready.
 enum server_state {
    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
    SERVER_STATE_READY,          // Server is ready and model is loaded
 };
 // Kind of work a server_task represents; stored in server_task::type.
 enum server_task_type {
    SERVER_TASK_TYPE_INFERENCE, // presumably refined further by server_task::inf_type - confirm against the task handler
    SERVER_TASK_TYPE_CANCEL, // server_task::id_target is used by this type
    SERVER_TASK_TYPE_NEXT_RESPONSE,
    SERVER_TASK_TYPE_METRICS,
    SERVER_TASK_TYPE_SLOT_SAVE,
    SERVER_TASK_TYPE_SLOT_RESTORE,
    SERVER_TASK_TYPE_SLOT_ERASE,
    SERVER_TASK_TYPE_SET_LORA,
 };
 // Flavour of an inference request; stored in server_task::inf_type,
 // which defaults to SERVER_TASK_INF_TYPE_COMPLETION.
 enum server_task_inf_type {
    SERVER_TASK_INF_TYPE_COMPLETION,
    SERVER_TASK_INF_TYPE_EMBEDDING,
    SERVER_TASK_INF_TYPE_RERANK,
    SERVER_TASK_INF_TYPE_INFILL,
 };
 // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
 // Error categories. format_error_response() maps each one to the "type" string
 // and numeric "code" noted next to it.
 enum error_type {
    ERROR_TYPE_INVALID_REQUEST, // "invalid_request_error", code 400
    ERROR_TYPE_AUTHENTICATION, // "authentication_error", code 401
    ERROR_TYPE_SERVER, // "server_error", code 500
    ERROR_TYPE_NOT_FOUND, // "not_found_error", code 404
    ERROR_TYPE_PERMISSION, // "permission_error", code 403
    ERROR_TYPE_UNAVAILABLE, // custom error ("unavailable_error", code 503)
    ERROR_TYPE_NOT_SUPPORTED, // custom error ("not_supported_error", code 501)
 };
 // One unit of work handed to the server queue.
 struct server_task {
    int id        = -1; // to be filled by server_queue
    int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
    llama_tokens prompt_tokens;
    server_task_type type;
    // TODO @ngxson : we should get rid of json type here
    json data;
    server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
    // Returns the set of ids of the given tasks (duplicate ids collapse into one entry).
    static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
        // the task count is passed as the initial bucket-count hint
        std::unordered_set<int> task_ids(tasks.size());
        for (const server_task & task : tasks) {
            task_ids.insert(task.id);
        }
        return task_ids;
    }
 };
 // Generation parameters of a slot, together with a few values that are only
 // carried along so that to_json() can report them.
 struct slot_params {
    bool stream       = true;
    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
    int32_t n_keep    =  0; // number of tokens to keep from initial prompt
    int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
    int32_t n_predict = -1; // new tokens to predict
    int32_t n_indent  =  0; // minimum line indentation for the generated text in number of whitespace characters
    int64_t t_max_prompt_ms  = -1; // TODO: implement
    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
    std::vector<std::string> antiprompt;
    bool timings_per_token = false;
    struct common_params_sampling sampling;
    struct common_params_speculative speculative;
    // params only used in to_json()
    // (given defaults so that to_json() never reads an indeterminate value
    //  when the owner of the struct did not fill them in)
    int32_t n_ctx        = 0;
    uint32_t seed_cur    = 0;
    bool can_speculative = false;
    // OAI-compat fields
    bool oaicompat = false;
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    bool verbose = false;
    // Serializes the parameters into the object reported as "generation_settings".
    // Does not modify the struct, hence const.
    json to_json() const {
        // sampler types are reported by name, in their configured order
        std::vector<std::string> samplers;
        samplers.reserve(sampling.samplers.size());
        for (const auto & sampler : sampling.samplers) {
            samplers.emplace_back(common_sampler_type_to_str(sampler));
        }
        return json {
            {"n_ctx",                     n_ctx},
            {"n_predict",                 n_predict},     // Server configured n_predict
            {"temperature",               sampling.temp},
            {"dynatemp_range",            sampling.dynatemp_range},
            {"dynatemp_exponent",         sampling.dynatemp_exponent},
            {"top_k",                     sampling.top_k},
            {"top_p",                     sampling.top_p},
            {"min_p",                     sampling.min_p},
            {"xtc_probability",           sampling.xtc_probability},
            {"xtc_threshold",             sampling.xtc_threshold},
            {"typical_p",                 sampling.typ_p},
            {"repeat_last_n",             sampling.penalty_last_n},
            {"repeat_penalty",            sampling.penalty_repeat},
            {"presence_penalty",          sampling.penalty_present},
            {"frequency_penalty",         sampling.penalty_freq},
            {"dry_multiplier",            sampling.dry_multiplier},
            {"dry_base",                  sampling.dry_base},
            {"dry_allowed_length",        sampling.dry_allowed_length},
            {"dry_penalty_last_n",        sampling.dry_penalty_last_n},
            {"dry_sequence_breakers",     sampling.dry_sequence_breakers},
            {"mirostat",                  sampling.mirostat},
            {"mirostat_tau",              sampling.mirostat_tau},
            {"mirostat_eta",              sampling.mirostat_eta},
            {"penalize_nl",               sampling.penalize_nl},
            {"stop",                      antiprompt},
            // NOTE(review): "max_tokens" and "n_predict" both report the same member
            {"max_tokens",                n_predict}, // User configured n_predict
            {"n_keep",                    n_keep},
            {"n_discard",                 n_discard},
            {"ignore_eos",                sampling.ignore_eos},
            {"stream",                    stream},
            //{"logit_bias",                sampling.logit_bias},
            {"n_probs",                   sampling.n_probs},
            {"min_keep",                  sampling.min_keep},
            {"grammar",                   sampling.grammar},
            {"samplers",                  samplers},
            {"speculative",               can_speculative},
            {"speculative.n_max",         speculative.n_max},
            {"speculative.n_min",         speculative.n_min},
            {"speculative.p_min",         speculative.p_min},
            {"timings_per_token",         timings_per_token},
        };
    }
 };
 // Timing statistics attached to completion results.
 // prompt_n stays at -1 until timings are filled in; the result structs check it
 // before adding a "timings" entry to their JSON.
 struct result_timings {
    int32_t prompt_n = -1;
    // all durations/rates default to 0 so that to_json() never serializes an
    // indeterminate value when only some of the fields were filled in
    double prompt_ms              = 0.0;
    double prompt_per_token_ms    = 0.0;
    double prompt_per_second      = 0.0;
    int32_t predicted_n = -1;
    double predicted_ms           = 0.0;
    double predicted_per_token_ms = 0.0;
    double predicted_per_second   = 0.0;
    // Serializes all fields, keyed by their member names. Does not modify the struct.
    json to_json() const {
        return {
            {"prompt_n",               prompt_n},
            {"prompt_ms",              prompt_ms},
            {"prompt_per_token_ms",    prompt_per_token_ms},
            {"prompt_per_second",      prompt_per_second},
            {"predicted_n",            predicted_n},
            {"predicted_ms",           predicted_ms},
            {"predicted_per_token_ms", predicted_per_token_ms},
            {"predicted_per_second",   predicted_per_second},
        };
    }
 };
 // Abstract base of every result a task can produce.
 // Derived structs must implement to_json(); the other hooks have neutral
 // defaults and are overridden only where they apply.
 struct server_task_result {
    int id           = -1;
    int id_slot      = -1;
    // true only for error results
    virtual bool is_error() {
        // only used by server_task_result_error
        return false;
    }
    // true when a streamed (partial) result carries a stop condition
    virtual bool is_stop() {
        // only used by server_task_result_cmpl_partial
        return false;
    }
    // index of the result; -1 for result types that do not carry one
    virtual int get_index() {
        return -1;
    }
    // native (non-OAI-compat) JSON representation
    virtual json to_json() = 0;
    // OpenAI-compatible JSON representation; a default-constructed json unless overridden
    virtual json to_json_oai_compat() {
        // used by server_task_result_cmpl_final and server_task_result_cmpl_partial
        return json();
    }
    // virtual so that derived results are destroyed correctly through a base pointer
    virtual ~server_task_result() = default;
 };
 // using unique_ptr for polymorphism of server_task_result (single owner, destroyed through the virtual destructor)
 using task_result_ptr = std::unique_ptr<server_task_result>;
 // Returns the string used for the "stop_type" JSON field.
 // Every value other than EOS, WORD and LIMIT (STOP_TYPE_NONE included) yields "none".
 inline std::string stop_type_to_str(stop_type type) {
    if (type == STOP_TYPE_EOS) {
        return "eos";
    }
    if (type == STOP_TYPE_WORD) {
        return "word";
    }
    if (type == STOP_TYPE_LIMIT) {
        return "limit";
    }
    return "none";
 }
 // A generated token together with the probabilities recorded for it.
 struct completion_token_output {
    llama_token tok;
    std::string text_to_send;
    // one (token, probability) entry
    struct token_prob {
        llama_token tok;
        float prob;
    };
    std::vector<token_prob> probs;
    // Returns an array with one {"tok_str", "prob"} object per entry of `probs`.
    json to_json(const llama_context * ctx) const {
        json entries = json::array();
        for (const token_prob & entry : probs) {
            const std::string piece = tokens_to_output_formatted_string(ctx, entry.tok);
            entries.push_back(json {
                {"tok_str", piece},
                {"prob",    entry.prob},
            });
        }
        return entries;
    }
    // Returns an array with one {"content", "probs"} object per element of `probs`;
    // "probs" holds that element's own to_json() output.
    static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
        json result = json::array();
        for (const completion_token_output & item : probs) {
            const std::string piece = tokens_to_output_formatted_string(ctx, item.tok);
            result.push_back(json {
                {"content", piece},
                {"probs",   item.to_json(ctx)},
            });
        }
        return result;
    }
 };
 // Final result of a completion: the generated content plus statistics and the
 // parameters that were used to generate it.
 struct server_task_result_cmpl_final : server_task_result {
    int index = 0;
    std::string content;
    bool stream;
    result_timings timings;
    std::string prompt;
    bool truncated;
    int32_t n_decoded;
    int32_t n_prompt_tokens;
    int32_t n_tokens_cached;
    int32_t has_new_line;
    std::string stopping_word;
    stop_type stop = STOP_TYPE_NONE;
    std::vector<completion_token_output> probs_output;
    slot_params generation_params;
    // OAI-compat fields
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    bool verbose = false;
    virtual int get_index() override {
        return index;
    }
    // Native (non-OAI-compat) representation; "stop" is always true for a final result.
    virtual json to_json() override {
        json out = json {
            {"index",               index},
            {"content",             content},
            {"id_slot",             id_slot},
            {"stop",                true},
            {"model",               oaicompat_model},
            {"tokens_predicted",    n_decoded},
            {"tokens_evaluated",    n_prompt_tokens},
            {"generation_settings", generation_params.to_json()},
            {"prompt",              prompt},
            {"has_new_line",        has_new_line},
            {"truncated",           truncated},
            {"stop_type",           stop_type_to_str(stop)},
            {"stopping_word",       stopping_word},
            {"tokens_cached",       n_tokens_cached},
            {"timings",             timings.to_json()},
        };
        return out;
    }
    // OpenAI "chat.completion" representation.
    virtual json to_json_oai_compat() override {
        // a stop word or EOS is reported as "stop"; everything else as "length"
        const bool ended_on_stop = stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS;
        const std::string finish_reason = ended_on_stop ? "stop" : "length";
        json message = json{
            {"content", content},
            {"role", "assistant"}
        };
        json choices = json::array({json{
            {"finish_reason", finish_reason},
            {"index", 0},
            {"message", message}
        }});
        json usage = json {
            {"completion_tokens", n_decoded},
            {"prompt_tokens",     n_prompt_tokens},
            {"total_tokens",      n_decoded + n_prompt_tokens}
        };
        const std::time_t created = std::time(0);
        json res = json {
            {"choices", choices},
            {"created", created},
            {"model", oaicompat_model},
            {"object", "chat.completion"},
            {"usage", usage},
            {"id", oaicompat_cmpl_id}
        };
        // extra fields for debugging purposes
        if (verbose) {
            res["__verbose"] = to_json();
        }
        // TODO: fix this
        // if (result.contains("completion_probabilities")) {
        //     res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
        // }
        // prompt_n is -1 until timings have been filled in
        if (timings.prompt_n >= 0) {
            res.push_back({"timings", timings.to_json()});
        }
        return res;
    }
 };
 // Intermediate result of a completion (one chunk of the generated content).
 struct server_task_result_cmpl_partial : server_task_result {
    int index = 0;
    std::string content;
    bool truncated;
    int32_t n_decoded;
    int32_t n_prompt_tokens;
    stop_type stop = STOP_TYPE_NONE;
    std::vector<completion_token_output> probs_output;
    result_timings timings;
    // OAI-compat fields
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    bool verbose = false;
    virtual int get_index() override {
        return index;
    }
    virtual bool is_stop() override {
        return stop != STOP_TYPE_NONE;
    }
    // Native (non-OAI-compat) representation of the chunk.
    virtual json to_json() override {
        const bool stopped = stop != STOP_TYPE_NONE;
        json out = json {
            {"index",            index},
            {"content",          content},
            {"stop_type",        stop_type_to_str(stop)},
            {"stop",             stopped},
            {"id_slot",          id_slot},
            {"tokens_predicted", n_decoded},
            {"tokens_evaluated", n_prompt_tokens},
        };
        // populate the timings object when needed (usually for the last response or with timings_per_token enabled)
        // NOTE(review): this check is "> 0" while to_json_oai_compat() uses ">= 0" - confirm the difference is intended
        if (timings.prompt_n > 0) {
            out.push_back({"timings", timings.to_json()});
        }
        if (stopped) {
            out.push_back({"truncated", truncated});
        }
        return out;
    }
    // OpenAI "chat.completion.chunk" representation.
    // Always returns a JSON array: normally one chunk, two chunks for a first
    // update that already has content, or a single empty object for an update
    // that should be ignored.
    virtual json to_json_oai_compat() override {
        const std::time_t created = std::time(0);
        // builds the one-element "choices" array of a chunk
        const auto make_choices = [](const json & finish, const json & delta) {
            return json::array({json{
                {"finish_reason", finish},
                {"index", 0},
                {"delta", delta}
            }});
        };
        // wraps a "choices" array into the chunk envelope
        const auto make_chunk = [&](const json & chunk_choices) {
            return json {
                {"choices", chunk_choices},
                {"created", created},
                {"id",      oaicompat_cmpl_id},
                {"model",   oaicompat_model},
                {"object",  "chat.completion.chunk"}
            };
        };
        // stays empty while generation has not stopped
        std::string finish_reason;
        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
            finish_reason = "stop";
        } else if (stop == STOP_TYPE_LIMIT) {
            finish_reason = "length";
        }
        const bool first = n_decoded == 0;
        json choices;
        if (!finish_reason.empty()) {
            // closing chunk: empty delta, finish_reason set
            choices = make_choices(finish_reason, json::object());
        } else if (first) {
            if (!content.empty()) {
                // We have to send this as two updates to conform to openai behavior
                json role_chunk    = make_chunk(make_choices(nullptr, json{{"role", "assistant"}}));
                json content_chunk = make_chunk(make_choices(nullptr, json{{"content", content}}));
                return std::vector<json>({role_chunk, content_chunk});
            }
            choices = make_choices(nullptr, json{{"role", "assistant"}});
        } else {
            // Some idiosyncrasy in task processing logic makes several trailing calls
            // with empty content, we ignore these at the callee site.
            if (content.empty()) {
                return std::vector<json>({json::object()});
            }
            choices = make_choices(nullptr, json{{"content", content}});
        }
        json ret = make_chunk(choices);
        // prompt_n is -1 until timings have been filled in
        if (timings.prompt_n >= 0) {
            ret.push_back({"timings", timings.to_json()});
        }
        // token usage is only attached to the closing chunk
        if (!finish_reason.empty()) {
            ret.push_back({"usage", json {
                {"completion_tokens", n_decoded},
                {"prompt_tokens",     n_prompt_tokens},
                {"total_tokens",      n_decoded + n_prompt_tokens},
            }});
        }
        return std::vector<json>({ret});
    }
 };
 // Result of an embedding task: the embedding vector and its index.
 struct server_task_result_embd : server_task_result {
    int index = 0;
    std::vector<float> embedding;
    virtual int get_index() override {
        return index;
    }
    // Serializes to {"index", "embedding"}.
    virtual json to_json() override {
        json out = json {
            {"index",     index},
            {"embedding", embedding},
        };
        return out;
    }
 };
 // Result of a rerank task: a score and its index.
 struct server_task_result_rerank : server_task_result {
    int index = 0;
    float score = -1e6f; // value reported when no score has been assigned
    virtual int get_index() override {
        return index;
    }
    // Serializes to {"index", "score"}.
    virtual json to_json() override {
        json out = json {
            {"index", index},
            {"score", score},
        };
        return out;
    }
 };
 // Builds the error object {"code", "message", "type"} for the given error category.
 // this function maybe used outside of server_task_result_error
 static json format_error_response(const std::string & message, const enum error_type type) {
    // defaults apply to any value that is not handled by the switch below
    std::string type_name;
    int status = 500;
    const auto select = [&](const char * name, int code) {
        type_name = name;
        status    = code;
    };
    switch (type) {
        case ERROR_TYPE_INVALID_REQUEST: select("invalid_request_error", 400); break;
        case ERROR_TYPE_AUTHENTICATION:  select("authentication_error",  401); break;
        case ERROR_TYPE_NOT_FOUND:       select("not_found_error",       404); break;
        case ERROR_TYPE_SERVER:          select("server_error",          500); break;
        case ERROR_TYPE_PERMISSION:      select("permission_error",      403); break;
        case ERROR_TYPE_NOT_SUPPORTED:   select("not_supported_error",   501); break;
        case ERROR_TYPE_UNAVAILABLE:     select("unavailable_error",     503); break;
    }
    return json {
        {"code", status},
        {"message", message},
        {"type", type_name},
    };
 }
 // Result that reports a failure; the only result type whose is_error() is true.
 struct server_task_result_error : server_task_result {
    // NOTE(review): get_index() is not overridden here, so this member is not reported through it - confirm that is intended
    int index = 0;
    error_type err_type = ERROR_TYPE_SERVER;
    std::string err_msg;
    virtual bool is_error() override {
        return true;
    }
    // Serializes through format_error_response() using the stored message and category.
    virtual json to_json() override {
        json body = format_error_response(err_msg, err_type);
        return body;
    }
 };
 // Result carrying server metrics (presumably produced for SERVER_TASK_TYPE_METRICS - confirm against the producer).
 struct server_task_result_metrics : server_task_result {
    // NOTE(review): the fields below have no default value and must be set by the producer before to_json() is called
    int n_idle_slots;
    int n_processing_slots;
    int n_tasks_deferred;
    int64_t t_start;
    int32_t kv_cache_tokens_count;
    int32_t kv_cache_used_cells;
    // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
    uint64_t n_prompt_tokens_processed_total = 0;
    uint64_t t_prompt_processing_total       = 0;
    uint64_t n_tokens_predicted_total        = 0;
    uint64_t t_tokens_generation_total       = 0;
    uint64_t n_prompt_tokens_processed = 0;
    uint64_t t_prompt_processing       = 0;
    uint64_t n_tokens_predicted  = 0;
    uint64_t t_tokens_generation = 0;
    uint64_t n_decode_total     = 0;
    uint64_t n_busy_slots_total = 0;
    // TODO: get rid of this json object and use to_json() instead
    json slots_data = json::array();
    // Serializes every field. json is an ordered_json, so the keys appear in the
    // order written here, which differs from the member declaration order above.
    virtual json to_json() override {
        return json {
            { "idle",                            n_idle_slots },
            { "processing",                      n_processing_slots },
            { "deferred",                        n_tasks_deferred },
            { "t_start",                         t_start },
            { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total },
            { "t_tokens_generation_total",       t_tokens_generation_total },
            { "n_tokens_predicted_total",        n_tokens_predicted_total },
            { "t_prompt_processing_total",       t_prompt_processing_total },
            { "n_prompt_tokens_processed",       n_prompt_tokens_processed },
            { "t_prompt_processing",             t_prompt_processing },
            { "n_tokens_predicted",              n_tokens_predicted },
            { "t_tokens_generation",             t_tokens_generation },
            { "n_decode_total",                  n_decode_total },
            { "n_busy_slots_total",              n_busy_slots_total },
            { "kv_cache_tokens_count",           kv_cache_tokens_count },
            { "kv_cache_used_cells",             kv_cache_used_cells },
            { "slots",                           slots_data },
        };
    }
 };
 // Result of saving a slot to, or restoring it from, a file.
 struct server_task_result_slot_save_load : server_task_result {
    std::string filename;
    bool is_save; // true = save, false = load
    size_t n_tokens;
    size_t n_bytes;
    double t_ms;
    // Serializes the result; the names of the count and timing fields depend on
    // whether this was a save or a restore.
    virtual json to_json() override {
        const char * key_tokens = is_save ? "n_saved"   : "n_restored";
        const char * key_bytes  = is_save ? "n_written" : "n_read";
        const char * key_time   = is_save ? "save_ms"   : "restore_ms";
        return json {
            { "id_slot",  id_slot },
            { "filename", filename },
            { key_tokens, n_tokens },
            { key_bytes,  n_bytes },
            { "timings", {
                { key_time, t_ms }
            }},
        };
    }
 };
 // Result of erasing a slot.
 struct server_task_result_slot_erase : server_task_result {
    size_t n_erased;
    // Serializes to {"id_slot", "n_erased"}.
    virtual json to_json() override {
        json out = json {
            { "id_slot",  id_slot },
            { "n_erased", n_erased },
        };
        return out;
    }
 };
 // Result of a LoRA update; carries no data and always reports success.
 struct server_task_result_apply_lora : server_task_result {
    virtual json to_json() override {
        json out = json {{ "success", true }};
        return out;
    }
 };
 struct server_slot {
    int id;
    int id_task = -1;
@ -786,8 +1458,9 @@ struct server_context {
        const auto & data = task.data;
        if (data.count("__oaicompat") != 0) {
            std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias;
            slot.params.oaicompat         = true;
-            slot.params.oaicompat_model   = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+            slot.params.oaicompat_model   = json_value(data, "model", model_name);
            slot.params.oaicompat_cmpl_id = json_value(data, "completion_id", std::string());
        } else {
            slot.params.oaicompat         = false;
@ -2835,8 +3508,6 @@ int main(int argc, char ** argv) {
        json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
        std::string model_name = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
        return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res, true);
    };
--- a/examples/server/server.hpp
+++ b/examples/server/server.hpp
@ -1,665 +0,0 @@
 #pragma once
 #include "common.h"
 #include "llama.h"
 #include "sampling.h"
 #include "speculative.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 #include <string>
 #include <memory>
 #include <unordered_set>
 using json = nlohmann::ordered_json;
 enum stop_type {
    STOP_TYPE_NONE,
    STOP_TYPE_EOS,
    STOP_TYPE_WORD,
    STOP_TYPE_LIMIT,
 };
 // state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
 enum slot_state {
    SLOT_STATE_IDLE,
    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
    SLOT_STATE_PROCESSING_PROMPT,
    SLOT_STATE_DONE_PROMPT,
    SLOT_STATE_GENERATING,
 };
 enum server_state {
    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
    SERVER_STATE_READY,          // Server is ready and model is loaded
 };
 enum server_task_type {
    SERVER_TASK_TYPE_INFERENCE,
    SERVER_TASK_TYPE_CANCEL,
    SERVER_TASK_TYPE_NEXT_RESPONSE,
    SERVER_TASK_TYPE_METRICS,
    SERVER_TASK_TYPE_SLOT_SAVE,
    SERVER_TASK_TYPE_SLOT_RESTORE,
    SERVER_TASK_TYPE_SLOT_ERASE,
    SERVER_TASK_TYPE_SET_LORA,
 };
 enum server_task_inf_type {
    SERVER_TASK_INF_TYPE_COMPLETION,
    SERVER_TASK_INF_TYPE_EMBEDDING,
    SERVER_TASK_INF_TYPE_RERANK,
    SERVER_TASK_INF_TYPE_INFILL,
 };
 // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
 enum error_type {
    ERROR_TYPE_INVALID_REQUEST,
    ERROR_TYPE_AUTHENTICATION,
    ERROR_TYPE_SERVER,
    ERROR_TYPE_NOT_FOUND,
    ERROR_TYPE_PERMISSION,
    ERROR_TYPE_UNAVAILABLE, // custom error
    ERROR_TYPE_NOT_SUPPORTED, // custom error
 };
 struct server_task {
    int id        = -1; // to be filled by server_queue
    int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
    llama_tokens prompt_tokens;
    server_task_type type;
    // TODO @ngxson : we should get rid of json type here
    json data;
    server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
    // utility function
    static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
        std::unordered_set<int> ids(tasks.size());
        for (size_t i = 0; i < tasks.size(); i++) {
            ids.insert(tasks[i].id);
        }
        return ids;
    }
 };
 struct slot_params {
    bool stream       = true;
    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
    int32_t n_keep    =  0; // number of tokens to keep from initial prompt
    int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
    int32_t n_predict = -1; // new tokens to predict
    int32_t n_indent  =  0; // mininum line indentation for the generated text in number of whitespace characters
    int64_t t_max_prompt_ms  = -1; // TODO: implement
    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
    std::vector<std::string> antiprompt;
    bool timings_per_token = false;
    struct common_params_sampling sampling;
    struct common_params_speculative speculative;
    // params only used in to_json()
    int32_t n_ctx;
    uint32_t seed_cur;
    bool can_speculative;
    // OAI-compat fields
    bool oaicompat = false;
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    bool verbose = false;
    json to_json() {
        std::vector<std::string> samplers;
        samplers.reserve(sampling.samplers.size());
        for (const auto & sampler : sampling.samplers) {
            samplers.emplace_back(common_sampler_type_to_str(sampler));
        }
        return json {
            {"n_ctx",                     n_ctx},
            {"n_predict",                 n_predict},     // Server configured n_predict
            {"temperature",               sampling.temp},
            {"dynatemp_range",            sampling.dynatemp_range},
            {"dynatemp_exponent",         sampling.dynatemp_exponent},
            {"top_k",                     sampling.top_k},
            {"top_p",                     sampling.top_p},
            {"min_p",                     sampling.min_p},
            {"xtc_probability",           sampling.xtc_probability},
            {"xtc_threshold",             sampling.xtc_threshold},
            {"typical_p",                 sampling.typ_p},
            {"repeat_last_n",             sampling.penalty_last_n},
            {"repeat_penalty",            sampling.penalty_repeat},
            {"presence_penalty",          sampling.penalty_present},
            {"frequency_penalty",         sampling.penalty_freq},
            {"dry_multiplier",            sampling.dry_multiplier},
            {"dry_base",                  sampling.dry_base},
            {"dry_allowed_length",        sampling.dry_allowed_length},
            {"dry_penalty_last_n",        sampling.dry_penalty_last_n},
            {"dry_sequence_breakers",     sampling.dry_sequence_breakers},
            {"mirostat",                  sampling.mirostat},
            {"mirostat_tau",              sampling.mirostat_tau},
            {"mirostat_eta",              sampling.mirostat_eta},
            {"penalize_nl",               sampling.penalize_nl},
            {"stop",                      antiprompt},
            {"max_tokens",                n_predict}, // User configured n_predict
            {"n_keep",                    n_keep},
            {"n_discard",                 n_discard},
            {"ignore_eos",                sampling.ignore_eos},
            {"stream",                    stream},
            //{"logit_bias",                sampling.logit_bias},
            {"n_probs",                   sampling.n_probs},
            {"min_keep",                  sampling.min_keep},
            {"grammar",                   sampling.grammar},
            {"samplers",                  samplers},
            {"speculative",               can_speculative},
            {"speculative.n_max",         speculative.n_max},
            {"speculative.n_min",         speculative.n_min},
            {"speculative.p_min",         speculative.p_min},
            {"timings_per_token",         timings_per_token},
        };
    }
 };
 struct result_timings {
    int32_t prompt_n = -1;
    double prompt_ms;
    double prompt_per_token_ms;
    double prompt_per_second;
    int32_t predicted_n = -1;
    double predicted_ms;
    double predicted_per_token_ms;
    double predicted_per_second;
    json to_json() {
        return {
            {"prompt_n",               prompt_n},
            {"prompt_ms",              prompt_ms},
            {"prompt_per_token_ms",    prompt_per_token_ms},
            {"prompt_per_second",      prompt_per_second},
            {"predicted_n",            predicted_n},
            {"predicted_ms",           predicted_ms},
            {"predicted_per_token_ms", predicted_per_token_ms},
            {"predicted_per_second",   predicted_per_second},
        };
    }
 };
 struct server_task_result {
    int id           = -1;
    int id_slot      = -1;
    virtual bool is_error() {
        // only used by server_task_result_error
        return false;
    }
    virtual bool is_stop() {
        // only used by server_task_result_cmpl_partial
        return false;
    }
    virtual int get_index() {
        return -1;
    }
    virtual json to_json() = 0;
    virtual json to_json_oai_compat() {
        // used by server_task_result_cmpl_final and server_task_result_cmpl_partial
        return json();
    }
    virtual ~server_task_result() = default;
 };
 inline std::string stop_type_to_str(stop_type type) {
    switch (type) {
        case STOP_TYPE_EOS:   return "eos";
        case STOP_TYPE_WORD:  return "word";
        case STOP_TYPE_LIMIT: return "limit";
        default:              return "none";
    }
 }
 // One token together with a list of (token, probability) entries.
 // Plain aggregate: members have no initializers, so callers must set every field they read.
 struct completion_token_output {
    llama_token tok;          // the token itself
    std::string text_to_send; // presumably the text piece emitted for `tok` - confirm at the producer
    // a token paired with a probability value
    struct token_prob {
        llama_token tok;
        float prob;
    };
    std::vector<token_prob> probs; // probability entries attached to `tok`
 };
 // Completion result that carries the finished output; to_json() always reports "stop": true.
 // Scalar members are given defaults so that serialization never reads an indeterminate
 // value if a producer leaves a field unset.
 struct server_task_result_cmpl_final : server_task_result {
    int index = 0;
    std::string content;
    bool stream = false;
    result_timings timings;
    std::string prompt;
    bool truncated = false;
    int32_t n_decoded = 0;
    int32_t n_prompt_tokens = 0;
    int32_t n_tokens_cached = 0;
    int32_t has_new_line = 0;
    std::string stopping_word;
    stop_type stop = STOP_TYPE_NONE;
    std::vector<completion_token_output> probs_output;
    slot_params generation_params;
    // OAI-compat fields
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    bool verbose = false;
    virtual int get_index() override {
        return index;
    }
    // Native (non-OAI-compat) representation.
    virtual json to_json() override {
        return json {
            {"index",               index},
            {"content",             content},
            {"id_slot",             id_slot},
            {"stop",                true},
            {"model",               oaicompat_model},
            {"tokens_predicted",    n_decoded},
            {"tokens_evaluated",    n_prompt_tokens},
            {"generation_settings", generation_params.to_json()},
            {"prompt",              prompt},
            {"has_new_line",        has_new_line},
            {"truncated",           truncated},
            {"stop_type",           stop_type_to_str(stop)},
            {"stopping_word",       stopping_word},
            {"tokens_cached",       n_tokens_cached},
            {"timings",             timings.to_json()},
        };
    }
    // Representation shaped as a "chat.completion" object with a single choice.
    virtual json to_json_oai_compat() override {
        // a stop word or EOS maps to "stop"; every other stop_type is reported as "length"
        const bool stopped = stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS;
        const std::string finish_reason = stopped ? "stop" : "length";
        json choices = json::array({json{
            {"finish_reason", finish_reason},
            {"index", 0},
            {"message", json{
                {"content", content},
                {"role", "assistant"}
            }
        }}});
        const std::time_t t = std::time(nullptr);
        json res = json {
            {"choices", choices},
            {"created", t},
            {"model", oaicompat_model},
            {"object", "chat.completion"},
            {"usage", json {
                {"completion_tokens", n_decoded},
                {"prompt_tokens",     n_prompt_tokens},
                {"total_tokens",      n_decoded + n_prompt_tokens}
            }},
            {"id", oaicompat_cmpl_id}
        };
        // extra fields for debugging purposes
        if (verbose) {
            res["__verbose"] = to_json();
        }
        // TODO: fix this
        // if (result.contains("completion_probabilities")) {
        //     res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
        // }
        // prompt_n stays at -1 until the timings have been filled in
        if (timings.prompt_n >= 0) {
            res.push_back({"timings", timings.to_json()});
        }
        return res;
    }
 };
 // Incremental completion result. `stop` stays STOP_TYPE_NONE until the update that ends
 // the generation; is_stop() reports exactly that condition.
 // Scalar members are given defaults so that serialization never reads an indeterminate value.
 struct server_task_result_cmpl_partial : server_task_result {
    int index = 0;
    std::string content;
    bool truncated = false;
    int32_t n_decoded = 0;
    int32_t n_prompt_tokens = 0;
    stop_type stop = STOP_TYPE_NONE;
    std::vector<completion_token_output> probs_output;
    result_timings timings;
    // OAI-compat fields
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    bool verbose = false;
    virtual int get_index() override {
        return index;
    }
    virtual bool is_stop() override {
        return stop != STOP_TYPE_NONE;
    }
    // Native (non-OAI-compat) representation of this update.
    virtual json to_json() override {
        const bool stopped = stop != STOP_TYPE_NONE;
        json res = json {
            {"index",            index},
            {"content",          content},
            {"stop_type",        stop_type_to_str(stop)},
            {"stop",             stopped},
            {"id_slot",          id_slot},
            {"tokens_predicted", n_decoded},
            {"tokens_evaluated", n_prompt_tokens},
        };
        // populate the timings object when needed (usually for the last response or with timings_per_token enabled)
        // prompt_n is -1 while unset, so test against the sentinel the same way to_json_oai_compat() does
        if (timings.prompt_n >= 0) {
            res.push_back({"timings", timings.to_json()});
        }
        if (stopped) {
            res.push_back({"truncated", truncated});
        }
        return res;
    }
    // Representation as "chat.completion.chunk" objects.
    // The return value is always an array: usually one chunk, two chunks for a first update
    // that already carries content, or a single empty object for a later update without content.
    virtual json to_json_oai_compat() override {
        const bool first = n_decoded == 0;
        std::string finish_reason;
        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
            finish_reason = "stop";
        } else if (stop == STOP_TYPE_LIMIT) {
            finish_reason = "length";
        }
        const bool finished = !finish_reason.empty();
        const std::time_t t = std::time(nullptr);

        // builds the single-element `choices` array of a chunk
        const auto make_choices = [](const json & finish, const json & delta) -> json {
            return json::array({json{
                {"finish_reason", finish},
                {"index", 0},
                {"delta", delta},
            }});
        };
        // wraps a `choices` array into the envelope shared by every chunk
        const auto make_chunk = [&](const json & chunk_choices) -> json {
            return json {
                {"choices", chunk_choices},
                {"created", t},
                {"id",      oaicompat_cmpl_id},
                {"model",   oaicompat_model},
                {"object",  "chat.completion.chunk"},
            };
        };

        json choices;
        if (finished) {
            choices = make_choices(finish_reason, json::object());
        } else if (first) {
            if (!content.empty()) {
                // We have to send this as two updates to conform to openai behavior
                return std::vector<json>({
                    make_chunk(make_choices(nullptr, json{{"role", "assistant"}})),
                    make_chunk(make_choices(nullptr, json{{"content", content}})),
                });
            }
            choices = make_choices(nullptr, json{{"role", "assistant"}});
        } else {
            // Some idiosyncrasy in task processing logic makes several trailing calls
            // with empty content, we ignore these at the call site.
            if (content.empty()) {
                return std::vector<json>({json::object()});
            }
            choices = make_choices(nullptr, json{{"content", content}});
        }

        json ret = make_chunk(choices);
        if (timings.prompt_n >= 0) {
            ret.push_back({"timings", timings.to_json()});
        }
        // usage totals are attached only to the chunk that carries a finish_reason
        if (finished) {
            ret.push_back({"usage", json {
                {"completion_tokens", n_decoded},
                {"prompt_tokens",     n_prompt_tokens},
                {"total_tokens",      n_decoded + n_prompt_tokens},
            }});
        }
        return std::vector<json>({ret});
    }
 };
 // Result holding one embedding vector and the index it belongs to.
 struct server_task_result_embd : server_task_result {
    int index = 0;
    std::vector<float> embedding;
    virtual int get_index() override {
        return index;
    }
    // Emits {"index", "embedding"} in that key order.
    virtual json to_json() override {
        json out = json::object();
        out["index"]     = index;
        out["embedding"] = embedding;
        return out;
    }
 };
 // Result holding one rerank score and the index it belongs to.
 // The score starts at -1e6 until a producer overwrites it.
 struct server_task_result_rerank : server_task_result {
    int index = 0;
    float score = -1e6;
    virtual int get_index() override {
        return index;
    }
    // Emits {"index", "score"} in that key order.
    virtual json to_json() override {
        json out = json::object();
        out["index"] = index;
        out["score"] = score;
        return out;
    }
 };
 // Builds the JSON body describing an error: {"code", "message", "type"}.
 // This function may be used outside of server_task_result_error.
 // A value of `type` not listed below produces code 500 with an empty type string.
 static json format_error_response(const std::string & message, const enum error_type type) {
    int         code = 500;
    std::string type_str;

    // records the numeric code and type name chosen for `type`
    const auto describe = [&](const char * name, int status) {
        type_str = name;
        code     = status;
    };

    switch (type) {
        case ERROR_TYPE_INVALID_REQUEST: describe("invalid_request_error", 400); break;
        case ERROR_TYPE_AUTHENTICATION:  describe("authentication_error",  401); break;
        case ERROR_TYPE_NOT_FOUND:       describe("not_found_error",       404); break;
        case ERROR_TYPE_SERVER:          describe("server_error",          500); break;
        case ERROR_TYPE_PERMISSION:      describe("permission_error",      403); break;
        case ERROR_TYPE_NOT_SUPPORTED:   describe("not_supported_error",   501); break;
        case ERROR_TYPE_UNAVAILABLE:     describe("unavailable_error",     503); break;
    }

    return json {
        {"code", code},
        {"message", message},
        {"type", type_str},
    };
 }
 // Result type that reports a failure; is_error() is the only override that returns true.
 struct server_task_result_error : server_task_result {
    int index = 0;
    error_type err_type = ERROR_TYPE_SERVER;
    std::string err_msg;

    // serialization is delegated to format_error_response()
    virtual json to_json() override { return format_error_response(err_msg, err_type); }

    virtual bool is_error() override { return true; }
 };
 // Snapshot of server counters, serialized as a flat JSON object plus a "slots" entry.
 // Every scalar has a default so that to_json(), which reads all of them unconditionally,
 // never serializes an indeterminate value.
 struct server_task_result_metrics : server_task_result {
    int n_idle_slots = 0;
    int n_processing_slots = 0;
    int n_tasks_deferred = 0;
    int64_t t_start = 0;
    int32_t kv_cache_tokens_count = 0;
    int32_t kv_cache_used_cells = 0;
    // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
    uint64_t n_prompt_tokens_processed_total = 0;
    uint64_t t_prompt_processing_total       = 0;
    uint64_t n_tokens_predicted_total        = 0;
    uint64_t t_tokens_generation_total       = 0;
    uint64_t n_prompt_tokens_processed = 0;
    uint64_t t_prompt_processing       = 0;
    uint64_t n_tokens_predicted  = 0;
    uint64_t t_tokens_generation = 0;
    uint64_t n_decode_total     = 0;
    uint64_t n_busy_slots_total = 0;
    // TODO: get rid of this json object and use to_json() instead
    json slots_data = json::array();
    // Keys mirror the member names, except "idle", "processing", "deferred" and "slots".
    virtual json to_json() override {
        return json {
            { "idle",                            n_idle_slots },
            { "processing",                      n_processing_slots },
            { "deferred",                        n_tasks_deferred },
            { "t_start",                         t_start },
            { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total },
            { "t_tokens_generation_total",       t_tokens_generation_total },
            { "n_tokens_predicted_total",        n_tokens_predicted_total },
            { "t_prompt_processing_total",       t_prompt_processing_total },
            { "n_prompt_tokens_processed",       n_prompt_tokens_processed },
            { "t_prompt_processing",             t_prompt_processing },
            { "n_tokens_predicted",              n_tokens_predicted },
            { "t_tokens_generation",             t_tokens_generation },
            { "n_decode_total",                  n_decode_total },
            { "n_busy_slots_total",              n_busy_slots_total },
            { "kv_cache_tokens_count",           kv_cache_tokens_count },
            { "kv_cache_used_cells",             kv_cache_used_cells },
            { "slots",                           slots_data },
        };
    }
 };
 // Result of a slot save or a slot restore; `is_save` selects which set of key names
 // to_json() uses for the token count, byte count and elapsed time.
 struct server_task_result_slot_save_load : server_task_result {
    std::string filename;
    bool is_save; // true = save, false = load
    size_t n_tokens;
    size_t n_bytes;
    double t_ms;
    virtual json to_json() override {
        // keys shared by both directions come first
        json out = json::object();
        out["id_slot"]  = id_slot;
        out["filename"] = filename;
        if (is_save) {
            out["n_saved"]   = n_tokens;
            out["n_written"] = n_bytes;
            out["timings"]   = json {{ "save_ms", t_ms }};
        } else {
            out["n_restored"] = n_tokens;
            out["n_read"]     = n_bytes;
            out["timings"]    = json {{ "restore_ms", t_ms }};
        }
        return out;
    }
 };
 // Result of a slot erase: the slot id together with the `n_erased` count.
 struct server_task_result_slot_erase : server_task_result {
    size_t n_erased;
    virtual json to_json() override {
        json out = json::object();
        out["id_slot"]  = id_slot;
        out["n_erased"] = n_erased;
        return out;
    }
 };
 // Result for a LoRA update; carries no data and always serializes as {"success": true}.
 struct server_task_result_apply_lora : server_task_result {
    virtual json to_json() override {
        json out = json::object();
        out["success"] = true;
        return out;
    }
 };
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -3,7 +3,6 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
 #include "server.hpp"
 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@ -476,31 +475,6 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
    return out;
 }
 // Converts a vector of completion_token_output to a JSON array.
 // Each element becomes {"content": <formatted token>, "probs": [{"tok_str", "prob"}, ...]}.
 static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
    json result = json::array();
    for (const auto & token_out : probs) {
        // the probability entries attached to this token
        json candidates = json::array();
        for (const auto & cand : token_out.probs) {
            candidates.push_back(json {
                {"tok_str", tokens_to_output_formatted_string(ctx, cand.tok)},
                {"prob",    cand.prob},
            });
        }
        result.push_back(json {
            {"content", tokens_to_output_formatted_string(ctx, token_out.tok)},
            {"probs",   candidates},
        });
    }
    return result;
 }
 static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
    const std::string str =
        std::string(event) + ": " +