server : (refactoring) reduce usage of json internally

Xuan Son Nguyen 2024-12-03 23:37:03 +01:00
parent cc98896db8
commit b7d38eef0c
3 changed files with 245 additions and 242 deletions
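The core of the change: internal task results no longer carry an opaque json payload. Each result kind becomes its own typed struct (defined in the new examples/server/server.hpp below), and JSON is built only where HTTP responses are written. A minimal before/after sketch of that shape (assuming the json alias and error_type enum declared in the files below; the _sketch suffix marks these as illustrations, not the definitions from the diff):

// before: one result type whose payload is untyped json
struct server_task_result_before_sketch {
    int  id = -1;
    json data;   // every consumer had to know which keys to look for
    bool stop;
    bool error;
};

// after: one struct per result kind, with typed fields instead of json
struct server_task_result_error_sketch {
    int         id       = -1;
    error_type  err_type = ERROR_TYPE_SERVER;
    std::string err_msg;
};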


@@ -1,4 +1,5 @@
#include "utils.hpp"
#include "server.hpp"
#include "arg.h"
#include "common.h"
@@ -32,90 +33,6 @@
using json = nlohmann::ordered_json;
enum stop_type {
STOP_TYPE_FULL,
STOP_TYPE_PARTIAL,
};
// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
enum slot_state {
SLOT_STATE_IDLE,
SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
SLOT_STATE_PROCESSING_PROMPT,
SLOT_STATE_DONE_PROMPT,
SLOT_STATE_GENERATING,
};
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
};
enum server_task_type {
SERVER_TASK_TYPE_INFERENCE,
SERVER_TASK_TYPE_CANCEL,
SERVER_TASK_TYPE_NEXT_RESPONSE,
SERVER_TASK_TYPE_METRICS,
SERVER_TASK_TYPE_SLOT_SAVE,
SERVER_TASK_TYPE_SLOT_RESTORE,
SERVER_TASK_TYPE_SLOT_ERASE,
SERVER_TASK_TYPE_SET_LORA,
};
enum server_task_inf_type {
SERVER_TASK_INF_TYPE_COMPLETION,
SERVER_TASK_INF_TYPE_EMBEDDING,
SERVER_TASK_INF_TYPE_RERANK,
SERVER_TASK_INF_TYPE_INFILL,
};
struct server_task {
int id = -1; // to be filled by server_queue
int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
llama_tokens prompt_tokens;
server_task_type type;
json data;
server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
// utility function
static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
std::unordered_set<int> ids(tasks.size());
for (size_t i = 0; i < tasks.size(); i++) {
ids.insert(tasks[i].id);
}
return ids;
}
};
struct server_task_result {
int id = -1;
json data;
bool stop;
bool error;
};
struct slot_params {
bool stream = true;
bool cache_prompt = true; // remember the prompt to avoid reprocessing the entire prompt
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict
int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<std::string> antiprompt;
struct common_params_sampling sampling;
struct common_params_speculative speculative;
};
struct server_slot {
int id;
int id_task = -1;
@@ -166,8 +83,6 @@ struct server_slot {
bool stopped_word = false;
bool stopped_limit = false;
bool timings_per_token = false;
bool oaicompat = false;
std::string oaicompat_model;
@@ -255,37 +170,39 @@ struct server_slot {
}
}
json get_formated_timings() const {
return json {
{"prompt_n", n_prompt_tokens_processed},
{"prompt_ms", t_prompt_processing},
{"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed},
{"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed},
result_timings get_timings() const {
result_timings timings;
timings.prompt_n = n_prompt_tokens_processed;
timings.prompt_ms = t_prompt_processing;
timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed;
timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
{"predicted_n", n_decoded},
{"predicted_ms", t_token_generation},
{"predicted_per_token_ms", t_token_generation / n_decoded},
{"predicted_per_second", 1e3 / t_token_generation * n_decoded},
};
timings.predicted_n = n_decoded;
timings.predicted_ms = t_token_generation;
timings.predicted_per_token_ms = t_token_generation / n_decoded;
timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
return timings;
}
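// Worked example (illustrative): 50 prompt tokens processed in 200 ms gives
// prompt_per_token_ms = 200.0 / 50 = 4 ms and prompt_per_second = 1e3 / 200 * 50 = 250 tok/s;
// the predicted_* fields use the same formulas with n_decoded and t_token_generation.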
size_t find_stopping_strings(const std::string & text, const size_t last_token_size, const stop_type type) {
size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) {
size_t stop_pos = std::string::npos;
for (const std::string & word : params.antiprompt) {
size_t pos;
if (type == STOP_TYPE_FULL) {
if (is_full_stop) {
const size_t tmp = word.size() + last_token_size;
const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
pos = text.find(word, from_pos);
} else {
// otherwise, partial stop
pos = find_partial_stop_string(word, text);
}
if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
if (type == STOP_TYPE_FULL) {
if (is_full_stop) {
stopped_word = true;
stopping_word = word;
has_next_token = false;
@@ -1108,14 +1025,14 @@ struct server_context {
const std::string str_test = slot.generated_text.substr(pos);
bool send_text = true;
size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true);
if (stop_pos != std::string::npos) {
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size());
} else if (slot.has_next_token) {
stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false);
send_text = stop_pos == std::string::npos;
}
@@ -1229,60 +1146,6 @@ struct server_context {
return slot.has_next_token; // continue
}
json get_formated_generation(const server_slot & slot) const {
std::vector<std::string> samplers;
samplers.reserve(slot.params.sampling.samplers.size());
for (const auto & sampler : slot.params.sampling.samplers) {
samplers.emplace_back(common_sampler_type_to_str(sampler));
}
return json {
{"n_ctx", slot.n_ctx},
{"n_predict", slot.n_predict}, // Server configured n_predict
{"model", params_base.model_alias},
{"seed", slot.params.sampling.seed},
{"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
{"temperature", slot.params.sampling.temp},
{"dynatemp_range", slot.params.sampling.dynatemp_range},
{"dynatemp_exponent", slot.params.sampling.dynatemp_exponent},
{"top_k", slot.params.sampling.top_k},
{"top_p", slot.params.sampling.top_p},
{"min_p", slot.params.sampling.min_p},
{"xtc_probability", slot.params.sampling.xtc_probability},
{"xtc_threshold", slot.params.sampling.xtc_threshold},
{"typical_p", slot.params.sampling.typ_p},
{"repeat_last_n", slot.params.sampling.penalty_last_n},
{"repeat_penalty", slot.params.sampling.penalty_repeat},
{"presence_penalty", slot.params.sampling.penalty_present},
{"frequency_penalty", slot.params.sampling.penalty_freq},
{"dry_multiplier", slot.params.sampling.dry_multiplier},
{"dry_base", slot.params.sampling.dry_base},
{"dry_allowed_length", slot.params.sampling.dry_allowed_length},
{"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n},
{"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers},
{"mirostat", slot.params.sampling.mirostat},
{"mirostat_tau", slot.params.sampling.mirostat_tau},
{"mirostat_eta", slot.params.sampling.mirostat_eta},
{"penalize_nl", slot.params.sampling.penalize_nl},
{"stop", slot.params.antiprompt},
{"max_tokens", slot.params.n_predict}, // User configured n_predict
{"n_keep", slot.params.n_keep},
{"n_discard", slot.params.n_discard},
{"ignore_eos", slot.params.sampling.ignore_eos},
{"stream", slot.params.stream},
//{"logit_bias", slot.params.sampling.logit_bias},
{"n_probs", slot.params.sampling.n_probs},
{"min_keep", slot.params.sampling.min_keep},
{"grammar", slot.params.sampling.grammar},
{"samplers", samplers},
{"speculative", slot.can_speculate()},
{"speculative.n_max", slot.params.speculative.n_max},
{"speculative.n_min", slot.params.speculative.n_min},
{"speculative.p_min", slot.params.speculative.p_min},
{"timings_per_token", slot.timings_per_token},
};
}
void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
send_error(task.id, error, type);
}
@@ -1294,27 +1157,18 @@ struct server_context {
void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str());
server_task_result res;
server_task_result_error res;
res.id = id_task;
res.stop = false;
res.error = true;
res.data = format_error_response(error, type);
res.err_type = type;
res.err_msg = error;
queue_results.send(res);
}
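// Assumption, not shown in this hunk: the json payload that the old code built with
// format_error_response(error, type) can still be produced at the HTTP boundary from the
// typed fields, e.g. format_error_response(res.err_msg, res.err_type).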
void send_partial_response(server_slot & slot, completion_token_output tkn) {
server_task_result res;
server_task_result_cmpl_partial res;
res.id = slot.id_task;
res.error = false;
res.stop = false;
res.data = json {
{"content", tkn.text_to_send},
{"stop", false},
{"id_slot", slot.id},
{"multimodal", false},
{"index", slot.index},
};
res.content = tkn.text_to_send;
if (slot.params.sampling.n_probs > 0) {
const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
@@ -1323,30 +1177,35 @@ struct server_context {
std::vector<completion_token_output> probs_output;
if (probs_pos < probs_stop_pos) {
probs_output = std::vector<completion_token_output>(
res.probs_output = std::vector<completion_token_output>(
slot.generated_token_probs.begin() + probs_pos,
slot.generated_token_probs.begin() + probs_stop_pos);
}
slot.n_sent_token_probs = probs_stop_pos;
res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
}
if (slot.oaicompat) {
res.data["oaicompat_token_ctr"] = slot.n_decoded;
res.data["model"] = slot.oaicompat_model;
}
if (slot.timings_per_token) {
res.data["timings"] = slot.get_formated_timings();
if (slot.params.timings_per_token) {
res.timings = slot.get_timings();
}
queue_results.send(res);
}
void send_final_response(const server_slot & slot) {
server_task_result res;
res.id = slot.id_task;
server_task_result_cmpl_final res;
res.id = slot.id_task;
res.id_slot = slot.id;
res.content = slot.generated_text;
res.n_decoded = slot.n_decoded;
res.n_prompt_tokens = slot.n_prompt_tokens;
res.has_new_line = slot.has_new_line;
res.n_tokens_cached = slot.n_past;
res.content = slot.generated_text;
res.params = slot.params; // copy the parameters
res.error = false;
res.stop = true;
res.data = json {
@@ -1370,36 +1229,27 @@ struct server_context {
};
if (slot.params.sampling.n_probs > 0) {
std::vector<completion_token_output> probs;
if (!slot.params.stream && slot.stopped_word) {
const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
probs = std::vector<completion_token_output>(
res.probs_output = std::vector<completion_token_output>(
slot.generated_token_probs.begin(),
slot.generated_token_probs.end() - safe_offset);
} else {
probs = std::vector<completion_token_output>(
res.probs_output = std::vector<completion_token_output>(
slot.generated_token_probs.begin(),
slot.generated_token_probs.end());
}
res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs);
}
if (slot.oaicompat) {
res.data["oaicompat_token_ctr"] = slot.n_decoded;
res.data["model"] = slot.oaicompat_model;
}
queue_results.send(res);
}
void send_embedding(const server_slot & slot, const llama_batch & batch) {
server_task_result res;
server_task_result_embd res;
res.id = slot.id_task;
res.error = false;
res.stop = true;
res.index = slot.index;
const int n_embd = llama_n_embd(model);
@@ -1418,20 +1268,12 @@ struct server_context {
if (embd == NULL) {
SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
res.data = json {
{"embedding", std::vector<float>(n_embd, 0.0f)},
{"index", slot.index},
};
res.embedding = std::vector<float>(n_embd, 0.0f);
continue;
}
common_embd_normalize(embd, embd_res.data(), n_embd);
res.data = json {
{"embedding", embd_res},
{"index", slot.index},
};
res.embedding = embd_res;
}
SLT_DBG(slot, "%s", "sending embeddings\n");
@@ -1440,10 +1282,9 @@
}
void send_rerank(const server_slot & slot, const llama_batch & batch) {
server_task_result res;
server_task_result_rerank res;
res.id = slot.id_task;
res.error = false;
res.stop = true;
res.index = slot.index;
for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
@@ -1458,21 +1299,14 @@ struct server_context {
if (embd == NULL) {
SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
res.data = json {
{"index", slot.index},
{"score", -1e6},
};
res.score = -1e6;
continue;
}
res.data = json {
{"index", slot.index},
{"score", embd[0]},
};
res.score = embd[0];
}
SLT_DBG(slot, "sending rerank result, res = '%s'\n", res.data.dump().c_str());
SLT_DBG(slot, "sending rerank result, res.score = %f\n", res.score);
queue_results.send(res);
}

examples/server/server.hpp (new file, 191 lines)

@@ -0,0 +1,191 @@
#pragma once
#include "common.h"
#include "llama.h"
#include "sampling.h"
#include "speculative.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include <string>
#include <memory>
#include <unordered_set>
using json = nlohmann::ordered_json;
enum stop_type {
STOP_TYPE_NONE,
STOP_TYPE_EOS,
STOP_TYPE_WORD,
STOP_TYPE_LIMIT,
};
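// Assumption (not stated in this diff): stop_type now records *why* generation stopped
// (EOS token, stop word, or token limit) instead of the old full/partial matching mode,
// which find_stopping_strings now receives as a plain bool (is_full_stop).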
// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
enum slot_state {
SLOT_STATE_IDLE,
SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
SLOT_STATE_PROCESSING_PROMPT,
SLOT_STATE_DONE_PROMPT,
SLOT_STATE_GENERATING,
};
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
};
enum server_task_type {
SERVER_TASK_TYPE_INFERENCE,
SERVER_TASK_TYPE_CANCEL,
SERVER_TASK_TYPE_NEXT_RESPONSE,
SERVER_TASK_TYPE_METRICS,
SERVER_TASK_TYPE_SLOT_SAVE,
SERVER_TASK_TYPE_SLOT_RESTORE,
SERVER_TASK_TYPE_SLOT_ERASE,
SERVER_TASK_TYPE_SET_LORA,
};
enum server_task_inf_type {
SERVER_TASK_INF_TYPE_COMPLETION,
SERVER_TASK_INF_TYPE_EMBEDDING,
SERVER_TASK_INF_TYPE_RERANK,
SERVER_TASK_INF_TYPE_INFILL,
};
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
ERROR_TYPE_INVALID_REQUEST,
ERROR_TYPE_AUTHENTICATION,
ERROR_TYPE_SERVER,
ERROR_TYPE_NOT_FOUND,
ERROR_TYPE_PERMISSION,
ERROR_TYPE_UNAVAILABLE, // custom error
ERROR_TYPE_NOT_SUPPORTED, // custom error
};
struct server_task {
int id = -1; // to be filled by server_queue
int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
llama_tokens prompt_tokens;
server_task_type type;
// TODO @ngxson : we should get rid of json type here
json data;
server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
// utility function
static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
std::unordered_set<int> ids(tasks.size());
for (size_t i = 0; i < tasks.size(); i++) {
ids.insert(tasks[i].id);
}
return ids;
}
};
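// Example use of get_list_id (assumed call site, not part of this diff): collect the ids
// of a batch of submitted tasks so their results can be matched up or cancelled together:
//   std::unordered_set<int> ids = server_task::get_list_id(tasks);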
struct result_timings {
int32_t prompt_n;
double prompt_ms;
double prompt_per_token_ms;
double prompt_per_second;
int32_t predicted_n;
double predicted_ms;
double predicted_per_token_ms;
double predicted_per_second;
};
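// Hypothetical helper (assumed, not part of this commit): rebuild the json that
// get_formated_timings used to return, keeping json confined to the HTTP layer.
static json result_timings_to_json(const result_timings & t) {
    return json {
        {"prompt_n",               t.prompt_n},
        {"prompt_ms",              t.prompt_ms},
        {"prompt_per_token_ms",    t.prompt_per_token_ms},
        {"prompt_per_second",      t.prompt_per_second},
        {"predicted_n",            t.predicted_n},
        {"predicted_ms",           t.predicted_ms},
        {"predicted_per_token_ms", t.predicted_per_token_ms},
        {"predicted_per_second",   t.predicted_per_second},
    };
}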
enum result_type {
RESULT_TYPE_CMPL_FINAL,
RESULT_TYPE_CMPL_PARTIAL,
RESULT_TYPE_EMBD,
RESULT_TYPE_RERANK,
RESULT_TYPE_ERROR,
RESULT_TYPE_UNKNOWN, // will throw an error
};
struct server_task_result {
result_type type = RESULT_TYPE_UNKNOWN;
int id = -1;
int id_slot = -1;
};
// slot_params and completion_token_output are declared before the result structs that hold them by value
struct slot_params {
bool stream = true;
bool cache_prompt = true; // remember the prompt to avoid reprocessing the entire prompt
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict
int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<std::string> antiprompt;
bool timings_per_token = false;
struct common_params_sampling sampling;
struct common_params_speculative speculative;
};
struct completion_token_output {
llama_token tok;
std::string text_to_send;
struct token_prob {
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
};
struct server_task_result_cmpl_final : server_task_result {
result_type type = RESULT_TYPE_CMPL_FINAL;
int index = 0;
std::string content;
bool stream;
bool timings_per_token;
result_timings timings;
int32_t n_decoded;
int32_t n_prompt_tokens;
bool has_new_line;
std::string stopping_word;
int32_t n_tokens_cached;
stop_type stop = STOP_TYPE_NONE;
std::vector<completion_token_output> probs_output;
slot_params params;
};
struct server_task_result_cmpl_partial : server_task_result {
result_type type = RESULT_TYPE_CMPL_PARTIAL;
int index = 0;
std::string content;
stop_type stop = STOP_TYPE_NONE;
std::vector<completion_token_output> probs_output;
result_timings timings;
};
struct server_task_result_embd : server_task_result {
result_type type = RESULT_TYPE_EMBD;
int index = 0;
std::vector<float> embedding;
};
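// Hypothetical boundary serialization (assumed, not part of this commit): reproduces the
// json shape the old send_embedding built inline ({"embedding": ..., "index": ...}).
static json embd_to_json(const server_task_result_embd & res) {
    return json {
        {"embedding", res.embedding},
        {"index",     res.index},
    };
}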
struct server_task_result_rerank : server_task_result {
result_type type = RESULT_TYPE_RERANK;
int index = 0;
float score;
};
struct server_task_result_error : server_task_result {
result_type type = RESULT_TYPE_ERROR;
int index = 0;
error_type err_type;
std::string err_msg;
};


@@ -3,6 +3,7 @@
#include "common.h"
#include "log.h"
#include "llama.h"
#include "server.hpp"
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
@@ -40,17 +41,6 @@ using json = nlohmann::ordered_json;
#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
ERROR_TYPE_INVALID_REQUEST,
ERROR_TYPE_AUTHENTICATION,
ERROR_TYPE_SERVER,
ERROR_TYPE_NOT_FOUND,
ERROR_TYPE_PERMISSION,
ERROR_TYPE_UNAVAILABLE, // custom error
ERROR_TYPE_NOT_SUPPORTED, // custom error
};
template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
// Fallback null to default value
@@ -485,18 +475,6 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
return out;
}
struct completion_token_output {
llama_token tok;
std::string text_to_send;
struct token_prob {
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
};
// convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
json out = json::array();