remove server.hpp
parent 8ab173c865
commit 1cf769be67
3 changed files with 675 additions and 695 deletions
@@ -1,5 +1,4 @@
#include "utils.hpp"
#include "server.hpp"

#include "arg.h"
#include "common.h"
@@ -33,9 +32,682 @@

using json = nlohmann::ordered_json;

enum stop_type {
    STOP_TYPE_NONE,
    STOP_TYPE_EOS,
    STOP_TYPE_WORD,
    STOP_TYPE_LIMIT,
};

// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
enum slot_state {
    SLOT_STATE_IDLE,
    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
    SLOT_STATE_PROCESSING_PROMPT,
    SLOT_STATE_DONE_PROMPT,
    SLOT_STATE_GENERATING,
};

enum server_state {
    SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
    SERVER_STATE_READY,         // Server is ready and model is loaded
};

enum server_task_type {
    SERVER_TASK_TYPE_INFERENCE,
    SERVER_TASK_TYPE_CANCEL,
    SERVER_TASK_TYPE_NEXT_RESPONSE,
    SERVER_TASK_TYPE_METRICS,
    SERVER_TASK_TYPE_SLOT_SAVE,
    SERVER_TASK_TYPE_SLOT_RESTORE,
    SERVER_TASK_TYPE_SLOT_ERASE,
    SERVER_TASK_TYPE_SET_LORA,
};

enum server_task_inf_type {
    SERVER_TASK_INF_TYPE_COMPLETION,
    SERVER_TASK_INF_TYPE_EMBEDDING,
    SERVER_TASK_INF_TYPE_RERANK,
    SERVER_TASK_INF_TYPE_INFILL,
};

// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
    ERROR_TYPE_INVALID_REQUEST,
    ERROR_TYPE_AUTHENTICATION,
    ERROR_TYPE_SERVER,
    ERROR_TYPE_NOT_FOUND,
    ERROR_TYPE_PERMISSION,
    ERROR_TYPE_UNAVAILABLE,   // custom error
    ERROR_TYPE_NOT_SUPPORTED, // custom error
};

struct server_task {
    int id = -1; // to be filled by server_queue
    int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL

    llama_tokens prompt_tokens;
    server_task_type type;

    // TODO @ngxson : we should get rid of json type here
    json data;

    server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;

    // utility function
    static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
        std::unordered_set<int> ids(tasks.size());
        for (size_t i = 0; i < tasks.size(); i++) {
            ids.insert(tasks[i].id);
        }
        return ids;
    }
};
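// example (hypothetical caller, for illustration only): after building a batch
// of tasks, collect their ids up front so the corresponding results can be
// matched back or cancelled as a group:
//     std::unordered_set<int> ids = server_task::get_list_id(tasks);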

struct slot_params {
    bool stream = true;
    bool cache_prompt = true; // remember the prompt to avoid reprocessing the entire prompt

    int32_t n_keep = 0; // number of tokens to keep from initial prompt
    int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
    int32_t n_predict = -1; // new tokens to predict
    int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters

    int64_t t_max_prompt_ms = -1; // TODO: implement
    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit

    std::vector<std::string> antiprompt;
    bool timings_per_token = false;

    struct common_params_sampling sampling;
    struct common_params_speculative speculative;

    // params only used in to_json()
    int32_t n_ctx;
    uint32_t seed_cur;
    bool can_speculative;

    // OAI-compat fields
    bool oaicompat = false;
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    bool verbose = false;

    json to_json() {
        std::vector<std::string> samplers;
        samplers.reserve(sampling.samplers.size());
        for (const auto & sampler : sampling.samplers) {
            samplers.emplace_back(common_sampler_type_to_str(sampler));
        }

        return json {
            {"n_ctx", n_ctx},
            {"n_predict", n_predict}, // Server configured n_predict
            {"temperature", sampling.temp},
            {"dynatemp_range", sampling.dynatemp_range},
            {"dynatemp_exponent", sampling.dynatemp_exponent},
            {"top_k", sampling.top_k},
            {"top_p", sampling.top_p},
            {"min_p", sampling.min_p},
            {"xtc_probability", sampling.xtc_probability},
            {"xtc_threshold", sampling.xtc_threshold},
            {"typical_p", sampling.typ_p},
            {"repeat_last_n", sampling.penalty_last_n},
            {"repeat_penalty", sampling.penalty_repeat},
            {"presence_penalty", sampling.penalty_present},
            {"frequency_penalty", sampling.penalty_freq},
            {"dry_multiplier", sampling.dry_multiplier},
            {"dry_base", sampling.dry_base},
            {"dry_allowed_length", sampling.dry_allowed_length},
            {"dry_penalty_last_n", sampling.dry_penalty_last_n},
            {"dry_sequence_breakers", sampling.dry_sequence_breakers},
            {"mirostat", sampling.mirostat},
            {"mirostat_tau", sampling.mirostat_tau},
            {"mirostat_eta", sampling.mirostat_eta},
            {"penalize_nl", sampling.penalize_nl},
            {"stop", antiprompt},
            {"max_tokens", n_predict}, // User configured n_predict
            {"n_keep", n_keep},
            {"n_discard", n_discard},
            {"ignore_eos", sampling.ignore_eos},
            {"stream", stream},
            //{"logit_bias", sampling.logit_bias},
            {"n_probs", sampling.n_probs},
            {"min_keep", sampling.min_keep},
            {"grammar", sampling.grammar},
            {"samplers", samplers},
            {"speculative", can_speculative},
            {"speculative.n_max", speculative.n_max},
            {"speculative.n_min", speculative.n_min},
            {"speculative.p_min", speculative.p_min},
            {"timings_per_token", timings_per_token},
        };
    }
};

struct result_timings {
    int32_t prompt_n = -1;
    double prompt_ms;
    double prompt_per_token_ms;
    double prompt_per_second;

    int32_t predicted_n = -1;
    double predicted_ms;
    double predicted_per_token_ms;
    double predicted_per_second;

    json to_json() {
        return {
            {"prompt_n", prompt_n},
            {"prompt_ms", prompt_ms},
            {"prompt_per_token_ms", prompt_per_token_ms},
            {"prompt_per_second", prompt_per_second},

            {"predicted_n", predicted_n},
            {"predicted_ms", predicted_ms},
            {"predicted_per_token_ms", predicted_per_token_ms},
            {"predicted_per_second", predicted_per_second},
        };
    }
};

struct server_task_result {
    int id = -1;
    int id_slot = -1;
    virtual bool is_error() {
        // only used by server_task_result_error
        return false;
    }
    virtual bool is_stop() {
        // only used by server_task_result_cmpl_partial
        return false;
    }
    virtual int get_index() {
        return -1;
    }
    virtual json to_json() = 0;
    virtual json to_json_oai_compat() {
        // used by server_task_result_cmpl_final and server_task_result_cmpl_partial
        return json();
    }
    virtual ~server_task_result() = default;
};

// using unique_ptr for polymorphism of server_task_result
using task_result_ptr = std::unique_ptr<server_task_result>;

inline std::string stop_type_to_str(stop_type type) {
    switch (type) {
        case STOP_TYPE_EOS: return "eos";
        case STOP_TYPE_WORD: return "word";
        case STOP_TYPE_LIMIT: return "limit";
        default: return "none";
    }
}

struct completion_token_output {
    llama_token tok;
    std::string text_to_send;
    struct token_prob {
        llama_token tok;
        float prob;
    };
    std::vector<token_prob> probs;

    json to_json(const llama_context * ctx) const {
        json probs_for_token = json::array();
        for (const auto & p : probs) {
            const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
            probs_for_token.push_back(json {
                {"tok_str", tok_str},
                {"prob", p.prob},
            });
        }
        return probs_for_token;
    }

    static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
        json out = json::array();
        for (const auto & prob : probs) {
            const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
            out.push_back(json {
                {"content", tok_str},
                {"probs", prob.to_json(ctx)},
            });
        }
        return out;
    }
};
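// shape of the JSON produced by probs_vector_to_json above (values illustrative):
//     [ { "content": "Hello", "probs": [ { "tok_str": "Hello", "prob": 0.91 }, ... ] }, ... ]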

struct server_task_result_cmpl_final : server_task_result {
    int index = 0;
    std::string content;
    bool stream;
    result_timings timings;
    std::string prompt;

    bool truncated;
    int32_t n_decoded;
    int32_t n_prompt_tokens;
    int32_t n_tokens_cached;
    int32_t has_new_line;
    std::string stopping_word;
    stop_type stop = STOP_TYPE_NONE;

    std::vector<completion_token_output> probs_output;

    slot_params generation_params;

    // OAI-compat fields
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    bool verbose = false;

    virtual int get_index() override {
        return index;
    }

    virtual json to_json() override {
        // non-OAI-compat JSON
        return json {
            {"index", index},
            {"content", content},
            {"id_slot", id_slot},
            {"stop", true},
            {"model", oaicompat_model},
            {"tokens_predicted", n_decoded},
            {"tokens_evaluated", n_prompt_tokens},
            {"generation_settings", generation_params.to_json()},
            {"prompt", prompt},
            {"has_new_line", has_new_line},
            {"truncated", truncated},
            {"stop_type", stop_type_to_str(stop)},
            {"stopping_word", stopping_word},
            {"tokens_cached", n_tokens_cached},
            {"timings", timings.to_json()},
        };
    }

    virtual json to_json_oai_compat() override {
        std::string finish_reason = "length";
        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
            finish_reason = "stop";
        }

        json choices = json::array({json{
            {"finish_reason", finish_reason},
            {"index", 0},
            {"message", json{
                {"content", content},
                {"role", "assistant"}
            }
        }}});

        std::time_t t = std::time(0);

        json res = json {
            {"choices", choices},
            {"created", t},
            {"model", oaicompat_model},
            {"object", "chat.completion"},
            {"usage", json {
                {"completion_tokens", n_decoded},
                {"prompt_tokens", n_prompt_tokens},
                {"total_tokens", n_decoded + n_prompt_tokens}
            }},
            {"id", oaicompat_cmpl_id}
        };

        // extra fields for debugging purposes
        if (verbose) {
            res["__verbose"] = to_json();
        }

        // TODO: fix this
        // if (result.contains("completion_probabilities")) {
        //     res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
        // }

        if (timings.prompt_n >= 0) {
            res.push_back({"timings", timings.to_json()});
        }

        return res;
    }
};
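// shape of to_json_oai_compat() above (values illustrative):
//     { "choices": [ { "finish_reason": "stop", "index": 0,
//                      "message": { "content": "...", "role": "assistant" } } ],
//       "created": 1700000000, "model": "...", "object": "chat.completion",
//       "usage": { "completion_tokens": 8, "prompt_tokens": 5, "total_tokens": 13 },
//       "id": "chatcmpl-..." }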

struct server_task_result_cmpl_partial : server_task_result {
    int index = 0;
    std::string content;

    bool truncated;
    int32_t n_decoded;
    int32_t n_prompt_tokens;

    stop_type stop = STOP_TYPE_NONE;

    std::vector<completion_token_output> probs_output;
    result_timings timings;

    // OAI-compat fields
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    bool verbose = false;

    virtual int get_index() override {
        return index;
    }

    virtual bool is_stop() override {
        return stop != STOP_TYPE_NONE;
    }

    virtual json to_json() override {
        bool is_stop = stop != STOP_TYPE_NONE;
        // non-OAI-compat JSON
        json res = json {
            {"index", index},
            {"content", content},
            {"stop_type", stop_type_to_str(stop)},
            {"stop", is_stop},
            {"id_slot", id_slot},
            {"tokens_predicted", n_decoded},
            {"tokens_evaluated", n_prompt_tokens},
        };
        // populate the timings object when needed (usually for the last response or with timings_per_token enabled)
        if (timings.prompt_n > 0) {
            res.push_back({"timings", timings.to_json()});
        }
        if (is_stop) {
            res.push_back({"truncated", truncated});
        }
        return res;
    }

    virtual json to_json_oai_compat() override {
        bool first = n_decoded == 0;

        std::string finish_reason;
        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
            finish_reason = "stop";
        } else if (stop == STOP_TYPE_LIMIT) {
            finish_reason = "length";
        }

        std::time_t t = std::time(0);

        json choices;

        if (!finish_reason.empty()) {
            choices = json::array({json{{"finish_reason", finish_reason},
                                        {"index", 0},
                                        {"delta", json::object()}}});
        } else {
            if (first) {
                if (content.empty()) {
                    choices = json::array({json{{"finish_reason", nullptr},
                                                {"index", 0},
                                                {"delta", json{{"role", "assistant"}}}}});
                } else {
                    // We have to send this as two updates to conform to openai behavior
                    json initial_ret = json{{"choices", json::array({json{
                                            {"finish_reason", nullptr},
                                            {"index", 0},
                                            {"delta", json{
                                                {"role", "assistant"}
                                            }}}})},
                                            {"created", t},
                                            {"id", oaicompat_cmpl_id},
                                            {"model", oaicompat_model},
                                            {"object", "chat.completion.chunk"}};

                    json second_ret = json{
                                        {"choices", json::array({json{{"finish_reason", nullptr},
                                                                      {"index", 0},
                                                                      {"delta", json{
                                                                          {"content", content}}}
                                                                      }})},
                                        {"created", t},
                                        {"id", oaicompat_cmpl_id},
                                        {"model", oaicompat_model},
                                        {"object", "chat.completion.chunk"}};

                    return std::vector<json>({initial_ret, second_ret});
                }
            } else {
                // Some idiosyncrasy in task processing logic makes several trailing calls
                // with empty content; we ignore these at the callee site.
                if (content.empty()) {
                    return std::vector<json>({json::object()});
                }

                choices = json::array({json{
                    {"finish_reason", nullptr},
                    {"index", 0},
                    {"delta",
                     json{
                         {"content", content},
                     }},
                }});
            }
        }

        json ret = json {
            {"choices", choices},
            {"created", t},
            {"id", oaicompat_cmpl_id},
            {"model", oaicompat_model},
            {"object", "chat.completion.chunk"}
        };

        if (timings.prompt_n >= 0) {
            ret.push_back({"timings", timings.to_json()});
        }

        if (!finish_reason.empty()) {
            ret.push_back({"usage", json {
                {"completion_tokens", n_decoded},
                {"prompt_tokens", n_prompt_tokens},
                {"total_tokens", n_decoded + n_prompt_tokens},
            }});
        }

        return std::vector<json>({ret});
    }
};
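// a stream produced by the logic above therefore looks like (illustrative):
// one chunk whose delta carries {"role": "assistant"} (split into two chunks
// when the first token already has content), then one chunk per token with
// {"content": ...}, and finally a chunk with a non-null finish_reason plus a
// "usage" object.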

struct server_task_result_embd : server_task_result {
    int index = 0;
    std::vector<float> embedding;

    virtual int get_index() override {
        return index;
    }

    virtual json to_json() override {
        return json {
            {"index", index},
            {"embedding", embedding},
        };
    }
};

struct server_task_result_rerank : server_task_result {
    int index = 0;
    float score = -1e6;

    virtual int get_index() override {
        return index;
    }

    virtual json to_json() override {
        return json {
            {"index", index},
            {"score", score},
        };
    }
};

// this function may be used outside of server_task_result_error
static json format_error_response(const std::string & message, const enum error_type type) {
    std::string type_str;
    int code = 500;
    switch (type) {
        case ERROR_TYPE_INVALID_REQUEST:
            type_str = "invalid_request_error";
            code = 400;
            break;
        case ERROR_TYPE_AUTHENTICATION:
            type_str = "authentication_error";
            code = 401;
            break;
        case ERROR_TYPE_NOT_FOUND:
            type_str = "not_found_error";
            code = 404;
            break;
        case ERROR_TYPE_SERVER:
            type_str = "server_error";
            code = 500;
            break;
        case ERROR_TYPE_PERMISSION:
            type_str = "permission_error";
            code = 403;
            break;
        case ERROR_TYPE_NOT_SUPPORTED:
            type_str = "not_supported_error";
            code = 501;
            break;
        case ERROR_TYPE_UNAVAILABLE:
            type_str = "unavailable_error";
            code = 503;
            break;
    }
    return json {
        {"code", code},
        {"message", message},
        {"type", type_str},
    };
}
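// e.g. format_error_response("model not found", ERROR_TYPE_NOT_FOUND)
//      -> { "code": 404, "message": "model not found", "type": "not_found_error" }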

struct server_task_result_error : server_task_result {
    int index = 0;
    error_type err_type = ERROR_TYPE_SERVER;
    std::string err_msg;

    virtual bool is_error() override {
        return true;
    }

    virtual json to_json() override {
        return format_error_response(err_msg, err_type);
    }
};

struct server_task_result_metrics : server_task_result {
    int n_idle_slots;
    int n_processing_slots;
    int n_tasks_deferred;
    int64_t t_start;

    int32_t kv_cache_tokens_count;
    int32_t kv_cache_used_cells;

    // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
    uint64_t n_prompt_tokens_processed_total = 0;
    uint64_t t_prompt_processing_total = 0;
    uint64_t n_tokens_predicted_total = 0;
    uint64_t t_tokens_generation_total = 0;

    uint64_t n_prompt_tokens_processed = 0;
    uint64_t t_prompt_processing = 0;

    uint64_t n_tokens_predicted = 0;
    uint64_t t_tokens_generation = 0;

    uint64_t n_decode_total = 0;
    uint64_t n_busy_slots_total = 0;

    // TODO: get rid of this json object and use to_json() instead
    json slots_data = json::array();

    virtual json to_json() override {
        return json {
            { "idle", n_idle_slots },
            { "processing", n_processing_slots },
            { "deferred", n_tasks_deferred },
            { "t_start", t_start },

            { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total },
            { "t_tokens_generation_total", t_tokens_generation_total },
            { "n_tokens_predicted_total", n_tokens_predicted_total },
            { "t_prompt_processing_total", t_prompt_processing_total },

            { "n_prompt_tokens_processed", n_prompt_tokens_processed },
            { "t_prompt_processing", t_prompt_processing },
            { "n_tokens_predicted", n_tokens_predicted },
            { "t_tokens_generation", t_tokens_generation },

            { "n_decode_total", n_decode_total },
            { "n_busy_slots_total", n_busy_slots_total },

            { "kv_cache_tokens_count", kv_cache_tokens_count },
            { "kv_cache_used_cells", kv_cache_used_cells },

            { "slots", slots_data },
        };
    }
};

struct server_task_result_slot_save_load : server_task_result {
    std::string filename;
    bool is_save; // true = save, false = load

    size_t n_tokens;
    size_t n_bytes;
    double t_ms;

    virtual json to_json() override {
        if (is_save) {
            return json {
                { "id_slot", id_slot },
                { "filename", filename },
                { "n_saved", n_tokens },
                { "n_written", n_bytes },
                { "timings", {
                    { "save_ms", t_ms }
                }},
            };
        } else {
            return json {
                { "id_slot", id_slot },
                { "filename", filename },
                { "n_restored", n_tokens },
                { "n_read", n_bytes },
                { "timings", {
                    { "restore_ms", t_ms }
                }},
            };
        }
    }
};

struct server_task_result_slot_erase : server_task_result {
    size_t n_erased;

    virtual json to_json() override {
        return json {
            { "id_slot", id_slot },
            { "n_erased", n_erased },
        };
    }
};

struct server_task_result_apply_lora : server_task_result {
    virtual json to_json() override {
        return json {{ "success", true }};
    }
};

struct server_slot {
    int id;
    int id_task = -1;

@@ -786,8 +1458,9 @@ struct server_context {
        const auto & data = task.data;

        if (data.count("__oaicompat") != 0) {
            std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias;
            slot.params.oaicompat = true;
            slot.params.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
            slot.params.oaicompat_model = json_value(data, "model", model_name);
            slot.params.oaicompat_cmpl_id = json_value(data, "completion_id", std::string());
        } else {
            slot.params.oaicompat = false;

@@ -2835,8 +3508,6 @@ int main(int argc, char ** argv) {

        json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);

        std::string model_name = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));

        return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res, true);
    };

@@ -1,665 +0,0 @@
#pragma once

#include "common.h"
#include "llama.h"
#include "sampling.h"
#include "speculative.h"

// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"

#include <string>
#include <memory>
#include <unordered_set>

using json = nlohmann::ordered_json;

enum stop_type {
    STOP_TYPE_NONE,
    STOP_TYPE_EOS,
    STOP_TYPE_WORD,
    STOP_TYPE_LIMIT,
};

// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
enum slot_state {
    SLOT_STATE_IDLE,
    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
    SLOT_STATE_PROCESSING_PROMPT,
    SLOT_STATE_DONE_PROMPT,
    SLOT_STATE_GENERATING,
};

enum server_state {
    SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
    SERVER_STATE_READY,         // Server is ready and model is loaded
};

enum server_task_type {
    SERVER_TASK_TYPE_INFERENCE,
    SERVER_TASK_TYPE_CANCEL,
    SERVER_TASK_TYPE_NEXT_RESPONSE,
    SERVER_TASK_TYPE_METRICS,
    SERVER_TASK_TYPE_SLOT_SAVE,
    SERVER_TASK_TYPE_SLOT_RESTORE,
    SERVER_TASK_TYPE_SLOT_ERASE,
    SERVER_TASK_TYPE_SET_LORA,
};

enum server_task_inf_type {
    SERVER_TASK_INF_TYPE_COMPLETION,
    SERVER_TASK_INF_TYPE_EMBEDDING,
    SERVER_TASK_INF_TYPE_RERANK,
    SERVER_TASK_INF_TYPE_INFILL,
};

// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
    ERROR_TYPE_INVALID_REQUEST,
    ERROR_TYPE_AUTHENTICATION,
    ERROR_TYPE_SERVER,
    ERROR_TYPE_NOT_FOUND,
    ERROR_TYPE_PERMISSION,
    ERROR_TYPE_UNAVAILABLE,   // custom error
    ERROR_TYPE_NOT_SUPPORTED, // custom error
};

struct server_task {
    int id = -1; // to be filled by server_queue
    int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL

    llama_tokens prompt_tokens;
    server_task_type type;

    // TODO @ngxson : we should get rid of json type here
    json data;

    server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;

    // utility function
    static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
        std::unordered_set<int> ids(tasks.size());
        for (size_t i = 0; i < tasks.size(); i++) {
            ids.insert(tasks[i].id);
        }
        return ids;
    }
};

struct slot_params {
    bool stream = true;
    bool cache_prompt = true; // remember the prompt to avoid reprocessing the entire prompt

    int32_t n_keep = 0; // number of tokens to keep from initial prompt
    int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
    int32_t n_predict = -1; // new tokens to predict
    int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters

    int64_t t_max_prompt_ms = -1; // TODO: implement
    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit

    std::vector<std::string> antiprompt;
    bool timings_per_token = false;

    struct common_params_sampling sampling;
    struct common_params_speculative speculative;

    // params only used in to_json()
    int32_t n_ctx;
    uint32_t seed_cur;
    bool can_speculative;

    // OAI-compat fields
    bool oaicompat = false;
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    bool verbose = false;

    json to_json() {
        std::vector<std::string> samplers;
        samplers.reserve(sampling.samplers.size());
        for (const auto & sampler : sampling.samplers) {
            samplers.emplace_back(common_sampler_type_to_str(sampler));
        }

        return json {
            {"n_ctx", n_ctx},
            {"n_predict", n_predict}, // Server configured n_predict
            {"temperature", sampling.temp},
            {"dynatemp_range", sampling.dynatemp_range},
            {"dynatemp_exponent", sampling.dynatemp_exponent},
            {"top_k", sampling.top_k},
            {"top_p", sampling.top_p},
            {"min_p", sampling.min_p},
            {"xtc_probability", sampling.xtc_probability},
            {"xtc_threshold", sampling.xtc_threshold},
            {"typical_p", sampling.typ_p},
            {"repeat_last_n", sampling.penalty_last_n},
            {"repeat_penalty", sampling.penalty_repeat},
            {"presence_penalty", sampling.penalty_present},
            {"frequency_penalty", sampling.penalty_freq},
            {"dry_multiplier", sampling.dry_multiplier},
            {"dry_base", sampling.dry_base},
            {"dry_allowed_length", sampling.dry_allowed_length},
            {"dry_penalty_last_n", sampling.dry_penalty_last_n},
            {"dry_sequence_breakers", sampling.dry_sequence_breakers},
            {"mirostat", sampling.mirostat},
            {"mirostat_tau", sampling.mirostat_tau},
            {"mirostat_eta", sampling.mirostat_eta},
            {"penalize_nl", sampling.penalize_nl},
            {"stop", antiprompt},
            {"max_tokens", n_predict}, // User configured n_predict
            {"n_keep", n_keep},
            {"n_discard", n_discard},
            {"ignore_eos", sampling.ignore_eos},
            {"stream", stream},
            //{"logit_bias", sampling.logit_bias},
            {"n_probs", sampling.n_probs},
            {"min_keep", sampling.min_keep},
            {"grammar", sampling.grammar},
            {"samplers", samplers},
            {"speculative", can_speculative},
            {"speculative.n_max", speculative.n_max},
            {"speculative.n_min", speculative.n_min},
            {"speculative.p_min", speculative.p_min},
            {"timings_per_token", timings_per_token},
        };
    }
};

struct result_timings {
    int32_t prompt_n = -1;
    double prompt_ms;
    double prompt_per_token_ms;
    double prompt_per_second;

    int32_t predicted_n = -1;
    double predicted_ms;
    double predicted_per_token_ms;
    double predicted_per_second;

    json to_json() {
        return {
            {"prompt_n", prompt_n},
            {"prompt_ms", prompt_ms},
            {"prompt_per_token_ms", prompt_per_token_ms},
            {"prompt_per_second", prompt_per_second},

            {"predicted_n", predicted_n},
            {"predicted_ms", predicted_ms},
            {"predicted_per_token_ms", predicted_per_token_ms},
            {"predicted_per_second", predicted_per_second},
        };
    }
};

struct server_task_result {
    int id = -1;
    int id_slot = -1;
    virtual bool is_error() {
        // only used by server_task_result_error
        return false;
    }
    virtual bool is_stop() {
        // only used by server_task_result_cmpl_partial
        return false;
    }
    virtual int get_index() {
        return -1;
    }
    virtual json to_json() = 0;
    virtual json to_json_oai_compat() {
        // used by server_task_result_cmpl_final and server_task_result_cmpl_partial
        return json();
    }
    virtual ~server_task_result() = default;
};

inline std::string stop_type_to_str(stop_type type) {
    switch (type) {
        case STOP_TYPE_EOS: return "eos";
        case STOP_TYPE_WORD: return "word";
        case STOP_TYPE_LIMIT: return "limit";
        default: return "none";
    }
}

struct completion_token_output {
    llama_token tok;
    std::string text_to_send;
    struct token_prob {
        llama_token tok;
        float prob;
    };
    std::vector<token_prob> probs;
};

struct server_task_result_cmpl_final : server_task_result {
    int index = 0;
    std::string content;
    bool stream;
    result_timings timings;
    std::string prompt;

    bool truncated;
    int32_t n_decoded;
    int32_t n_prompt_tokens;
    int32_t n_tokens_cached;
    int32_t has_new_line;
    std::string stopping_word;
    stop_type stop = STOP_TYPE_NONE;

    std::vector<completion_token_output> probs_output;

    slot_params generation_params;

    // OAI-compat fields
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    bool verbose = false;

    virtual int get_index() override {
        return index;
    }

    virtual json to_json() override {
        // non-OAI-compat JSON
        return json {
            {"index", index},
            {"content", content},
            {"id_slot", id_slot},
            {"stop", true},
            {"model", oaicompat_model},
            {"tokens_predicted", n_decoded},
            {"tokens_evaluated", n_prompt_tokens},
            {"generation_settings", generation_params.to_json()},
            {"prompt", prompt},
            {"has_new_line", has_new_line},
            {"truncated", truncated},
            {"stop_type", stop_type_to_str(stop)},
            {"stopping_word", stopping_word},
            {"tokens_cached", n_tokens_cached},
            {"timings", timings.to_json()},
        };
    }

    virtual json to_json_oai_compat() override {
        std::string finish_reason = "length";
        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
            finish_reason = "stop";
        }

        json choices = json::array({json{
            {"finish_reason", finish_reason},
            {"index", 0},
            {"message", json{
                {"content", content},
                {"role", "assistant"}
            }
        }}});

        std::time_t t = std::time(0);

        json res = json {
            {"choices", choices},
            {"created", t},
            {"model", oaicompat_model},
            {"object", "chat.completion"},
            {"usage", json {
                {"completion_tokens", n_decoded},
                {"prompt_tokens", n_prompt_tokens},
                {"total_tokens", n_decoded + n_prompt_tokens}
            }},
            {"id", oaicompat_cmpl_id}
        };

        // extra fields for debugging purposes
        if (verbose) {
            res["__verbose"] = to_json();
        }

        // TODO: fix this
        // if (result.contains("completion_probabilities")) {
        //     res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
        // }

        if (timings.prompt_n >= 0) {
            res.push_back({"timings", timings.to_json()});
        }

        return res;
    }
};

struct server_task_result_cmpl_partial : server_task_result {
    int index = 0;
    std::string content;

    bool truncated;
    int32_t n_decoded;
    int32_t n_prompt_tokens;

    stop_type stop = STOP_TYPE_NONE;

    std::vector<completion_token_output> probs_output;
    result_timings timings;

    // OAI-compat fields
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    bool verbose = false;

    virtual int get_index() override {
        return index;
    }

    virtual bool is_stop() override {
        return stop != STOP_TYPE_NONE;
    }

    virtual json to_json() override {
        bool is_stop = stop != STOP_TYPE_NONE;
        // non-OAI-compat JSON
        json res = json {
            {"index", index},
            {"content", content},
            {"stop_type", stop_type_to_str(stop)},
            {"stop", is_stop},
            {"id_slot", id_slot},
            {"tokens_predicted", n_decoded},
            {"tokens_evaluated", n_prompt_tokens},
        };
        // populate the timings object when needed (usually for the last response or with timings_per_token enabled)
        if (timings.prompt_n > 0) {
            res.push_back({"timings", timings.to_json()});
        }
        if (is_stop) {
            res.push_back({"truncated", truncated});
        }
        return res;
    }

    virtual json to_json_oai_compat() override {
        bool first = n_decoded == 0;

        std::string finish_reason;
        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
            finish_reason = "stop";
        } else if (stop == STOP_TYPE_LIMIT) {
            finish_reason = "length";
        }

        std::time_t t = std::time(0);

        json choices;

        if (!finish_reason.empty()) {
            choices = json::array({json{{"finish_reason", finish_reason},
                                        {"index", 0},
                                        {"delta", json::object()}}});
        } else {
            if (first) {
                if (content.empty()) {
                    choices = json::array({json{{"finish_reason", nullptr},
                                                {"index", 0},
                                                {"delta", json{{"role", "assistant"}}}}});
                } else {
                    // We have to send this as two updates to conform to openai behavior
                    json initial_ret = json{{"choices", json::array({json{
                                            {"finish_reason", nullptr},
                                            {"index", 0},
                                            {"delta", json{
                                                {"role", "assistant"}
                                            }}}})},
                                            {"created", t},
                                            {"id", oaicompat_cmpl_id},
                                            {"model", oaicompat_model},
                                            {"object", "chat.completion.chunk"}};

                    json second_ret = json{
                                        {"choices", json::array({json{{"finish_reason", nullptr},
                                                                      {"index", 0},
                                                                      {"delta", json{
                                                                          {"content", content}}}
                                                                      }})},
                                        {"created", t},
                                        {"id", oaicompat_cmpl_id},
                                        {"model", oaicompat_model},
                                        {"object", "chat.completion.chunk"}};

                    return std::vector<json>({initial_ret, second_ret});
                }
            } else {
                // Some idiosyncrasy in task processing logic makes several trailing calls
                // with empty content; we ignore these at the callee site.
                if (content.empty()) {
                    return std::vector<json>({json::object()});
                }

                choices = json::array({json{
                    {"finish_reason", nullptr},
                    {"index", 0},
                    {"delta",
                     json{
                         {"content", content},
                     }},
                }});
            }
        }

        json ret = json {
            {"choices", choices},
            {"created", t},
            {"id", oaicompat_cmpl_id},
            {"model", oaicompat_model},
            {"object", "chat.completion.chunk"}
        };

        if (timings.prompt_n >= 0) {
            ret.push_back({"timings", timings.to_json()});
        }

        if (!finish_reason.empty()) {
            ret.push_back({"usage", json {
                {"completion_tokens", n_decoded},
                {"prompt_tokens", n_prompt_tokens},
                {"total_tokens", n_decoded + n_prompt_tokens},
            }});
        }

        return std::vector<json>({ret});
    }
};

struct server_task_result_embd : server_task_result {
    int index = 0;
    std::vector<float> embedding;

    virtual int get_index() override {
        return index;
    }

    virtual json to_json() override {
        return json {
            {"index", index},
            {"embedding", embedding},
        };
    }
};

struct server_task_result_rerank : server_task_result {
    int index = 0;
    float score = -1e6;

    virtual int get_index() override {
        return index;
    }

    virtual json to_json() override {
        return json {
            {"index", index},
            {"score", score},
        };
    }
};

// this function may be used outside of server_task_result_error
static json format_error_response(const std::string & message, const enum error_type type) {
    std::string type_str;
    int code = 500;
    switch (type) {
        case ERROR_TYPE_INVALID_REQUEST:
            type_str = "invalid_request_error";
            code = 400;
            break;
        case ERROR_TYPE_AUTHENTICATION:
            type_str = "authentication_error";
            code = 401;
            break;
        case ERROR_TYPE_NOT_FOUND:
            type_str = "not_found_error";
            code = 404;
            break;
        case ERROR_TYPE_SERVER:
            type_str = "server_error";
            code = 500;
            break;
        case ERROR_TYPE_PERMISSION:
            type_str = "permission_error";
            code = 403;
            break;
        case ERROR_TYPE_NOT_SUPPORTED:
            type_str = "not_supported_error";
            code = 501;
            break;
        case ERROR_TYPE_UNAVAILABLE:
            type_str = "unavailable_error";
            code = 503;
            break;
    }
    return json {
        {"code", code},
        {"message", message},
        {"type", type_str},
    };
}

struct server_task_result_error : server_task_result {
    int index = 0;
    error_type err_type = ERROR_TYPE_SERVER;
    std::string err_msg;

    virtual bool is_error() override {
        return true;
    }

    virtual json to_json() override {
        return format_error_response(err_msg, err_type);
    }
};

struct server_task_result_metrics : server_task_result {
    int n_idle_slots;
    int n_processing_slots;
    int n_tasks_deferred;
    int64_t t_start;

    int32_t kv_cache_tokens_count;
    int32_t kv_cache_used_cells;

    // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
    uint64_t n_prompt_tokens_processed_total = 0;
    uint64_t t_prompt_processing_total = 0;
    uint64_t n_tokens_predicted_total = 0;
    uint64_t t_tokens_generation_total = 0;

    uint64_t n_prompt_tokens_processed = 0;
    uint64_t t_prompt_processing = 0;

    uint64_t n_tokens_predicted = 0;
    uint64_t t_tokens_generation = 0;

    uint64_t n_decode_total = 0;
    uint64_t n_busy_slots_total = 0;

    // TODO: get rid of this json object and use to_json() instead
    json slots_data = json::array();

    virtual json to_json() override {
        return json {
            { "idle", n_idle_slots },
            { "processing", n_processing_slots },
            { "deferred", n_tasks_deferred },
            { "t_start", t_start },

            { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total },
            { "t_tokens_generation_total", t_tokens_generation_total },
            { "n_tokens_predicted_total", n_tokens_predicted_total },
            { "t_prompt_processing_total", t_prompt_processing_total },

            { "n_prompt_tokens_processed", n_prompt_tokens_processed },
            { "t_prompt_processing", t_prompt_processing },
            { "n_tokens_predicted", n_tokens_predicted },
            { "t_tokens_generation", t_tokens_generation },

            { "n_decode_total", n_decode_total },
            { "n_busy_slots_total", n_busy_slots_total },

            { "kv_cache_tokens_count", kv_cache_tokens_count },
            { "kv_cache_used_cells", kv_cache_used_cells },

            { "slots", slots_data },
        };
    }
};

struct server_task_result_slot_save_load : server_task_result {
    std::string filename;
    bool is_save; // true = save, false = load

    size_t n_tokens;
    size_t n_bytes;
    double t_ms;

    virtual json to_json() override {
        if (is_save) {
            return json {
                { "id_slot", id_slot },
                { "filename", filename },
                { "n_saved", n_tokens },
                { "n_written", n_bytes },
                { "timings", {
                    { "save_ms", t_ms }
                }},
            };
        } else {
            return json {
                { "id_slot", id_slot },
                { "filename", filename },
                { "n_restored", n_tokens },
                { "n_read", n_bytes },
                { "timings", {
                    { "restore_ms", t_ms }
                }},
            };
        }
    }
};

struct server_task_result_slot_erase : server_task_result {
    size_t n_erased;

    virtual json to_json() override {
        return json {
            { "id_slot", id_slot },
            { "n_erased", n_erased },
        };
    }
};

struct server_task_result_apply_lora : server_task_result {
    virtual json to_json() override {
        return json {{ "success", true }};
    }
};
@@ -3,7 +3,6 @@
#include "common.h"
#include "log.h"
#include "llama.h"
#include "server.hpp"

#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
@@ -476,31 +475,6 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
    return out;
}

// convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
    json out = json::array();

    for (const auto & prob : probs) {
        json probs_for_token = json::array();

        for (const auto & p : prob.probs) {
            const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
            probs_for_token.push_back(json {
                {"tok_str", tok_str},
                {"prob", p.prob},
            });
        }

        const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
        out.push_back(json {
            {"content", tok_str},
            {"probs", probs_for_token},
        });
    }

    return out;
}

static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
    const std::string str =
        std::string(event) + ": " +