server : (refactoring) reduce usage of json internally
parent cc98896db8
commit b7d38eef0c
3 changed files with 245 additions and 242 deletions
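The refactor replaces the ad-hoc json blobs that used to travel through the task queue (server_task_result::data) with typed result structs (server_task_result_cmpl_partial, server_task_result_embd, server_task_result_error, and so on), so JSON is only built where a response is actually written out. The following is a minimal sketch of that pattern, not code from this commit: the struct, its fields and the to_json helper are illustrative, and it only assumes nlohmann's json.hpp, which the server already vendors.

// Illustrative sketch only (not from the commit): carry results as typed
// structs and convert to JSON once, at the HTTP boundary.
// Assumes nlohmann/json ("json.hpp") is available, as in the server sources.
#include <cstdio>
#include <string>

#include "json.hpp"

using json = nlohmann::ordered_json;

// typed result that worker code fills in; no JSON is built internally
struct cmpl_partial_result {
    int         id        = -1;
    int         id_slot   = -1;
    std::string content;
    int32_t     n_decoded = 0;
};

// single conversion point: only the response-writing layer produces JSON
static json to_json(const cmpl_partial_result & res) {
    return json {
        {"id",        res.id},
        {"id_slot",   res.id_slot},
        {"content",   res.content},
        {"n_decoded", res.n_decoded},
    };
}

int main() {
    cmpl_partial_result res;
    res.id        = 7;
    res.id_slot   = 0;
    res.content   = "hello";
    res.n_decoded = 1;

    // everything up to this point manipulated plain C++ fields;
    // serialization happens only when the response is emitted
    printf("%s\n", to_json(res).dump(2).c_str());
    return 0;
}

The practical gain of the typed form is that field names and types are checked by the compiler instead of at the point where a json key happens to be read.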
examples/server/server.cpp
@@ -1,4 +1,5 @@
 #include "utils.hpp"
+#include "server.hpp"

 #include "arg.h"
 #include "common.h"
@@ -32,90 +33,6 @@

 using json = nlohmann::ordered_json;

-enum stop_type {
-    STOP_TYPE_FULL,
-    STOP_TYPE_PARTIAL,
-};
-
-// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
-enum slot_state {
-    SLOT_STATE_IDLE,
-    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
-    SLOT_STATE_PROCESSING_PROMPT,
-    SLOT_STATE_DONE_PROMPT,
-    SLOT_STATE_GENERATING,
-};
-
-enum server_state {
-    SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
-    SERVER_STATE_READY, // Server is ready and model is loaded
-};
-
-enum server_task_type {
-    SERVER_TASK_TYPE_INFERENCE,
-    SERVER_TASK_TYPE_CANCEL,
-    SERVER_TASK_TYPE_NEXT_RESPONSE,
-    SERVER_TASK_TYPE_METRICS,
-    SERVER_TASK_TYPE_SLOT_SAVE,
-    SERVER_TASK_TYPE_SLOT_RESTORE,
-    SERVER_TASK_TYPE_SLOT_ERASE,
-    SERVER_TASK_TYPE_SET_LORA,
-};
-
-enum server_task_inf_type {
-    SERVER_TASK_INF_TYPE_COMPLETION,
-    SERVER_TASK_INF_TYPE_EMBEDDING,
-    SERVER_TASK_INF_TYPE_RERANK,
-    SERVER_TASK_INF_TYPE_INFILL,
-};
-
-struct server_task {
-    int id = -1; // to be filled by server_queue
-    int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
-
-    llama_tokens prompt_tokens;
-    server_task_type type;
-    json data;
-
-    server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
-
-    // utility function
-    static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
-        std::unordered_set<int> ids(tasks.size());
-        for (size_t i = 0; i < tasks.size(); i++) {
-            ids.insert(tasks[i].id);
-        }
-        return ids;
-    }
-};
-
-struct server_task_result {
-    int id = -1;
-
-    json data;
-
-    bool stop;
-    bool error;
-};
-
-struct slot_params {
-    bool stream = true;
-    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
-
-    int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
-    int32_t n_predict = -1; // new tokens to predict
-    int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters
-
-    int64_t t_max_prompt_ms = -1; // TODO: implement
-    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
-
-    std::vector<std::string> antiprompt;
-
-    struct common_params_sampling sampling;
-    struct common_params_speculative speculative;
-};
-
 struct server_slot {
     int id;
     int id_task = -1;
@@ -166,8 +83,6 @@ struct server_slot {
     bool stopped_word = false;
     bool stopped_limit = false;

-    bool timings_per_token = false;
-
     bool oaicompat = false;

     std::string oaicompat_model;
@@ -255,37 +170,39 @@ struct server_slot {
         }
     }

-    json get_formated_timings() const {
-        return json {
-            {"prompt_n", n_prompt_tokens_processed},
-            {"prompt_ms", t_prompt_processing},
-            {"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed},
-            {"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed},
-
-            {"predicted_n", n_decoded},
-            {"predicted_ms", t_token_generation},
-            {"predicted_per_token_ms", t_token_generation / n_decoded},
-            {"predicted_per_second", 1e3 / t_token_generation * n_decoded},
-        };
+    result_timings get_timings() const {
+        result_timings timings;
+        timings.prompt_n = n_prompt_tokens_processed;
+        timings.prompt_ms = t_prompt_processing;
+        timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed;
+        timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
+
+        timings.predicted_n = n_decoded;
+        timings.predicted_ms = t_token_generation;
+        timings.predicted_per_token_ms = t_token_generation / n_decoded;
+        timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
+
+        return timings;
     }

-    size_t find_stopping_strings(const std::string & text, const size_t last_token_size, const stop_type type) {
+    size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) {
         size_t stop_pos = std::string::npos;

         for (const std::string & word : params.antiprompt) {
             size_t pos;

-            if (type == STOP_TYPE_FULL) {
+            if (is_full_stop) {
                 const size_t tmp = word.size() + last_token_size;
                 const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;

                 pos = text.find(word, from_pos);
             } else {
+                // otherwise, partial stop
                 pos = find_partial_stop_string(word, text);
             }

             if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
-                if (type == STOP_TYPE_FULL) {
+                if (is_full_stop) {
                     stopped_word = true;
                     stopping_word = word;
                     has_next_token = false;
@@ -1108,14 +1025,14 @@ struct server_context {
             const std::string str_test = slot.generated_text.substr(pos);
             bool send_text = true;

-            size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
+            size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true);
             if (stop_pos != std::string::npos) {
                 slot.generated_text.erase(
                     slot.generated_text.begin() + pos + stop_pos,
                     slot.generated_text.end());
                 pos = std::min(slot.n_sent_text, slot.generated_text.size());
             } else if (slot.has_next_token) {
-                stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
+                stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false);
                 send_text = stop_pos == std::string::npos;
             }

@@ -1229,60 +1146,6 @@ struct server_context {
         return slot.has_next_token; // continue
     }

-    json get_formated_generation(const server_slot & slot) const {
-        std::vector<std::string> samplers;
-        samplers.reserve(slot.params.sampling.samplers.size());
-        for (const auto & sampler : slot.params.sampling.samplers) {
-            samplers.emplace_back(common_sampler_type_to_str(sampler));
-        }
-
-        return json {
-            {"n_ctx", slot.n_ctx},
-            {"n_predict", slot.n_predict}, // Server configured n_predict
-            {"model", params_base.model_alias},
-            {"seed", slot.params.sampling.seed},
-            {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
-            {"temperature", slot.params.sampling.temp},
-            {"dynatemp_range", slot.params.sampling.dynatemp_range},
-            {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent},
-            {"top_k", slot.params.sampling.top_k},
-            {"top_p", slot.params.sampling.top_p},
-            {"min_p", slot.params.sampling.min_p},
-            {"xtc_probability", slot.params.sampling.xtc_probability},
-            {"xtc_threshold", slot.params.sampling.xtc_threshold},
-            {"typical_p", slot.params.sampling.typ_p},
-            {"repeat_last_n", slot.params.sampling.penalty_last_n},
-            {"repeat_penalty", slot.params.sampling.penalty_repeat},
-            {"presence_penalty", slot.params.sampling.penalty_present},
-            {"frequency_penalty", slot.params.sampling.penalty_freq},
-            {"dry_multiplier", slot.params.sampling.dry_multiplier},
-            {"dry_base", slot.params.sampling.dry_base},
-            {"dry_allowed_length", slot.params.sampling.dry_allowed_length},
-            {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n},
-            {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers},
-            {"mirostat", slot.params.sampling.mirostat},
-            {"mirostat_tau", slot.params.sampling.mirostat_tau},
-            {"mirostat_eta", slot.params.sampling.mirostat_eta},
-            {"penalize_nl", slot.params.sampling.penalize_nl},
-            {"stop", slot.params.antiprompt},
-            {"max_tokens", slot.params.n_predict}, // User configured n_predict
-            {"n_keep", slot.params.n_keep},
-            {"n_discard", slot.params.n_discard},
-            {"ignore_eos", slot.params.sampling.ignore_eos},
-            {"stream", slot.params.stream},
-          //{"logit_bias", slot.params.sampling.logit_bias},
-            {"n_probs", slot.params.sampling.n_probs},
-            {"min_keep", slot.params.sampling.min_keep},
-            {"grammar", slot.params.sampling.grammar},
-            {"samplers", samplers},
-            {"speculative", slot.can_speculate()},
-            {"speculative.n_max", slot.params.speculative.n_max},
-            {"speculative.n_min", slot.params.speculative.n_min},
-            {"speculative.p_min", slot.params.speculative.p_min},
-            {"timings_per_token", slot.timings_per_token},
-        };
-    }
-
     void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
         send_error(task.id, error, type);
     }
@@ -1294,27 +1157,18 @@ struct server_context {
     void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
         SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str());

-        server_task_result res;
+        server_task_result_error res;
         res.id = id_task;
-        res.stop = false;
-        res.error = true;
-        res.data = format_error_response(error, type);
+        res.err_type = type;
+        res.err_msg = error;

         queue_results.send(res);
     }

     void send_partial_response(server_slot & slot, completion_token_output tkn) {
-        server_task_result res;
+        server_task_result_cmpl_partial res;
         res.id = slot.id_task;
-        res.error = false;
-        res.stop = false;
-        res.data = json {
-            {"content", tkn.text_to_send},
-            {"stop", false},
-            {"id_slot", slot.id},
-            {"multimodal", false},
-            {"index", slot.index},
-        };
+        res.content = tkn.text_to_send;

         if (slot.params.sampling.n_probs > 0) {
             const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
@@ -1323,30 +1177,35 @@ struct server_context {

             std::vector<completion_token_output> probs_output;
             if (probs_pos < probs_stop_pos) {
-                probs_output = std::vector<completion_token_output>(
+                res.probs_output = std::vector<completion_token_output>(
                     slot.generated_token_probs.begin() + probs_pos,
                     slot.generated_token_probs.begin() + probs_stop_pos);
             }
-            slot.n_sent_token_probs = probs_stop_pos;
-
-            res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
         }

-        if (slot.oaicompat) {
-            res.data["oaicompat_token_ctr"] = slot.n_decoded;
-            res.data["model"] = slot.oaicompat_model;
-        }
-
-        if (slot.timings_per_token) {
-            res.data["timings"] = slot.get_formated_timings();
+        if (slot.params.timings_per_token) {
+            res.timings = slot.get_timings();
         }

         queue_results.send(res);
     }

     void send_final_response(const server_slot & slot) {
-        server_task_result res;
+        server_task_result_cmpl_final res;
         res.id = slot.id_task;
+        res.id_slot = slot.id;
+        res.content = slot.generated_text;
+
+        res.n_decoded = slot.n_decoded;
+        res.n_prompt_tokens = slot.n_prompt_tokens;
+        res.has_new_line = slot.has_new_line;
+        res.n_tokens_cached = slot.n_past;
+        res.content = slot.generated_text;
+
+        res.params = slot.params; // copy the parameters

         res.error = false;
         res.stop = true;
         res.data = json {
@@ -1370,36 +1229,27 @@ struct server_context {
         };

         if (slot.params.sampling.n_probs > 0) {
-            std::vector<completion_token_output> probs;
             if (!slot.params.stream && slot.stopped_word) {
                 const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);

                 size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
-                probs = std::vector<completion_token_output>(
+                res.probs_output = std::vector<completion_token_output>(
                     slot.generated_token_probs.begin(),
                     slot.generated_token_probs.end() - safe_offset);
             } else {
-                probs = std::vector<completion_token_output>(
+                res.probs_output = std::vector<completion_token_output>(
                     slot.generated_token_probs.begin(),
                     slot.generated_token_probs.end());
             }
-
-            res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs);
-        }
-
-        if (slot.oaicompat) {
-            res.data["oaicompat_token_ctr"] = slot.n_decoded;
-            res.data["model"] = slot.oaicompat_model;
         }

         queue_results.send(res);
     }

     void send_embedding(const server_slot & slot, const llama_batch & batch) {
-        server_task_result res;
+        server_task_result_embd res;
         res.id = slot.id_task;
-        res.error = false;
-        res.stop = true;
+        res.index = slot.index;

         const int n_embd = llama_n_embd(model);

@@ -1418,20 +1268,12 @@ struct server_context {
            if (embd == NULL) {
                SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);

-                res.data = json {
-                    {"embedding", std::vector<float>(n_embd, 0.0f)},
-                    {"index", slot.index},
-                };
+                res.embedding = std::vector<float>(n_embd, 0.0f);

                continue;
            }

            common_embd_normalize(embd, embd_res.data(), n_embd);
-            res.data = json {
-                {"embedding", embd_res},
-                {"index", slot.index},
-            };
+            res.embedding = embd_res;
        }

        SLT_DBG(slot, "%s", "sending embeddings\n");
@@ -1440,10 +1282,9 @@ struct server_context {
    }

    void send_rerank(const server_slot & slot, const llama_batch & batch) {
-        server_task_result res;
+        server_task_result_rerank res;
        res.id = slot.id_task;
-        res.error = false;
-        res.stop = true;
+        res.index = slot.index;

        for (int i = 0; i < batch.n_tokens; ++i) {
            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
@@ -1458,21 +1299,14 @@ struct server_context {
            if (embd == NULL) {
                SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);

-                res.data = json {
-                    {"index", slot.index},
-                    {"score", -1e6},
-                };
+                res.score = -1e6;

                continue;
            }

-            res.data = json {
-                {"index", slot.index},
-                {"score", embd[0]},
-            };
+            res.score = embd[0];
        }

-        SLT_DBG(slot, "sending rerank result, res = '%s'\n", res.data.dump().c_str());
+        SLT_DBG(slot, "sending rerank result, res.score = %f\n", res.score);

        queue_results.send(res);
    }
examples/server/server.hpp (new file, 191 lines)
@@ -0,0 +1,191 @@
+#pragma once
+
+#include "common.h"
+#include "llama.h"
+#include "sampling.h"
+#include "speculative.h"
+
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
+#include "json.hpp"
+
+#include <string>
+#include <memory>
+#include <unordered_set>
+
+using json = nlohmann::ordered_json;
+
+enum stop_type {
+    STOP_TYPE_NONE,
+    STOP_TYPE_EOS,
+    STOP_TYPE_WORD,
+    STOP_TYPE_LIMIT,
+};
+
+// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
+enum slot_state {
+    SLOT_STATE_IDLE,
+    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
+    SLOT_STATE_PROCESSING_PROMPT,
+    SLOT_STATE_DONE_PROMPT,
+    SLOT_STATE_GENERATING,
+};
+
+enum server_state {
+    SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY, // Server is ready and model is loaded
+};
+
+enum server_task_type {
+    SERVER_TASK_TYPE_INFERENCE,
+    SERVER_TASK_TYPE_CANCEL,
+    SERVER_TASK_TYPE_NEXT_RESPONSE,
+    SERVER_TASK_TYPE_METRICS,
+    SERVER_TASK_TYPE_SLOT_SAVE,
+    SERVER_TASK_TYPE_SLOT_RESTORE,
+    SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_SET_LORA,
+};
+
+enum server_task_inf_type {
+    SERVER_TASK_INF_TYPE_COMPLETION,
+    SERVER_TASK_INF_TYPE_EMBEDDING,
+    SERVER_TASK_INF_TYPE_RERANK,
+    SERVER_TASK_INF_TYPE_INFILL,
+};
+
+// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
+enum error_type {
+    ERROR_TYPE_INVALID_REQUEST,
+    ERROR_TYPE_AUTHENTICATION,
+    ERROR_TYPE_SERVER,
+    ERROR_TYPE_NOT_FOUND,
+    ERROR_TYPE_PERMISSION,
+    ERROR_TYPE_UNAVAILABLE, // custom error
+    ERROR_TYPE_NOT_SUPPORTED, // custom error
+};
+
+struct server_task {
+    int id = -1; // to be filled by server_queue
+    int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
+
+    llama_tokens prompt_tokens;
+    server_task_type type;
+
+    // TODO @ngxson : we should get rid of json type here
+    json data;
+
+    server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
+
+    // utility function
+    static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
+        std::unordered_set<int> ids(tasks.size());
+        for (size_t i = 0; i < tasks.size(); i++) {
+            ids.insert(tasks[i].id);
+        }
+        return ids;
+    }
+};
+
+struct result_timings {
+    int32_t prompt_n;
+    double prompt_ms;
+    double prompt_per_token_ms;
+    double prompt_per_second;
+
+    int32_t predicted_n;
+    double predicted_ms;
+    double predicted_per_token_ms;
+    double predicted_per_second;
+};
+
+enum result_type {
+    RESULT_TYPE_CMPL_FINAL,
+    RESULT_TYPE_CMPL_PARTIAL,
+    RESULT_TYPE_EMBD,
+    RESULT_TYPE_RERANK,
+    RESULT_TYPE_ERROR,
+    RESULT_TYPE_UNKNOWN, // will throw an error
+};
+
+struct server_task_result {
+    result_type type = RESULT_TYPE_UNKNOWN;
+    int id = -1;
+    int id_slot = -1;
+};
+
+struct server_task_result_cmpl_final : server_task_result {
+    result_type type = RESULT_TYPE_CMPL_FINAL;
+    int index = 0;
+    std::string content;
+    bool stream;
+    bool timings_per_token;
+    result_timings timings;
+
+    int32_t n_decoded;
+    int32_t n_prompt_tokens;
+    int32_t has_new_line;
+    int32_t stopping_word;
+    int32_t n_tokens_cached;
+    stop_type stop = STOP_TYPE_NONE;
+    std::vector<completion_token_output> probs_output;
+
+    slot_params params;
+};
+
+struct completion_token_output {
+    llama_token tok;
+    std::string text_to_send;
+    struct token_prob {
+        llama_token tok;
+        float prob;
+    };
+    std::vector<token_prob> probs;
+};
+
+struct server_task_result_cmpl_partial : server_task_result {
+    result_type type = RESULT_TYPE_CMPL_PARTIAL;
+    int index = 0;
+    std::string content;
+    stop_type stop = STOP_TYPE_NONE;
+    std::vector<completion_token_output> probs_output;
+    result_timings timings;
+};
+
+struct server_task_result_embd : server_task_result {
+    result_type type = RESULT_TYPE_EMBD;
+    int index = 0;
+    std::vector<float> embedding;
+};
+
+struct server_task_result_rerank : server_task_result {
+    result_type type = RESULT_TYPE_RERANK;
+    int index = 0;
+    float score;
+};
+
+struct server_task_result_error : server_task_result {
+    result_type type = RESULT_TYPE_ERROR;
+    int index = 0;
+    error_type err_type;
+    std::string err_msg;
+};
+
+struct slot_params {
+    bool stream = true;
+    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
+
+    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+    int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
+    int32_t n_predict = -1; // new tokens to predict
+    int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters
+
+    int64_t t_max_prompt_ms = -1; // TODO: implement
+    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
+
+    std::vector<std::string> antiprompt;
+    bool timings_per_token = false;
+
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
+};
examples/server/utils.hpp
@@ -3,6 +3,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "server.hpp"

 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -40,17 +41,6 @@ using json = nlohmann::ordered_json;
 #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)

-// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
-enum error_type {
-    ERROR_TYPE_INVALID_REQUEST,
-    ERROR_TYPE_AUTHENTICATION,
-    ERROR_TYPE_SERVER,
-    ERROR_TYPE_NOT_FOUND,
-    ERROR_TYPE_PERMISSION,
-    ERROR_TYPE_UNAVAILABLE, // custom error
-    ERROR_TYPE_NOT_SUPPORTED, // custom error
-};
-
 template <typename T>
 static T json_value(const json & body, const std::string & key, const T & default_value) {
     // Fallback null to default value
@@ -485,18 +475,6 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
     return out;
 }

-struct completion_token_output {
-    llama_token tok;
-    std::string text_to_send;
-
-    struct token_prob {
-        llama_token tok;
-        float prob;
-    };
-
-    std::vector<token_prob> probs;
-};
-
 // convert a vector of completion_token_output to json
 static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
     json out = json::array();
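The result structs introduced in server.hpp above form a tagged family: a plain server_task_result base carrying the task id plus a result_type tag, with one derived struct per kind of result. Below is a rough sketch of how a consumer could branch on such a tag; it is illustrative only — the commit re-declares the type field inside each derived struct, whereas this sketch sets the base tag from a constructor, and handle_result and the shortened struct names are hypothetical, not taken from the commit.

// Illustrative sketch only: dispatching on a result-type tag so each concrete
// result can be turned into a response without carrying json through the queue.
#include <cstdio>
#include <string>

enum result_type {
    RESULT_TYPE_CMPL_PARTIAL,
    RESULT_TYPE_ERROR,
    RESULT_TYPE_UNKNOWN,
};

struct task_result {
    result_type type = RESULT_TYPE_UNKNOWN;
    int id = -1;
};

struct task_result_cmpl_partial : task_result {
    std::string content;
    task_result_cmpl_partial() { type = RESULT_TYPE_CMPL_PARTIAL; }
};

struct task_result_error : task_result {
    std::string err_msg;
    task_result_error() { type = RESULT_TYPE_ERROR; }
};

// branch on the tag; each branch knows the concrete type it may downcast to
static void handle_result(const task_result & res) {
    switch (res.type) {
        case RESULT_TYPE_CMPL_PARTIAL: {
            const auto & r = static_cast<const task_result_cmpl_partial &>(res);
            printf("partial: id=%d content=%s\n", r.id, r.content.c_str());
        } break;
        case RESULT_TYPE_ERROR: {
            const auto & r = static_cast<const task_result_error &>(res);
            printf("error: id=%d msg=%s\n", r.id, r.err_msg.c_str());
        } break;
        default:
            printf("unknown result type\n");
    }
}

int main() {
    task_result_cmpl_partial part;
    part.id      = 3;
    part.content = "tok";

    task_result_error err;
    err.id      = 4;
    err.err_msg = "slot unavailable";

    handle_result(part);
    handle_result(err);
    return 0;
}

A dispatch of this shape would naturally sit where responses are serialized, so the queue between the slots and the HTTP layer never has to carry json itself.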