server : various fixes
ggml-ci
This commit is contained in:
parent
86a1934978
commit
4e218c7255
2 changed files with 38 additions and 43 deletions
|
@ -34,14 +34,6 @@ endforeach()
|
||||||
add_executable(${TARGET} ${TARGET_SRCS})
|
add_executable(${TARGET} ${TARGET_SRCS})
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
|
||||||
# clean up generated files in pre-build step
|
|
||||||
foreach(asset ${PUBLIC_ASSETS})
|
|
||||||
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
|
|
||||||
add_custom_command(TARGET ${TARGET} PRE_BUILD
|
|
||||||
COMMAND "${CMAKE_COMMAND}" -E remove -f "${output}"
|
|
||||||
)
|
|
||||||
endforeach()
|
|
||||||
|
|
||||||
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
||||||
if (LLAMA_SERVER_SSL)
|
if (LLAMA_SERVER_SSL)
|
||||||
|
|
|
@ -122,11 +122,6 @@ struct slot_params {
|
||||||
struct common_params_sampling sampling;
|
struct common_params_sampling sampling;
|
||||||
struct common_params_speculative speculative;
|
struct common_params_speculative speculative;
|
||||||
|
|
||||||
// params only used in to_json()
|
|
||||||
int32_t n_ctx;
|
|
||||||
uint32_t seed_cur;
|
|
||||||
bool can_speculative;
|
|
||||||
|
|
||||||
// OAI-compat fields
|
// OAI-compat fields
|
||||||
bool verbose = false;
|
bool verbose = false;
|
||||||
bool oaicompat = false;
|
bool oaicompat = false;
|
||||||
|
@ -134,7 +129,7 @@ struct slot_params {
|
||||||
std::string oaicompat_model;
|
std::string oaicompat_model;
|
||||||
std::string oaicompat_cmpl_id;
|
std::string oaicompat_cmpl_id;
|
||||||
|
|
||||||
json to_json() {
|
json to_json() const {
|
||||||
std::vector<std::string> samplers;
|
std::vector<std::string> samplers;
|
||||||
samplers.reserve(sampling.samplers.size());
|
samplers.reserve(sampling.samplers.size());
|
||||||
for (const auto & sampler : sampling.samplers) {
|
for (const auto & sampler : sampling.samplers) {
|
||||||
|
@ -142,7 +137,6 @@ struct slot_params {
|
||||||
}
|
}
|
||||||
|
|
||||||
return json {
|
return json {
|
||||||
{"n_ctx", n_ctx},
|
|
||||||
{"n_predict", n_predict}, // Server configured n_predict
|
{"n_predict", n_predict}, // Server configured n_predict
|
||||||
{"temperature", sampling.temp},
|
{"temperature", sampling.temp},
|
||||||
{"dynatemp_range", sampling.dynatemp_range},
|
{"dynatemp_range", sampling.dynatemp_range},
|
||||||
|
@ -177,7 +171,6 @@ struct slot_params {
|
||||||
{"min_keep", sampling.min_keep},
|
{"min_keep", sampling.min_keep},
|
||||||
{"grammar", sampling.grammar},
|
{"grammar", sampling.grammar},
|
||||||
{"samplers", samplers},
|
{"samplers", samplers},
|
||||||
{"speculative", can_speculative},
|
|
||||||
{"speculative.n_max", speculative.n_max},
|
{"speculative.n_max", speculative.n_max},
|
||||||
{"speculative.n_min", speculative.n_min},
|
{"speculative.n_min", speculative.n_min},
|
||||||
{"speculative.p_min", speculative.p_min},
|
{"speculative.p_min", speculative.p_min},
|
||||||
|
@ -483,12 +476,6 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||||
return std::vector<json>({initial_ret, second_ret});
|
return std::vector<json>({initial_ret, second_ret});
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Some idiosyncrasy in task processing logic makes several trailing calls
|
|
||||||
// with empty content, we ignore these at the callee site.
|
|
||||||
if (content.empty()) {
|
|
||||||
return std::vector<json>({json::object()});
|
|
||||||
}
|
|
||||||
|
|
||||||
choices = json::array({json{
|
choices = json::array({json{
|
||||||
{"finish_reason", nullptr},
|
{"finish_reason", nullptr},
|
||||||
{"index", 0},
|
{"index", 0},
|
||||||
|
@ -722,6 +709,7 @@ struct server_slot {
|
||||||
|
|
||||||
llama_batch batch_spec;
|
llama_batch batch_spec;
|
||||||
|
|
||||||
|
llama_context * ctx = nullptr;
|
||||||
llama_context * ctx_dft = nullptr;
|
llama_context * ctx_dft = nullptr;
|
||||||
|
|
||||||
common_speculative * spec = nullptr;
|
common_speculative * spec = nullptr;
|
||||||
|
@ -906,6 +894,27 @@ struct server_slot {
|
||||||
t_token_generation, n_decoded, t_gen, n_gen_second,
|
t_token_generation, n_decoded, t_gen, n_gen_second,
|
||||||
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
|
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
json to_json() const {
|
||||||
|
return json {
|
||||||
|
{"id", id},
|
||||||
|
{"id_task", id_task},
|
||||||
|
{"n_ctx", n_ctx},
|
||||||
|
{"speculative", can_speculate()},
|
||||||
|
{"is_processing", is_processing()},
|
||||||
|
{"params", params.to_json()},
|
||||||
|
{"prompt", common_detokenize(ctx, prompt_tokens)},
|
||||||
|
{"next_token",
|
||||||
|
{
|
||||||
|
{"has_next_token", has_next_token},
|
||||||
|
{"has_new_line", has_new_line},
|
||||||
|
{"n_remain", n_remaining},
|
||||||
|
{"n_decoded", n_decoded},
|
||||||
|
{"stopping_word", stopping_word},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct server_metrics {
|
struct server_metrics {
|
||||||
|
@ -1338,6 +1347,7 @@ struct server_context {
|
||||||
server_slot slot;
|
server_slot slot;
|
||||||
|
|
||||||
slot.id = i;
|
slot.id = i;
|
||||||
|
slot.ctx = ctx;
|
||||||
slot.n_ctx = n_ctx_slot;
|
slot.n_ctx = n_ctx_slot;
|
||||||
slot.n_predict = params_base.n_predict;
|
slot.n_predict = params_base.n_predict;
|
||||||
|
|
||||||
|
@ -1370,7 +1380,7 @@ struct server_context {
|
||||||
slots.push_back(slot);
|
slots.push_back(slot);
|
||||||
}
|
}
|
||||||
|
|
||||||
default_generation_settings_for_props = slots[0].params.to_json();
|
default_generation_settings_for_props = slots[0].to_json();
|
||||||
default_generation_settings_for_props["seed"] = -1;
|
default_generation_settings_for_props["seed"] = -1;
|
||||||
|
|
||||||
// the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens
|
// the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens
|
||||||
|
@ -1848,17 +1858,18 @@ struct server_context {
|
||||||
queue_results.send(std::move(res));
|
queue_results.send(std::move(res));
|
||||||
}
|
}
|
||||||
|
|
||||||
void send_partial_response(server_slot & slot, completion_token_output tkn) {
|
void send_partial_response(server_slot & slot, const completion_token_output & tkn) {
|
||||||
auto res = std::make_unique<server_task_result_cmpl_partial>();
|
auto res = std::make_unique<server_task_result_cmpl_partial>();
|
||||||
res->id = slot.id_task;
|
|
||||||
res->index = slot.index;
|
res->id = slot.id_task;
|
||||||
res->content = tkn.text_to_send;
|
res->index = slot.index;
|
||||||
|
res->content = tkn.text_to_send;
|
||||||
|
|
||||||
res->truncated = slot.truncated;
|
res->truncated = slot.truncated;
|
||||||
res->n_decoded = slot.n_decoded;
|
res->n_decoded = slot.n_decoded;
|
||||||
res->n_prompt_tokens = slot.n_prompt_tokens;
|
res->n_prompt_tokens = slot.n_prompt_tokens;
|
||||||
|
|
||||||
res->stop = slot.stop;
|
res->stop = slot.stop;
|
||||||
|
|
||||||
res->verbose = slot.params.verbose;
|
res->verbose = slot.params.verbose;
|
||||||
res->oaicompat = slot.params.oaicompat;
|
res->oaicompat = slot.params.oaicompat;
|
||||||
|
@ -1869,6 +1880,7 @@ struct server_context {
|
||||||
// populate res.probs_output
|
// populate res.probs_output
|
||||||
if (slot.params.sampling.n_probs > 0) {
|
if (slot.params.sampling.n_probs > 0) {
|
||||||
const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
|
const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
|
||||||
|
|
||||||
const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
|
const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
|
||||||
const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
|
const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
|
||||||
|
|
||||||
|
@ -1891,7 +1903,8 @@ struct server_context {
|
||||||
void send_final_response(server_slot & slot) {
|
void send_final_response(server_slot & slot) {
|
||||||
if (slot.params.stream) {
|
if (slot.params.stream) {
|
||||||
// if in stream mode, send the last partial response
|
// if in stream mode, send the last partial response
|
||||||
return send_partial_response(slot, {0, "", {}});
|
send_partial_response(slot, {0, "", {}});
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto res = std::make_unique<server_task_result_cmpl_final>();
|
auto res = std::make_unique<server_task_result_cmpl_final>();
|
||||||
|
@ -2012,6 +2025,7 @@ struct server_context {
|
||||||
std::vector<server_task> tasks;
|
std::vector<server_task> tasks;
|
||||||
auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) {
|
auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) {
|
||||||
SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size());
|
SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size());
|
||||||
|
|
||||||
server_task task;
|
server_task task;
|
||||||
task.id = queue_tasks.get_new_id();
|
task.id = queue_tasks.get_new_id();
|
||||||
task.inf_type = inf_type;
|
task.inf_type = inf_type;
|
||||||
|
@ -2205,18 +2219,7 @@ struct server_context {
|
||||||
int n_processing_slots = 0;
|
int n_processing_slots = 0;
|
||||||
|
|
||||||
for (server_slot & slot : slots) {
|
for (server_slot & slot : slots) {
|
||||||
json slot_data = slot.params.to_json();
|
json slot_data = slot.to_json();
|
||||||
slot_data["id"] = slot.id;
|
|
||||||
slot_data["id_task"] = slot.id_task;
|
|
||||||
slot_data["is_processing"] = slot.is_processing();
|
|
||||||
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
|
|
||||||
slot_data["next_token"] = {
|
|
||||||
{"has_next_token", slot.has_next_token},
|
|
||||||
{"has_new_line", slot.has_new_line},
|
|
||||||
{"n_remain", slot.n_remaining},
|
|
||||||
{"n_decoded", slot.n_decoded},
|
|
||||||
{"stopping_word", slot.stopping_word},
|
|
||||||
};
|
|
||||||
|
|
||||||
if (slot.is_processing()) {
|
if (slot.is_processing()) {
|
||||||
n_processing_slots++;
|
n_processing_slots++;
|
||||||
|
@ -3003,11 +3006,11 @@ int main(int argc, char ** argv) {
|
||||||
res.status = 200;
|
res.status = 200;
|
||||||
};
|
};
|
||||||
|
|
||||||
svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) {
|
svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) {
|
||||||
std::string message;
|
std::string message;
|
||||||
try {
|
try {
|
||||||
std::rethrow_exception(ep);
|
std::rethrow_exception(ep);
|
||||||
} catch (std::exception & e) {
|
} catch (const std::exception & e) {
|
||||||
message = e.what();
|
message = e.what();
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
message = "Unknown Exception";
|
message = "Unknown Exception";
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue