server : code style
This commit is contained in:
parent
b1b3ba886e
commit
f4800d54e7
1 changed file with 102 additions and 120 deletions
@@ -433,8 +433,7 @@ struct llama_server_context {
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
     }

-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
-    {
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
         // TODO: currently, we tokenize using special tokens by default
         // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
         // but it's better compared to completely ignoring ChatML and other chat templates
@@ -444,32 +443,26 @@ struct llama_server_context {
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;

-        if (json_prompt.is_array())
-        {
+        if (json_prompt.is_array()) {
             bool first = true;
-            for (const auto& p : json_prompt)
-            {
-                if (p.is_string())
-                {
+            for (const auto & p : json_prompt) {
+                if (p.is_string()) {
                     auto s = p.template get<std::string>();

                     std::vector<llama_token> p;
-                    if (first)
-                    {
+                    if (first) {
                         p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                         first = false;
-                    }
-                    else
-                    {
+                    } else {
                         p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                     }

                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
-                }
-                else
-                {
-                    if (first)
-                    {
+                } else {
+                    if (first) {
                         first = false;
                     }

                     prompt_tokens.push_back(p.template get<llama_token>());
                 }
             }
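
Note: the tokenize() helper above accepts either a plain string or an array that mixes string fragments with raw token ids, and only the first string fragment receives the BOS token when add_bos is true. A minimal, standalone sketch of the two prompt shapes (not part of the diff), assuming nlohmann::json, which is the json alias used by server.cpp; the token ids are hypothetical:

#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    // plain string prompt: tokenized in a single call
    json p1 = "Hello, world";

    // mixed array prompt: strings are tokenized, integers are passed through
    // verbatim as token ids (the ids below are made up for illustration)
    json p2 = json::array({ "System:", 12345, 678, "User: hi" });

    std::cout << p1.is_string() << " " << p2.is_array() << "\n"; // prints: 1 1
    return 0;
}
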
@@ -499,7 +492,7 @@ struct llama_server_context {
         return last_used;
     }

-    bool launch_slot_with_data(server_slot & slot, json data) {
+    bool launch_slot_with_data(server_slot & slot, json data) const {
         slot_params default_params;
         llama_sampling_params default_sparams;

@@ -862,19 +855,6 @@ struct llama_server_context {
         return slot.has_next_token; // continue
     }

-    void send_error(const task_server & task, const std::string & error) {
-        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
-
-        task_result res;
-        res.id = task.id;
-        res.id_multi = task.id_multi;
-        res.stop = false;
-        res.error = true;
-        res.result_json = { { "content", error } };
-
-        queue_results.send(res);
-    }
-
     json get_formated_generation(const server_slot & slot) const {
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
@@ -921,6 +901,19 @@ struct llama_server_context {
         };
     }

+    void send_error(const task_server & task, const std::string & error) {
+        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
+
+        task_result res;
+        res.id = task.id;
+        res.id_multi = task.id_multi;
+        res.stop = false;
+        res.error = true;
+        res.result_json = { { "content", error } };
+
+        queue_results.send(res);
+    }
+
     void send_partial_response(server_slot & slot, completion_token_output tkn) {
         task_result res;
         res.id = slot.id_task;
@@ -1092,17 +1085,16 @@ struct llama_server_context {
         }
     }

-    void request_cancel(int id_task)
-    {
+    void request_cancel(int id_task) {
         task_server task;
         task.type = TASK_TYPE_CANCEL;
         task.id_target = id_task;

         queue_tasks.post(task);
     }

-    void split_multiprompt_task(int id_multi, task_server& multiprompt_task)
-    {
-        int prompt_count = multiprompt_task.data.at("prompt").size();
+    void split_multiprompt_task(int id_multi, const task_server & multiprompt_task) {
+        const int prompt_count = multiprompt_task.data.at("prompt").size();
         if (prompt_count <= 1) {
             send_error(multiprompt_task, "error while handling multiple prompts");
             return;
@@ -1110,8 +1102,7 @@ struct llama_server_context {

         // generate all the ID for subtask
         std::vector<int> subtask_ids(prompt_count);
-        for (int i = 0; i < prompt_count; i++)
-        {
+        for (int i = 0; i < prompt_count; i++) {
             subtask_ids[i] = queue_tasks.get_new_id();
         }

@@ -1119,8 +1110,7 @@ struct llama_server_context {
         queue_tasks.add_multitask(id_multi, subtask_ids);

         // add subtasks
-        for (int i = 0; i < prompt_count; i++)
-        {
+        for (int i = 0; i < prompt_count; i++) {
             json subtask_data = multiprompt_task.data;
             subtask_data["prompt"] = subtask_data["prompt"][i];

@@ -1129,7 +1119,7 @@ struct llama_server_context {
         }
     }

-    void process_single_task(task_server & task) {
+    void process_single_task(const task_server & task) {
         switch (task.type) {
             case TASK_TYPE_COMPLETION:
             {
@@ -1145,8 +1135,7 @@ struct llama_server_context {
                     system_prompt_process(task.data["system_prompt"]);

                     // reset cache_tokens for all slots
-                    for (server_slot & slot : slots)
-                    {
+                    for (server_slot & slot : slots) {
                         slot.cache_tokens.clear();
                         slot.n_past = 0;
                         slot.n_past_se = 0;
@@ -1183,10 +1172,11 @@ struct llama_server_context {
             case TASK_TYPE_METRICS:
             {
                 json slots_data = json::array();
+
                 int n_idle_slots = 0;
                 int n_processing_slots = 0;

-                for (server_slot & slot: slots) {
+                for (server_slot & slot : slots) {
                     json slot_data = get_formated_generation(slot);
                     slot_data["id"] = slot.id;
                     slot_data["id_task"] = slot.id_task;
@@ -1201,11 +1191,13 @@ struct llama_server_context {
                         {"stopped_limit", slot.stopped_limit},
                         {"stopping_word", slot.stopping_word},
                     };
+
                     if (slot_data["state"] == IDLE) {
                         n_idle_slots++;
                     } else {
                         n_processing_slots++;
                     }
+
                     slots_data.push_back(slot_data);
                 }
                 LOG_INFO("slot data", {
@@ -1264,7 +1256,7 @@ struct llama_server_context {
             result_jsons.push_back(subres.result_json);
             result.error = result.error && subres.error;
         }
-        result.result_json = json{ { "results", result_jsons } };
+        result.result_json = json { { "results", result_jsons } };

         queue_results.send(result);
     }
@@ -1316,9 +1308,9 @@ struct llama_server_context {
             }
         }

+        {
             LOG_VERBOSE("posting NEXT_RESPONSE", {});

-        {
             task_server task;
             task.type = TASK_TYPE_NEXT_RESPONSE;
             task.id_target = -1;
@@ -1410,22 +1402,21 @@ struct llama_server_context {
                 }

                 // need process the prompt
-                if (slot.state == IDLE && slot.command == LOAD_PROMPT)
-                {
+                if (slot.state == IDLE && slot.command == LOAD_PROMPT) {
                     slot.state = PROCESSING;
                     slot.command = NONE;

                     std::vector<llama_token> prompt_tokens;
                     slot.t_start_process_prompt = ggml_time_us();
                     slot.t_start_genereration = 0;

-                    if (slot.infill)
-                    {
+                    if (slot.infill) {
                         bool suff_rm_leading_spc = true;
-                        if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1)
-                        {
+                        if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
                             params.input_suffix.erase(0, 1);
                             suff_rm_leading_spc = false;
                         }

                         auto prefix_tokens = tokenize(slot.params.input_prefix, false);
                         auto suffix_tokens = tokenize(slot.params.input_suffix, false);

@@ -1440,23 +1431,19 @@ struct llama_server_context {
                         prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
                         prefix_tokens.push_back(llama_token_middle(model));
                         prompt_tokens = prefix_tokens;
-                    }
-                    else
-                    {
+                    } else {
                         prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
                     }

                     slot.n_prompt_tokens = prompt_tokens.size();

-                    if (slot.params.n_keep < 0)
-                    {
+                    if (slot.params.n_keep < 0) {
                         slot.params.n_keep = slot.n_prompt_tokens;
                     }
                     slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);

                     // if input prompt is too big, truncate it, if group attention self-extend is disabled
-                    if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
-                    {
+                    if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) {
                         const int n_left = slot.n_ctx - slot.params.n_keep;
                         const int n_block_size = n_left / 2;
                         const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
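
Note: the truncation above keeps the first slot.params.n_keep tokens and then drops whole blocks of half the remaining context from the middle of the prompt. A standalone sketch of the same arithmetic with made-up numbers (not part of the diff); the tail reconstruction follows the surrounding code, which is not shown in full here:

#include <cstdio>

int main() {
    // hypothetical values: 4096-token context, 6000-token prompt, keep the first 256 tokens
    const int n_ctx           = 4096;
    const int n_prompt_tokens = 6000;
    const int n_keep          = 256;

    // same arithmetic as the hunk above
    const int n_left        = n_ctx - n_keep;                                           // 3840
    const int n_block_size  = n_left / 2;                                               // 1920
    const int erased_blocks = (n_prompt_tokens - n_keep - n_block_size) / n_block_size; // 1

    // the kept tail starts after the erased blocks; the truncated prompt fits the context
    const int tail_start = n_keep + erased_blocks*n_block_size;                         // 2176
    const int n_new      = n_keep + (n_prompt_tokens - tail_start);                     // 4080 < 4096

    printf("n_left=%d n_block_size=%d erased_blocks=%d n_new=%d\n",
           n_left, n_block_size, erased_blocks, n_new);
    return 0;
}
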
@@ -1476,6 +1463,7 @@ struct llama_server_context {
                             {"n_left", n_left},
                             {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
                         });
+
                         slot.truncated = true;
                         prompt_tokens = new_tokens;

@@ -1483,8 +1471,7 @@ struct llama_server_context {
                         GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                     }

-                    if (!slot.params.cache_prompt)
-                    {
+                    if (!slot.params.cache_prompt) {
                         llama_sampling_reset(slot.ctx_sampling);

                         slot.n_past = 0;
@@ -1492,12 +1479,9 @@ struct llama_server_context {
                         slot.ga_i = 0;

                         slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
-                    }
-                    else
-                    {
+                    } else {
                         // push the prompt into the sampling context (do not apply grammar)
-                        for (auto & token : prompt_tokens)
-                        {
+                        for (auto & token : prompt_tokens) {
                             llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
                         }

@@ -1505,21 +1489,18 @@ struct llama_server_context {

                         // the last token of the cache is not in the KV cache until the next call to llama_decode
                         // (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
-                        if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
-                        {
+                        if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size()) {
                             slot.n_past -= 1;
                         }

                         slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past;

-                        if (slot.ga_n != 1)
-                        {
+                        if (slot.ga_n != 1) {
                             int ga_i = 0;
                             int32_t ga_n = slot.ga_n;
                             int32_t ga_w = slot.ga_w;
                             int32_t slot_npast = 0;
-                            for (int k = 0; k < slot.n_past; ++k)
-                            {
+                            for (int k = 0; k < slot.n_past; ++k) {
                                 while (slot_npast >= ga_i + ga_w) {
                                     const int bd = (ga_w/ga_n)*(ga_n - 1);
                                     slot_npast -= bd;
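
Note: the loop above replays the group-attention self-extend mapping for the cached prefix: whenever a position reaches the end of the current window (ga_i + ga_w), it is pulled back by bd = (ga_w/ga_n)*(ga_n - 1) and the window start advances by ga_w/ga_n. A standalone sketch with made-up parameters (not part of the diff), only to show the arithmetic:

#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical self-extend parameters: group size 4, window 512
    const int32_t ga_n   = 4;
    const int32_t ga_w   = 512;
    const int     n_past = 1500;   // number of cached tokens to replay (made up)

    int32_t ga_i       = 0;
    int32_t slot_npast = 0;

    for (int k = 0; k < n_past; ++k) {
        // same shift as in the hunk above
        while (slot_npast >= ga_i + ga_w) {
            const int bd = (ga_w/ga_n)*(ga_n - 1); // 384
            slot_npast -= bd;
            ga_i += ga_w/ga_n;                     // window start advances by 128
        }
        slot_npast++;
    }

    printf("ga_i=%d slot_npast=%d\n", ga_i, slot_npast);
    return 0;
}
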
@@ -1543,8 +1524,7 @@ struct llama_server_context {

                     slot.cache_tokens = prompt_tokens;

-                    if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
-                    {
+                    if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
                         // we have to evaluate at least 1 token to generate logits.
                         LOG_INFO("we have to evaluate at least 1 token to generate logits", {
                             { "id_slot", slot.id },
@@ -1580,23 +1560,22 @@ struct llama_server_context {
                     int32_t ga_n = slot.ga_n;
                     int32_t ga_w = slot.ga_w;

-                    for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
-                    {
-                        if (slot.ga_n != 1)
-                        {
+                    for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past) {
+                        if (slot.ga_n != 1) {
                             while (slot_npast >= ga_i + ga_w) {
                                 const int bd = (ga_w/ga_n)*(ga_n - 1);
                                 slot_npast -= bd;
                                 ga_i += ga_w/ga_n;
                             }
                         }
+
                         llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
+
                         slot_npast++;
                     }

                     // extract the logits only for the last token
-                    if (batch.n_tokens > 0)
-                    {
+                    if (batch.n_tokens > 0) {
                         batch.logits[batch.n_tokens - 1] = true;
                     }

@@ -1728,12 +1707,12 @@ struct llama_server_context {

     json model_meta() const {
         return json {
-            {"vocab_type", llama_vocab_type(model)},
-            {"n_vocab", llama_n_vocab(model)},
-            {"n_ctx_train", llama_n_ctx_train(model)},
-            {"n_embd", llama_n_embd(model)},
+            {"vocab_type", llama_vocab_type (model)},
+            {"n_vocab", llama_n_vocab (model)},
+            {"n_ctx_train", llama_n_ctx_train (model)},
+            {"n_embd", llama_n_embd (model)},
             {"n_params", llama_model_n_params(model)},
-            {"size", llama_model_size(model)},
+            {"size", llama_model_size (model)},
         };
     }
 };
@@ -1756,17 +1735,14 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
     printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
     printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
-    printf(" --pooling {none,mean,cls}\n");
-    printf(" pooling type for embeddings, use model default if unspecified\n");
+    printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
     printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
     printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
-    if (llama_supports_mlock())
-    {
+    if (llama_supports_mlock()) {
         printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
     }
-    if (llama_supports_mmap())
-    {
+    if (llama_supports_mmap()) {
         printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
     printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
@@ -2302,8 +2278,7 @@ int main(int argc, char ** argv) {

    server_params_parse(argc, argv, sparams, params, llama);

-    if (params.model_alias == "unknown")
-    {
+    if (params.model_alias == "unknown") {
        params.model_alias = params.model;
    }

@@ -2643,6 +2618,7 @@ int main(int argc, char ** argv) {
            json data = json::parse(req.body);

            const int id_task = llama.queue_tasks.get_new_id();
+
            llama.queue_results.add_waiting_task_id(id_task);
            llama.request_completion(id_task, -1, data, false, false);

@@ -2741,6 +2717,7 @@ int main(int argc, char ** argv) {
            json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template);

            const int id_task = llama.queue_tasks.get_new_id();
+
            llama.queue_results.add_waiting_task_id(id_task);
            llama.request_completion(id_task, -1, data, false, false);

@@ -2821,8 +2798,10 @@ int main(int argc, char ** argv) {
            json data = json::parse(req.body);

            const int id_task = llama.queue_tasks.get_new_id();
+
            llama.queue_results.add_waiting_task_id(id_task);
            llama.request_completion(id_task, -1, data, true, false);
+
            if (!json_value(data, "stream", false)) {
                task_result result = llama.queue_results.recv(id_task);
                if (!result.error && result.stop) {
@@ -2915,6 +2894,7 @@ int main(int argc, char ** argv) {

            // create and queue the task
            const int id_task = llama.queue_tasks.get_new_id();
+
            llama.queue_results.add_waiting_task_id(id_task);
            llama.request_completion(id_task, -1, { {"prompt", prompt}, { "n_predict", 0} }, false, true);

@@ -2939,6 +2919,7 @@ int main(int argc, char ** argv) {
            int i = 0;
            for (const json & elem : prompt) {
                const int id_task = llama.queue_tasks.get_new_id();
+
                llama.queue_results.add_waiting_task_id(id_task);
                llama.request_completion(id_task, -1, { {"prompt", elem}, { "n_predict", 0} }, false, true);

@@ -2965,6 +2946,7 @@ int main(int argc, char ** argv) {

            // create and queue the task
            const int id_task = llama.queue_tasks.get_new_id();
+
            llama.queue_results.add_waiting_task_id(id_task);
            llama.request_completion(id_task, -1, { {"prompt", prompt}, { "n_predict", 0}}, false, true);
