server : refactoring (wip)

Georgi Gerganov 2024-03-05 11:16:43 +02:00
parent 6a87ac3a52
commit f4e6e7e61f
GPG key ID: BF970631944C16B7
2 changed files with 190 additions and 461 deletions

View file

@@ -1,13 +1,9 @@
-#include "common.h"
-#include "llama.h"
-#include "grammar-parser.h"
 #include "utils.hpp"
 #include "oai.hpp"
-#include "../llava/clip.h"
-#include "../llava/llava.h"
-#include "stb_image.h"
+#include "common.h"
+#include "llama.h"
+#include "grammar-parser.h"

 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -33,19 +29,6 @@
 using json = nlohmann::json;

-struct server_params {
-    std::string hostname = "127.0.0.1";
-    std::vector<std::string> api_keys;
-    std::string public_path = "examples/server/public";
-    std::string chat_template = "";
-    int32_t port = 8080;
-    int32_t read_timeout = 600;
-    int32_t write_timeout = 600;
-    bool slots_endpoint = true;
-    bool metrics_endpoint = false;
-    int n_threads_http = -1;
-};

 bool server_verbose = false;
 bool server_log_json = true;
@@ -80,16 +63,20 @@ struct slot_params {
     json input_suffix;
 };

-struct slot_image {
-    int32_t id;
-    bool request_encode_image = false;
-    float * image_embedding = nullptr;
-    int32_t image_tokens = 0;
-    clip_image_u8 * img_data;
-    std::string prefix_prompt; // before of this image
+struct server_params {
+    int32_t port = 8080;
+    int32_t read_timeout = 600;
+    int32_t write_timeout = 600;
+    int32_t n_threads_http = -1;
+
+    std::string hostname = "127.0.0.1";
+    std::string public_path = "examples/server/public";
+    std::string chat_template = "";
+
+    std::vector<std::string> api_keys;
+
+    bool slots_endpoint = true;
+    bool metrics_endpoint = false;
 };

 struct server_slot {
@@ -130,8 +117,8 @@ struct server_slot {
     bool stopped_limit = false;

     bool oaicompat = false;
-    std::string oaicompat_model;
+    std::string oaicompat_model;

     std::string stopping_word;

     // sampling
@@ -144,9 +131,6 @@ struct server_slot {
     int32_t n_past_se = 0; // self-extend

-    // multimodal
-    std::vector<slot_image> images;
-
     // stats
     size_t n_sent_text = 0; // number of sent text character
     size_t n_sent_token_probs = 0;
@@ -176,16 +160,6 @@ struct server_slot {
         n_past_se = 0;

         generated_token_probs.clear();
-
-        for (slot_image & img : images) {
-            free(img.image_embedding);
-            if (img.img_data) {
-                clip_image_u8_free(img.img_data);
-            }
-            img.prefix_prompt = "";
-        }
-
-        images.clear();
     }

     bool has_budget(gpt_params &global_params) {
@@ -221,16 +195,14 @@ struct server_slot {
     }

     void release() {
-        if (state == PROCESSING)
-        {
+        if (state == PROCESSING) {
             t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
             command = RELEASE;
         }
     }

     json get_formated_timings() {
-        return json
-        {
+        return json {
             {"prompt_n", n_prompt_tokens_processed},
             {"prompt_ms", t_prompt_processing},
             {"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed},
@@ -245,11 +217,14 @@ struct server_slot {
     void print_timings() const {
         char buffer[512];
         double t_token = t_prompt_processing / n_prompt_tokens_processed;
         double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
-        sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+        snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
                 t_prompt_processing, n_prompt_tokens_processed,
                 t_token, n_tokens_second);
         LOG_INFO(buffer, {
             {"slot_id", id},
             {"task_id", task_id},
@@ -261,9 +236,11 @@ struct server_slot {
         t_token = t_token_generation / n_decoded;
         n_tokens_second = 1e3 / t_token_generation * n_decoded;
-        sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
+        snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
                 t_token_generation, n_decoded,
                 t_token, n_tokens_second);
         LOG_INFO(buffer, {
             {"slot_id", id},
             {"task_id", task_id},
@@ -273,7 +250,8 @@ struct server_slot {
             {"n_tokens_second", n_tokens_second},
         });

-        sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
+        snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
         LOG_INFO(buffer, {
             {"slot_id", id},
             {"task_id", task_id},
@@ -315,18 +293,14 @@ struct server_metrics {
     }
 };

-struct llama_server_context
-{
+struct llama_server_context {
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;

-    clip_ctx *clp_ctx = nullptr;
-
     gpt_params params;

     llama_batch batch;

-    bool multimodal = false;
     bool clean_kv_cache = true;
     bool all_slots_are_idle = false;
     bool add_bos_token = true;
@@ -351,36 +325,20 @@ struct llama_server_context
     server_metrics metrics;

-    ~llama_server_context()
-    {
-        if (ctx)
-        {
+    ~llama_server_context() {
+        if (ctx) {
             llama_free(ctx);
             ctx = nullptr;
         }
-        if (model)
-        {
+
+        if (model) {
             llama_free_model(model);
             model = nullptr;
         }
     }

-    bool load_model(const gpt_params &params_)
-    {
+    bool load_model(const gpt_params & params_) {
         params = params_;
-        if (!params.mmproj.empty()) {
-            multimodal = true;
-            LOG_INFO("Multi Modal Mode Enabled", {});
-            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
-            if(clp_ctx == nullptr) {
-                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
-                return false;
-            }
-            if (params.n_ctx < 2048) { // request larger context for the image embedding
-                params.n_ctx = 2048;
-            }
-        }
+
         std::tie(model, ctx) = llama_init_from_gpt_params(params);
         if (model == nullptr)
@@ -389,17 +347,6 @@ struct llama_server_context
             return false;
         }

-        if (multimodal) {
-            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
-            const int n_embd_llm = llama_n_embd(model);
-            if (n_embd_clip != n_embd_llm) {
-                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
-                llama_free(ctx);
-                llama_free_model(model);
-                return false;
-            }
-        }
-
         n_ctx = llama_n_ctx(ctx);

         add_bos_token = llama_should_add_bos_token(model);
@@ -522,15 +469,12 @@ struct llama_server_context
         int64_t t_last = ggml_time_us();

         server_slot * last_used = nullptr;

-        for (server_slot & slot : slots)
-        {
-            if (slot.id == id && slot.available())
-            {
+        for (server_slot & slot : slots) {
+            if (slot.id == id && slot.available()) {
                 return &slot;
             }

-            if (slot.available() && slot.t_last_used < t_last)
-            {
+            if (slot.available() && slot.t_last_used < t_last) {
                 last_used = &slot;
                 t_last = slot.t_last_used;
             }
@@ -539,82 +483,82 @@ struct llama_server_context
         return last_used;
     }

-    bool launch_slot_with_data(server_slot* &slot, json data) {
+    bool launch_slot_with_data(server_slot & slot, json data) {
         slot_params default_params;
         llama_sampling_params default_sparams;

         if (data.count("__oaicompat") != 0) {
-            slot->oaicompat = true;
-            slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+            slot.oaicompat = true;
+            slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
         } else {
-            slot->oaicompat = false;
-            slot->oaicompat_model = "";
+            slot.oaicompat = false;
+            slot.oaicompat_model = "";
         }

-        slot->params.stream = json_value(data, "stream", false);
-        slot->params.cache_prompt = json_value(data, "cache_prompt", false);
-        slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
-        slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
-        slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
-        slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
-        slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
-        slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
-        slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
-        slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
-        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
-        slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
-        slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
-        slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
-        slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
-        slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
-        slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
-        slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
-        slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
-        slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
-        slot->params.seed = json_value(data, "seed", default_params.seed);
-        slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
-        slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
-        slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+        slot.params.stream = json_value(data, "stream", false);
+        slot.params.cache_prompt = json_value(data, "cache_prompt", false);
+        slot.params.n_predict = json_value(data, "n_predict", default_params.n_predict);
+        slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
+        slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
+        slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
+        slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
+        slot.sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
+        slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
+        slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
+        slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
+        slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
+        slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
+        slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
+        slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
+        slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
+        slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
+        slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
+        slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
+        slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
+        slot.params.seed = json_value(data, "seed", default_params.seed);
+        slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
+        slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
+        slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);

-        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+        if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
             // Might be better to reject the request with a 400 ?
             LOG_WARNING("Max tokens to predict exceeds server configuration", {
-                {"params.n_predict", slot->params.n_predict},
-                {"slot.n_predict", slot->n_predict},
+                {"params.n_predict", slot.params.n_predict},
+                {"slot.n_predict", slot.n_predict},
             });
-            slot->params.n_predict = slot->n_predict;
+            slot.params.n_predict = slot.n_predict;
         }

         // infill
         if (data.count("input_prefix") != 0)
         {
-            slot->params.input_prefix = data["input_prefix"];
+            slot.params.input_prefix = data["input_prefix"];
         }
         else
         {
-            slot->params.input_prefix = "";
+            slot.params.input_prefix = "";
         }

         if (data.count("input_suffix") != 0)
         {
-            slot->params.input_suffix = data["input_suffix"];
+            slot.params.input_suffix = data["input_suffix"];
         }
         else
         {
-            slot->params.input_suffix = "";
+            slot.params.input_suffix = "";
         }

         if (data.count("prompt") != 0)
         {
-            slot->prompt = data["prompt"];
+            slot.prompt = data["prompt"];
         }
         else
         {
-            slot->prompt = "";
+            slot.prompt = "";
         }

-        slot->sparams.penalty_prompt_tokens.clear();
-        slot->sparams.use_penalty_prompt_tokens = false;
+        slot.sparams.penalty_prompt_tokens.clear();
+        slot.sparams.use_penalty_prompt_tokens = false;

         const auto &penalty_prompt = data.find("penalty_prompt");
         if (penalty_prompt != data.end())
         {
@@ -622,17 +566,17 @@ struct llama_server_context
            {
                const auto penalty_prompt_string = penalty_prompt->get<std::string>();
                auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
-                slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
-                if (slot->params.n_predict > 0)
+                slot.sparams.penalty_prompt_tokens.swap(penalty_tokens);
+                if (slot.params.n_predict > 0)
                {
-                    slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
+                    slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict);
                }
-                slot->sparams.use_penalty_prompt_tokens = true;
+                slot.sparams.use_penalty_prompt_tokens = true;
            }
            else if (penalty_prompt->is_array())
            {
                const auto n_tokens = penalty_prompt->size();
-                slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
+                slot.sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot.params.n_predict));
                const int n_vocab = llama_n_vocab(model);
                for (const auto &penalty_token : *penalty_prompt)
                {
@@ -641,19 +585,19 @@ struct llama_server_context
                        const auto tok = penalty_token.get<llama_token>();
                        if (tok >= 0 && tok < n_vocab)
                        {
-                            slot->sparams.penalty_prompt_tokens.push_back(tok);
+                            slot.sparams.penalty_prompt_tokens.push_back(tok);
                        }
                    }
                }
-                slot->sparams.use_penalty_prompt_tokens = true;
+                slot.sparams.use_penalty_prompt_tokens = true;
            }
        }

-        slot->sparams.logit_bias.clear();
+        slot.sparams.logit_bias.clear();

        if (json_value(data, "ignore_eos", false))
        {
-            slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+            slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
        }

        const auto & logit_bias = data.find("logit_bias");
@@ -683,7 +627,7 @@ struct llama_server_context
                    llama_token tok = el[0].get<llama_token>();
                    if (tok >= 0 && tok < n_vocab)
                    {
-                        slot->sparams.logit_bias[tok] = bias;
+                        slot.sparams.logit_bias[tok] = bias;
                    }
                }
                else if (el[0].is_string())
@@ -691,14 +635,14 @@ struct llama_server_context
                    auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                    for (auto tok : toks)
                    {
-                        slot->sparams.logit_bias[tok] = bias;
+                        slot.sparams.logit_bias[tok] = bias;
                    }
                }
            }
        }

-        slot->params.antiprompt.clear();
+        slot.params.antiprompt.clear();

        const auto &stop = data.find("stop");
        if (stop != data.end() && stop->is_array())
@@ -707,7 +651,7 @@ struct llama_server_context
            {
                if (!word.empty())
                {
-                    slot->params.antiprompt.push_back(word);
+                    slot.params.antiprompt.push_back(word);
                }
            }
        }
@@ -723,99 +667,26 @@ struct llama_server_context
                    sampler_names.emplace_back(sampler_name);
                }
            }
-            slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+            slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
        }
        else
        {
-            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
+            slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
        }

-        if (multimodal)
-        {
-            const auto &images_data = data.find("image_data");
-            if (images_data != data.end() && images_data->is_array())
-            {
-                for (const auto &img : *images_data)
-                {
-                    const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
-                    slot_image img_sl;
-                    img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
-                    img_sl.img_data = clip_image_u8_init();
-                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
-                    {
-                        LOG_ERROR("failed to load image", {
-                            {"slot_id", slot->id},
-                            {"img_sl_id", img_sl.id}
-                        });
-                        return false;
-                    }
-                    LOG_VERBOSE("image loaded", {
-                        {"slot_id", slot->id},
-                        {"img_sl_id", img_sl.id}
-                    });
-                    img_sl.request_encode_image = true;
-                    slot->images.push_back(img_sl);
-                }
-                // process prompt
-                // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
-                if (slot->images.size() > 0 && !slot->prompt.is_array())
-                {
-                    std::string prompt = slot->prompt.get<std::string>();
-                    size_t pos = 0, begin_prefix = 0;
-                    std::string pattern = "[img-";
-                    while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
-                        size_t end_prefix = pos;
-                        pos += pattern.length();
-                        size_t end_pos = prompt.find(']', pos);
-                        if (end_pos != std::string::npos)
-                        {
-                            std::string image_id = prompt.substr(pos, end_pos - pos);
-                            try
-                            {
-                                int img_id = std::stoi(image_id);
-                                bool found = false;
-                                for (slot_image &img : slot->images)
-                                {
-                                    if (img.id == img_id) {
-                                        found = true;
-                                        img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
-                                        begin_prefix = end_pos + 1;
-                                        break;
-                                    }
-                                }
-                                if (!found) {
-                                    LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
-                                    slot->images.clear();
-                                    return false;
-                                }
-                            } catch (const std::invalid_argument& e) {
-                                LOG_TEE("Invalid image number id in prompt\n");
-                                slot->images.clear();
-                                return false;
-                            }
-                        }
-                    }
-                    slot->prompt = "";
-                    slot->params.input_suffix = prompt.substr(begin_prefix);
-                    slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
-                }
-            }
-        }
-
-        if (slot->ctx_sampling != nullptr)
-        {
-            llama_sampling_free(slot->ctx_sampling);
-        }
-        slot->ctx_sampling = llama_sampling_init(slot->sparams);
-        llama_set_rng_seed(ctx, slot->params.seed);
-        slot->command = LOAD_PROMPT;
+        if (slot.ctx_sampling != nullptr)
+        {
+            llama_sampling_free(slot.ctx_sampling);
+        }
+        slot.ctx_sampling = llama_sampling_init(slot.sparams);
+        llama_set_rng_seed(ctx, slot.params.seed);
+        slot.command = LOAD_PROMPT;

        all_slots_are_idle = false;

        LOG_INFO("slot is processing task", {
-            {"slot_id", slot->id},
-            {"task_id", slot->task_id},
+            {"slot_id", slot.id},
+            {"task_id", slot.task_id},
        });

        return true;
@@ -1038,27 +909,6 @@ struct llama_server_context
        return slot.has_next_token; // continue
    }

-    bool process_images(server_slot &slot) const
-    {
-        for (slot_image &img : slot.images)
-        {
-            if (!img.request_encode_image)
-            {
-                continue;
-            }
-
-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-                LOG_TEE("Error processing the given image");
-                return false;
-            }
-
-            img.request_encode_image = false;
-        }
-
-        return slot.images.size() > 0;
-    }
-
    void send_error(task_server& task, const std::string &error)
    {
        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
@@ -1074,11 +924,10 @@ struct llama_server_context
    json get_formated_generation(server_slot &slot)
    {
        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
-        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
-                                eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);

        std::vector<std::string> samplers_sequence;
-        for (const auto &sampler_type : slot.sparams.samplers_sequence)
-        {
+        for (const auto & sampler_type : slot.sparams.samplers_sequence) {
            samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
        }
@@ -1131,7 +980,7 @@ struct llama_server_context
            {"content", tkn.text_to_send},
            {"stop", false},
            {"slot_id", slot.id},
-            {"multimodal", multimodal}
+            {"multimodal", false}
        };

        if (slot.sparams.n_probs > 0)
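
Aside, not from the commit: the response payloads above, including the multimodal flag that is now hard-coded to false, are ordinary nlohmann::json objects built from nested initializer lists. A small self-contained sketch of that construction, with made-up values:

#include <iostream>

#include "json.hpp" // nlohmann/json single header, already vendored by the server

using json = nlohmann::json;

int main() {
    // Nested initializer lists build the same shape of object the server
    // streams back for a partial completion chunk.
    json res = {
        {"content", "hello"},
        {"stop", false},
        {"slot_id", 0},
        {"multimodal", false}
    };

    std::cout << res.dump(2) << std::endl; // pretty-print with 2-space indent
    return 0;
}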
@@ -1299,84 +1148,6 @@ struct llama_server_context
        }
    }

-    // for multiple images processing
-    bool ingest_images(server_slot &slot, int n_batch)
-    {
-        int image_idx = 0;
-
-        while (image_idx < (int) slot.images.size())
-        {
-            slot_image &img = slot.images[image_idx];
-
-            // process prefix prompt
-            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
-            {
-                const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-                llama_batch batch_view = {
-                    n_tokens,
-                    batch.token + i,
-                    nullptr,
-                    batch.pos + i,
-                    batch.n_seq_id + i,
-                    batch.seq_id + i,
-                    batch.logits + i,
-                    0, 0, 0, // unused
-                };
-                if (llama_decode(ctx, batch_view))
-                {
-                    LOG_TEE("%s : failed to eval\n", __func__);
-                    return false;
-                }
-            }
-
-            // process image with llm
-            for (int i = 0; i < img.image_tokens; i += n_batch)
-            {
-                int n_eval = img.image_tokens - i;
-                if (n_eval > n_batch)
-                {
-                    n_eval = n_batch;
-                }
-
-                const int n_embd = llama_n_embd(model);
-                llama_batch batch_img = {
-                    n_eval,
-                    nullptr,
-                    (img.image_embedding + i * n_embd),
-                    nullptr,
-                    nullptr,
-                    nullptr,
-                    nullptr,
-                    slot.n_past,
-                    1, 0
-                };
-                if (llama_decode(ctx, batch_img))
-                {
-                    LOG_TEE("%s : failed to eval image\n", __func__);
-                    return false;
-                }
-                slot.n_past += n_eval;
-            }
-            image_idx++;
-
-            llama_batch_clear(batch);
-
-            // append prefix of next image
-            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
-                slot.params.input_suffix : // no more images, then process suffix prompt
-                (json)(slot.images[image_idx].prefix_prompt);
-
-            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
-            for (int i = 0; i < (int) append_tokens.size(); ++i)
-            {
-                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
-                slot.n_past += 1;
-            }
-        }
-
-        return true;
-    }
-
    void request_cancel(int task_id)
    {
        task_server task;
@@ -1452,7 +1223,7 @@ struct llama_server_context
                    slot->task_id = task.id;
                    slot->multitask_id = task.multitask_id;

-                    if (!launch_slot_with_data(slot, task.data))
+                    if (!launch_slot_with_data(*slot, task.data))
                    {
                        // send error result
                        send_error(task, "internal_error");
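
Aside, not from the commit: launch_slot_with_data now takes the slot as server_slot & rather than server_slot* &, so the caller dereferences once (*slot) and the body drops every '->' access. A minimal, self-contained sketch with hypothetical names launch_old and launch_new showing the two parameter styles:

#include <string>

struct server_slot {
    std::string prompt;
};

// Old style: the slot arrives as a reference to a pointer, so every access
// needs '->' and the function could even reseat the caller's pointer.
static bool launch_old(server_slot *& slot, const std::string & data) {
    slot->prompt = data;
    return true;
}

// New style, matching launch_slot_with_data(*slot, task.data) above: the
// caller dereferences once and the body uses plain '.' access.
static bool launch_new(server_slot & slot, const std::string & data) {
    slot.prompt = data;
    return true;
}

int main() {
    server_slot s;
    server_slot * p = &s;

    launch_old(p, "hello");
    launch_new(*p, "world");
    return 0;
}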
@@ -1670,7 +1441,7 @@ struct llama_server_context
        {
            for (auto & slot : slots)
            {
-                const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
+                const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());

                // empty prompt passed -> release the slot and send empty response
                // note: infill mode allows empty prompt
@@ -1737,6 +1508,7 @@ struct llama_server_context
                        std::vector<llama_token> new_tokens(
                            prompt_tokens.begin(),
                            prompt_tokens.begin() + slot.params.n_keep);
+
                        new_tokens.insert(
                            new_tokens.end(),
                            prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
@@ -1762,6 +1534,7 @@ struct llama_server_context
                        slot.n_past = 0;
                        slot.n_past_se = 0;
                        slot.ga_i = 0;
+
                        slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
                    }
                    else
@@ -1828,13 +1601,14 @@ struct llama_server_context
                    }
                }

-                int p0 = (int) system_tokens.size() + slot.n_past;
-                llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
+                const int p0 = (int) system_tokens.size() + slot.n_past;
+
                LOG_INFO("kv cache rm [p0, end)", {
                    { "slot_id", slot.id },
                    { "task_id", slot.task_id },
                    { "p0", p0 }
                });
+
+                llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);

                LOG_VERBOSE("prompt ingested", {
                    {"n_past", slot.n_past},
@@ -1842,10 +1616,7 @@ struct llama_server_context
                    {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
                });

-                const bool has_images = process_images(slot);
-
-                // process the prefix of first image
-                std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
+                std::vector<llama_token> prefix_tokens = prompt_tokens;

                int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
@@ -1867,18 +1638,6 @@ struct llama_server_context
                    slot_npast++;
                }

-                if (has_images && !ingest_images(slot, n_batch))
-                {
-                    LOG_ERROR("failed processing images", {
-                        {"slot_id", slot.id},
-                        {"task_id", slot.task_id},
-                    });
-                    // FIXME @phymbert: to be properly tested
-                    // early returning without changing the slot state will block the slot for ever
-                    // no one at the moment is checking the return value
-                    return false;
-                }
-
                // extract the logits only for the last token
                if (batch.n_tokens > 0)
                {
@@ -1964,7 +1723,7 @@ struct llama_server_context
            for (auto & slot : slots)
            {
-                if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
+                if (slot.state != PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
                {
                    continue;
                }
@@ -2105,7 +1864,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf(" KV cache data type for K (default: f16)\n");
    printf(" -ctv TYPE, --cache-type-v TYPE\n");
    printf(" KV cache data type for V (default: f16)\n");
-    printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
    printf(" --log-format log output format: json or text (default: json)\n");
    printf(" --log-disable disables logging to a file.\n");
    printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
@@ -2563,15 +2321,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        else if (arg == "-ctv" || arg == "--cache-type-v") {
            params.cache_type_v = argv[++i];
        }
-        else if(arg == "--mmproj")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.mmproj = argv[i];
-        }
        else if (arg == "--log-format")
        {
            if (++i >= argc)
@@ -2693,7 +2442,7 @@ static json format_partial_response(
        {"content", content},
        {"stop", false},
        {"slot_id", slot->id},
-        {"multimodal", llama.multimodal }
+        {"multimodal", false},
    };

    if (slot->sparams.n_probs > 0)
@@ -3405,19 +3154,10 @@ int main(int argc, char **argv)
                prompt = "";
            }

-            json image_data;
-            if (body.count("image_data") != 0) {
-                image_data = body["image_data"];
-            }
-            else
-            {
-                image_data = "";
-            }
-
            // create and queue the task
            const int task_id = llama.queue_tasks.get_new_id();
            llama.queue_results.add_waiting_task_id(task_id);
-            llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
+            llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);

            // get the result
            task_result result = llama.queue_results.recv(task_id);
@@ -3487,18 +3227,6 @@ int main(int argc, char **argv)
        return res.set_content(root.dump(), "application/json; charset=utf-8");
    });

-    // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
-    // "Bus error: 10" - this is on macOS, it does not crash on Linux
-    //std::thread t2([&]()
-    /*{
-        bool running = true;
-        while (running)
-        {
-            running = llama.update_slots();
-        }
-    }*/
-    //);

    if (sparams.n_threads_http < 1) {
        // +2 threads for monitoring endpoints
        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
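
Worked example of this default, with hypothetical numbers not taken from the diff: with --parallel 4 on a machine reporting 8 hardware threads, n_threads_http becomes max(4 + 2, 8 - 1) = 7, so the HTTP pool keeps the two extra threads intended for the monitoring endpoints available even when every slot is busy.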

View file

@@ -1,5 +1,10 @@
 #pragma once

+#include "llama.h"
+#include "common.h"
+#include "json.hpp"
+
 #include <string>
 #include <vector>
 #include <set>
@@ -7,10 +12,6 @@
 #include <condition_variable>
 #include <unordered_map>

-#include "json.hpp"
-#include "../llava/clip.h"
-
 using json = nlohmann::json;

 extern bool server_verbose;