server : refactoring (wip)

Georgi Gerganov 2024-03-05 11:16:43 +02:00
parent 6a87ac3a52
commit f4e6e7e61f
No known key found for this signature in database
GPG key ID: BF970631944C16B7
2 changed files with 190 additions and 461 deletions

View file

@@ -1,13 +1,9 @@
#include "common.h"
#include "llama.h"
#include "grammar-parser.h"
#include "utils.hpp"
#include "oai.hpp"
#include "../llava/clip.h"
#include "../llava/llava.h"
#include "stb_image.h"
#include "common.h"
#include "llama.h"
#include "grammar-parser.h"
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
@@ -33,19 +29,6 @@
using json = nlohmann::json;
struct server_params {
std::string hostname = "127.0.0.1";
std::vector<std::string> api_keys;
std::string public_path = "examples/server/public";
std::string chat_template = "";
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
bool slots_endpoint = true;
bool metrics_endpoint = false;
int n_threads_http = -1;
};
bool server_verbose = false;
bool server_log_json = true;
@@ -80,16 +63,20 @@ struct slot_params {
json input_suffix;
};
struct slot_image {
int32_t id;
struct server_params {
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
int32_t n_threads_http = -1;
bool request_encode_image = false;
float * image_embedding = nullptr;
int32_t image_tokens = 0;
std::string hostname = "127.0.0.1";
std::string public_path = "examples/server/public";
std::string chat_template = "";
clip_image_u8 * img_data;
std::vector<std::string> api_keys;
std::string prefix_prompt; // before of this image
bool slots_endpoint = true;
bool metrics_endpoint = false;
};
struct server_slot {
@@ -130,8 +117,8 @@ struct server_slot {
bool stopped_limit = false;
bool oaicompat = false;
std::string oaicompat_model;
std::string oaicompat_model;
std::string stopping_word;
// sampling
@@ -144,9 +131,6 @@ struct server_slot {
int32_t n_past_se = 0; // self-extend
// multimodal
std::vector<slot_image> images;
// stats
size_t n_sent_text = 0; // number of sent text character
size_t n_sent_token_probs = 0;
@@ -176,16 +160,6 @@ struct server_slot {
n_past_se = 0;
generated_token_probs.clear();
for (slot_image & img : images) {
free(img.image_embedding);
if (img.img_data) {
clip_image_u8_free(img.img_data);
}
img.prefix_prompt = "";
}
images.clear();
}
bool has_budget(gpt_params &global_params) {
@@ -221,16 +195,14 @@ struct server_slot {
}
void release() {
if (state == PROCESSING)
{
if (state == PROCESSING) {
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
command = RELEASE;
}
}
json get_formated_timings() {
return json
{
return json {
{"prompt_n", n_prompt_tokens_processed},
{"prompt_ms", t_prompt_processing},
{"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed},
@@ -245,11 +217,14 @@ struct server_slot {
void print_timings() const {
char buffer[512];
double t_token = t_prompt_processing / n_prompt_tokens_processed;
double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
t_prompt_processing, n_prompt_tokens_processed,
t_token, n_tokens_second);
LOG_INFO(buffer, {
{"slot_id", id},
{"task_id", task_id},
@@ -261,9 +236,11 @@ struct server_slot {
t_token = t_token_generation / n_decoded;
n_tokens_second = 1e3 / t_token_generation * n_decoded;
sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
t_token_generation, n_decoded,
t_token, n_tokens_second);
LOG_INFO(buffer, {
{"slot_id", id},
{"task_id", task_id},
@@ -273,7 +250,8 @@ struct server_slot {
{"n_tokens_second", n_tokens_second},
});
sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
LOG_INFO(buffer, {
{"slot_id", id},
{"task_id", task_id},
@@ -315,18 +293,14 @@ struct server_metrics {
}
};
struct llama_server_context
{
struct llama_server_context {
llama_model * model = nullptr;
llama_context * ctx = nullptr;
clip_ctx *clp_ctx = nullptr;
gpt_params params;
llama_batch batch;
bool multimodal = false;
bool clean_kv_cache = true;
bool all_slots_are_idle = false;
bool add_bos_token = true;
@@ -351,36 +325,20 @@ struct llama_server_context
server_metrics metrics;
~llama_server_context()
{
if (ctx)
{
~llama_server_context() {
if (ctx) {
llama_free(ctx);
ctx = nullptr;
}
if (model)
{
if (model) {
llama_free_model(model);
model = nullptr;
}
}
bool load_model(const gpt_params &params_)
{
bool load_model(const gpt_params & params_) {
params = params_;
if (!params.mmproj.empty()) {
multimodal = true;
LOG_INFO("Multi Modal Mode Enabled", {});
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
if(clp_ctx == nullptr) {
LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
return false;
}
if (params.n_ctx < 2048) { // request larger context for the image embedding
params.n_ctx = 2048;
}
}
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr)
@@ -389,17 +347,6 @@ struct llama_server_context
return false;
}
if (multimodal) {
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
const int n_embd_llm = llama_n_embd(model);
if (n_embd_clip != n_embd_llm) {
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
llama_free(ctx);
llama_free_model(model);
return false;
}
}
n_ctx = llama_n_ctx(ctx);
add_bos_token = llama_should_add_bos_token(model);
@@ -522,15 +469,12 @@ struct llama_server_context
int64_t t_last = ggml_time_us();
server_slot * last_used = nullptr;
for (server_slot & slot : slots)
{
if (slot.id == id && slot.available())
{
for (server_slot & slot : slots) {
if (slot.id == id && slot.available()) {
return &slot;
}
if (slot.available() && slot.t_last_used < t_last)
{
if (slot.available() && slot.t_last_used < t_last) {
last_used = &slot;
t_last = slot.t_last_used;
}
@@ -539,82 +483,82 @@ struct llama_server_context
return last_used;
}
bool launch_slot_with_data(server_slot* &slot, json data) {
bool launch_slot_with_data(server_slot & slot, json data) {
slot_params default_params;
llama_sampling_params default_sparams;
if (data.count("__oaicompat") != 0) {
slot->oaicompat = true;
slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
slot.oaicompat = true;
slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
} else {
slot->oaicompat = false;
slot->oaicompat_model = "";
slot.oaicompat = false;
slot.oaicompat_model = "";
}
slot->params.stream = json_value(data, "stream", false);
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
slot->params.seed = json_value(data, "seed", default_params.seed);
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
slot.params.stream = json_value(data, "stream", false);
slot.params.cache_prompt = json_value(data, "cache_prompt", false);
slot.params.n_predict = json_value(data, "n_predict", default_params.n_predict);
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
slot.sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
slot.params.seed = json_value(data, "seed", default_params.seed);
slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
// Might be better to reject the request with a 400 ?
LOG_WARNING("Max tokens to predict exceeds server configuration", {
{"params.n_predict", slot->params.n_predict},
{"slot.n_predict", slot->n_predict},
{"params.n_predict", slot.params.n_predict},
{"slot.n_predict", slot.n_predict},
});
slot->params.n_predict = slot->n_predict;
slot.params.n_predict = slot.n_predict;
}
// infill
if (data.count("input_prefix") != 0)
{
slot->params.input_prefix = data["input_prefix"];
slot.params.input_prefix = data["input_prefix"];
}
else
{
slot->params.input_prefix = "";
slot.params.input_prefix = "";
}
if (data.count("input_suffix") != 0)
{
slot->params.input_suffix = data["input_suffix"];
slot.params.input_suffix = data["input_suffix"];
}
else
{
slot->params.input_suffix = "";
slot.params.input_suffix = "";
}
if (data.count("prompt") != 0)
{
slot->prompt = data["prompt"];
slot.prompt = data["prompt"];
}
else
{
slot->prompt = "";
slot.prompt = "";
}
slot->sparams.penalty_prompt_tokens.clear();
slot->sparams.use_penalty_prompt_tokens = false;
slot.sparams.penalty_prompt_tokens.clear();
slot.sparams.use_penalty_prompt_tokens = false;
const auto &penalty_prompt = data.find("penalty_prompt");
if (penalty_prompt != data.end())
{
@@ -622,17 +566,17 @@ struct llama_server_context
{
const auto penalty_prompt_string = penalty_prompt->get<std::string>();
auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
if (slot->params.n_predict > 0)
slot.sparams.penalty_prompt_tokens.swap(penalty_tokens);
if (slot.params.n_predict > 0)
{
slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict);
}
slot->sparams.use_penalty_prompt_tokens = true;
slot.sparams.use_penalty_prompt_tokens = true;
}
else if (penalty_prompt->is_array())
{
const auto n_tokens = penalty_prompt->size();
slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
slot.sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot.params.n_predict));
const int n_vocab = llama_n_vocab(model);
for (const auto &penalty_token : *penalty_prompt)
{
@@ -641,19 +585,19 @@ struct llama_server_context
const auto tok = penalty_token.get<llama_token>();
if (tok >= 0 && tok < n_vocab)
{
slot->sparams.penalty_prompt_tokens.push_back(tok);
slot.sparams.penalty_prompt_tokens.push_back(tok);
}
}
}
slot->sparams.use_penalty_prompt_tokens = true;
slot.sparams.use_penalty_prompt_tokens = true;
}
}
slot->sparams.logit_bias.clear();
slot.sparams.logit_bias.clear();
if (json_value(data, "ignore_eos", false))
{
slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
}
const auto & logit_bias = data.find("logit_bias");
@@ -683,7 +627,7 @@ struct llama_server_context
llama_token tok = el[0].get<llama_token>();
if (tok >= 0 && tok < n_vocab)
{
slot->sparams.logit_bias[tok] = bias;
slot.sparams.logit_bias[tok] = bias;
}
}
else if (el[0].is_string())
@@ -691,14 +635,14 @@ struct llama_server_context
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
for (auto tok : toks)
{
slot->sparams.logit_bias[tok] = bias;
slot.sparams.logit_bias[tok] = bias;
}
}
}
}
}
slot->params.antiprompt.clear();
slot.params.antiprompt.clear();
const auto &stop = data.find("stop");
if (stop != data.end() && stop->is_array())
@@ -707,7 +651,7 @@ struct llama_server_context
{
if (!word.empty())
{
slot->params.antiprompt.push_back(word);
slot.params.antiprompt.push_back(word);
}
}
}
@@ -723,99 +667,26 @@ struct llama_server_context
sampler_names.emplace_back(sampler_name);
}
}
slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
}
else
{
slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
}
if (multimodal)
if (slot.ctx_sampling != nullptr)
{
const auto &images_data = data.find("image_data");
if (images_data != data.end() && images_data->is_array())
{
for (const auto &img : *images_data)
{
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
slot_image img_sl;
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
img_sl.img_data = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
{
LOG_ERROR("failed to load image", {
{"slot_id", slot->id},
{"img_sl_id", img_sl.id}
});
return false;
llama_sampling_free(slot.ctx_sampling);
}
LOG_VERBOSE("image loaded", {
{"slot_id", slot->id},
{"img_sl_id", img_sl.id}
});
img_sl.request_encode_image = true;
slot->images.push_back(img_sl);
}
// process prompt
// example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
if (slot->images.size() > 0 && !slot->prompt.is_array())
{
std::string prompt = slot->prompt.get<std::string>();
size_t pos = 0, begin_prefix = 0;
std::string pattern = "[img-";
while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
size_t end_prefix = pos;
pos += pattern.length();
size_t end_pos = prompt.find(']', pos);
if (end_pos != std::string::npos)
{
std::string image_id = prompt.substr(pos, end_pos - pos);
try
{
int img_id = std::stoi(image_id);
bool found = false;
for (slot_image &img : slot->images)
{
if (img.id == img_id) {
found = true;
img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
begin_prefix = end_pos + 1;
break;
}
}
if (!found) {
LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
slot->images.clear();
return false;
}
} catch (const std::invalid_argument& e) {
LOG_TEE("Invalid image number id in prompt\n");
slot->images.clear();
return false;
}
}
}
slot->prompt = "";
slot->params.input_suffix = prompt.substr(begin_prefix);
slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
}
}
}
if (slot->ctx_sampling != nullptr)
{
llama_sampling_free(slot->ctx_sampling);
}
slot->ctx_sampling = llama_sampling_init(slot->sparams);
llama_set_rng_seed(ctx, slot->params.seed);
slot->command = LOAD_PROMPT;
slot.ctx_sampling = llama_sampling_init(slot.sparams);
llama_set_rng_seed(ctx, slot.params.seed);
slot.command = LOAD_PROMPT;
all_slots_are_idle = false;
LOG_INFO("slot is processing task", {
{"slot_id", slot->id},
{"task_id", slot->task_id},
{"slot_id", slot.id},
{"task_id", slot.task_id},
});
return true;
@@ -1038,27 +909,6 @@ struct llama_server_context
return slot.has_next_token; // continue
}
bool process_images(server_slot &slot) const
{
for (slot_image &img : slot.images)
{
if (!img.request_encode_image)
{
continue;
}
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
LOG_TEE("Error processing the given image");
return false;
}
img.request_encode_image = false;
}
return slot.images.size() > 0;
}
void send_error(task_server& task, const std::string &error)
{
LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
@@ -1074,11 +924,10 @@ struct llama_server_context
json get_formated_generation(server_slot &slot)
{
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
std::vector<std::string> samplers_sequence;
for (const auto &sampler_type : slot.sparams.samplers_sequence)
{
for (const auto & sampler_type : slot.sparams.samplers_sequence) {
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
}
@@ -1131,7 +980,7 @@ struct llama_server_context
{"content", tkn.text_to_send},
{"stop", false},
{"slot_id", slot.id},
{"multimodal", multimodal}
{"multimodal", false}
};
if (slot.sparams.n_probs > 0)
@@ -1299,84 +1148,6 @@ struct llama_server_context
}
}
// for multiple images processing
bool ingest_images(server_slot &slot, int n_batch)
{
int image_idx = 0;
while (image_idx < (int) slot.images.size())
{
slot_image &img = slot.images[image_idx];
// process prefix prompt
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
{
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
batch.pos + i,
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
if (llama_decode(ctx, batch_view))
{
LOG_TEE("%s : failed to eval\n", __func__);
return false;
}
}
// process image with llm
for (int i = 0; i < img.image_tokens; i += n_batch)
{
int n_eval = img.image_tokens - i;
if (n_eval > n_batch)
{
n_eval = n_batch;
}
const int n_embd = llama_n_embd(model);
llama_batch batch_img = {
n_eval,
nullptr,
(img.image_embedding + i * n_embd),
nullptr,
nullptr,
nullptr,
nullptr,
slot.n_past,
1, 0
};
if (llama_decode(ctx, batch_img))
{
LOG_TEE("%s : failed to eval image\n", __func__);
return false;
}
slot.n_past += n_eval;
}
image_idx++;
llama_batch_clear(batch);
// append prefix of next image
const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
slot.params.input_suffix : // no more images, then process suffix prompt
(json)(slot.images[image_idx].prefix_prompt);
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
for (int i = 0; i < (int) append_tokens.size(); ++i)
{
llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
slot.n_past += 1;
}
}
return true;
}
void request_cancel(int task_id)
{
task_server task;
@@ -1452,7 +1223,7 @@ struct llama_server_context
slot->task_id = task.id;
slot->multitask_id = task.multitask_id;
if (!launch_slot_with_data(slot, task.data))
if (!launch_slot_with_data(*slot, task.data))
{
// send error result
send_error(task, "internal_error");
@@ -1670,7 +1441,7 @@ struct llama_server_context
{
for (auto & slot : slots)
{
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
// empty prompt passed -> release the slot and send empty response
// note: infill mode allows empty prompt
@@ -1737,6 +1508,7 @@ struct llama_server_context
std::vector<llama_token> new_tokens(
prompt_tokens.begin(),
prompt_tokens.begin() + slot.params.n_keep);
new_tokens.insert(
new_tokens.end(),
prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
@@ -1762,6 +1534,7 @@ struct llama_server_context
slot.n_past = 0;
slot.n_past_se = 0;
slot.ga_i = 0;
slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
}
else
@@ -1828,13 +1601,14 @@ struct llama_server_context
}
}
int p0 = (int) system_tokens.size() + slot.n_past;
const int p0 = (int) system_tokens.size() + slot.n_past;
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
LOG_INFO("kv cache rm [p0, end)", {
{ "slot_id", slot.id },
{ "task_id", slot.task_id },
{ "p0", p0 }
});
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
LOG_VERBOSE("prompt ingested", {
{"n_past", slot.n_past},
@@ -1842,10 +1616,7 @@ struct llama_server_context
{"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
});
const bool has_images = process_images(slot);
// process the prefix of first image
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
std::vector<llama_token> prefix_tokens = prompt_tokens;
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
@@ -1867,18 +1638,6 @@ struct llama_server_context
slot_npast++;
}
if (has_images && !ingest_images(slot, n_batch))
{
LOG_ERROR("failed processing images", {
{"slot_id", slot.id},
{"task_id", slot.task_id},
});
// FIXME @phymbert: to be properly tested
// early returning without changing the slot state will block the slot for ever
// no one at the moment is checking the return value
return false;
}
// extract the logits only for the last token
if (batch.n_tokens > 0)
{
@@ -1964,7 +1723,7 @@ struct llama_server_context
for (auto & slot : slots)
{
if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
if (slot.state != PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
{
continue;
}
@@ -2105,7 +1864,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" KV cache data type for K (default: f16)\n");
printf(" -ctv TYPE, --cache-type-v TYPE\n");
printf(" KV cache data type for V (default: f16)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-format log output format: json or text (default: json)\n");
printf(" --log-disable disables logging to a file.\n");
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
@@ -2563,15 +2321,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
else if (arg == "-ctv" || arg == "--cache-type-v") {
params.cache_type_v = argv[++i];
}
else if(arg == "--mmproj")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.mmproj = argv[i];
}
else if (arg == "--log-format")
{
if (++i >= argc)
@@ -2693,7 +2442,7 @@ static json format_partial_response(
{"content", content},
{"stop", false},
{"slot_id", slot->id},
{"multimodal", llama.multimodal }
{"multimodal", false},
};
if (slot->sparams.n_probs > 0)
@@ -3405,19 +3154,10 @@ int main(int argc, char **argv)
prompt = "";
}
json image_data;
if (body.count("image_data") != 0) {
image_data = body["image_data"];
}
else
{
image_data = "";
}
// create and queue the task
const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id);
llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
// get the result
task_result result = llama.queue_results.recv(task_id);
@@ -3487,18 +3227,6 @@ int main(int argc, char **argv)
return res.set_content(root.dump(), "application/json; charset=utf-8");
});
// GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
// "Bus error: 10" - this is on macOS, it does not crash on Linux
//std::thread t2([&]()
/*{
bool running = true;
while (running)
{
running = llama.update_slots();
}
}*/
//);
if (sparams.n_threads_http < 1) {
// +2 threads for monitoring endpoints
sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);

View file

@@ -1,5 +1,10 @@
#pragma once
#include "llama.h"
#include "common.h"
#include "json.hpp"
#include <string>
#include <vector>
#include <set>
@@ -7,10 +12,6 @@
#include <condition_variable>
#include <unordered_map>
#include "json.hpp"
#include "../llava/clip.h"
using json = nlohmann::json;
extern bool server_verbose;