server : refactoring (wip)
This commit is contained in:
parent
6a87ac3a52
commit
f4e6e7e61f
2 changed files with 190 additions and 461 deletions
|
@ -1,13 +1,9 @@
|
||||||
#include "common.h"
|
|
||||||
#include "llama.h"
|
|
||||||
#include "grammar-parser.h"
|
|
||||||
#include "utils.hpp"
|
#include "utils.hpp"
|
||||||
#include "oai.hpp"
|
#include "oai.hpp"
|
||||||
|
|
||||||
#include "../llava/clip.h"
|
#include "common.h"
|
||||||
#include "../llava/llava.h"
|
#include "llama.h"
|
||||||
|
#include "grammar-parser.h"
|
||||||
#include "stb_image.h"
|
|
||||||
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
// crash the server in debug mode, otherwise send an http 500 error
|
// crash the server in debug mode, otherwise send an http 500 error
|
||||||
|
@ -33,19 +29,6 @@
|
||||||
|
|
||||||
using json = nlohmann::json;
|
using json = nlohmann::json;
|
||||||
|
|
||||||
struct server_params {
|
|
||||||
std::string hostname = "127.0.0.1";
|
|
||||||
std::vector<std::string> api_keys;
|
|
||||||
std::string public_path = "examples/server/public";
|
|
||||||
std::string chat_template = "";
|
|
||||||
int32_t port = 8080;
|
|
||||||
int32_t read_timeout = 600;
|
|
||||||
int32_t write_timeout = 600;
|
|
||||||
bool slots_endpoint = true;
|
|
||||||
bool metrics_endpoint = false;
|
|
||||||
int n_threads_http = -1;
|
|
||||||
};
|
|
||||||
|
|
||||||
bool server_verbose = false;
|
bool server_verbose = false;
|
||||||
bool server_log_json = true;
|
bool server_log_json = true;
|
||||||
|
|
||||||
|
@ -80,16 +63,20 @@ struct slot_params {
|
||||||
json input_suffix;
|
json input_suffix;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct slot_image {
|
struct server_params {
|
||||||
int32_t id;
|
int32_t port = 8080;
|
||||||
|
int32_t read_timeout = 600;
|
||||||
|
int32_t write_timeout = 600;
|
||||||
|
int32_t n_threads_http = -1;
|
||||||
|
|
||||||
bool request_encode_image = false;
|
std::string hostname = "127.0.0.1";
|
||||||
float * image_embedding = nullptr;
|
std::string public_path = "examples/server/public";
|
||||||
int32_t image_tokens = 0;
|
std::string chat_template = "";
|
||||||
|
|
||||||
clip_image_u8 * img_data;
|
std::vector<std::string> api_keys;
|
||||||
|
|
||||||
std::string prefix_prompt; // before of this image
|
bool slots_endpoint = true;
|
||||||
|
bool metrics_endpoint = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct server_slot {
|
struct server_slot {
|
||||||
|
@ -130,13 +117,13 @@ struct server_slot {
|
||||||
bool stopped_limit = false;
|
bool stopped_limit = false;
|
||||||
|
|
||||||
bool oaicompat = false;
|
bool oaicompat = false;
|
||||||
std::string oaicompat_model;
|
|
||||||
|
|
||||||
|
std::string oaicompat_model;
|
||||||
std::string stopping_word;
|
std::string stopping_word;
|
||||||
|
|
||||||
// sampling
|
// sampling
|
||||||
struct llama_sampling_params sparams;
|
struct llama_sampling_params sparams;
|
||||||
llama_sampling_context *ctx_sampling = nullptr;
|
llama_sampling_context * ctx_sampling = nullptr;
|
||||||
|
|
||||||
int32_t ga_i = 0; // group-attention state
|
int32_t ga_i = 0; // group-attention state
|
||||||
int32_t ga_n = 1; // group-attention factor
|
int32_t ga_n = 1; // group-attention factor
|
||||||
|
@ -144,9 +131,6 @@ struct server_slot {
|
||||||
|
|
||||||
int32_t n_past_se = 0; // self-extend
|
int32_t n_past_se = 0; // self-extend
|
||||||
|
|
||||||
// multimodal
|
|
||||||
std::vector<slot_image> images;
|
|
||||||
|
|
||||||
// stats
|
// stats
|
||||||
size_t n_sent_text = 0; // number of sent text character
|
size_t n_sent_text = 0; // number of sent text character
|
||||||
size_t n_sent_token_probs = 0;
|
size_t n_sent_token_probs = 0;
|
||||||
|
@ -176,16 +160,6 @@ struct server_slot {
|
||||||
n_past_se = 0;
|
n_past_se = 0;
|
||||||
|
|
||||||
generated_token_probs.clear();
|
generated_token_probs.clear();
|
||||||
|
|
||||||
for (slot_image & img : images) {
|
|
||||||
free(img.image_embedding);
|
|
||||||
if (img.img_data) {
|
|
||||||
clip_image_u8_free(img.img_data);
|
|
||||||
}
|
|
||||||
img.prefix_prompt = "";
|
|
||||||
}
|
|
||||||
|
|
||||||
images.clear();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool has_budget(gpt_params &global_params) {
|
bool has_budget(gpt_params &global_params) {
|
||||||
|
@ -221,16 +195,14 @@ struct server_slot {
|
||||||
}
|
}
|
||||||
|
|
||||||
void release() {
|
void release() {
|
||||||
if (state == PROCESSING)
|
if (state == PROCESSING) {
|
||||||
{
|
|
||||||
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
|
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
|
||||||
command = RELEASE;
|
command = RELEASE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
json get_formated_timings() {
|
json get_formated_timings() {
|
||||||
return json
|
return json {
|
||||||
{
|
|
||||||
{"prompt_n", n_prompt_tokens_processed},
|
{"prompt_n", n_prompt_tokens_processed},
|
||||||
{"prompt_ms", t_prompt_processing},
|
{"prompt_ms", t_prompt_processing},
|
||||||
{"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed},
|
{"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed},
|
||||||
|
@ -245,11 +217,14 @@ struct server_slot {
|
||||||
|
|
||||||
void print_timings() const {
|
void print_timings() const {
|
||||||
char buffer[512];
|
char buffer[512];
|
||||||
|
|
||||||
double t_token = t_prompt_processing / n_prompt_tokens_processed;
|
double t_token = t_prompt_processing / n_prompt_tokens_processed;
|
||||||
double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
|
double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
|
||||||
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
|
|
||||||
|
snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
|
||||||
t_prompt_processing, n_prompt_tokens_processed,
|
t_prompt_processing, n_prompt_tokens_processed,
|
||||||
t_token, n_tokens_second);
|
t_token, n_tokens_second);
|
||||||
|
|
||||||
LOG_INFO(buffer, {
|
LOG_INFO(buffer, {
|
||||||
{"slot_id", id},
|
{"slot_id", id},
|
||||||
{"task_id", task_id},
|
{"task_id", task_id},
|
||||||
|
@ -261,9 +236,11 @@ struct server_slot {
|
||||||
|
|
||||||
t_token = t_token_generation / n_decoded;
|
t_token = t_token_generation / n_decoded;
|
||||||
n_tokens_second = 1e3 / t_token_generation * n_decoded;
|
n_tokens_second = 1e3 / t_token_generation * n_decoded;
|
||||||
sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
|
|
||||||
|
snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
|
||||||
t_token_generation, n_decoded,
|
t_token_generation, n_decoded,
|
||||||
t_token, n_tokens_second);
|
t_token, n_tokens_second);
|
||||||
|
|
||||||
LOG_INFO(buffer, {
|
LOG_INFO(buffer, {
|
||||||
{"slot_id", id},
|
{"slot_id", id},
|
||||||
{"task_id", task_id},
|
{"task_id", task_id},
|
||||||
|
@ -273,7 +250,8 @@ struct server_slot {
|
||||||
{"n_tokens_second", n_tokens_second},
|
{"n_tokens_second", n_tokens_second},
|
||||||
});
|
});
|
||||||
|
|
||||||
sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
|
snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
|
||||||
|
|
||||||
LOG_INFO(buffer, {
|
LOG_INFO(buffer, {
|
||||||
{"slot_id", id},
|
{"slot_id", id},
|
||||||
{"task_id", task_id},
|
{"task_id", task_id},
|
||||||
|
@ -315,18 +293,14 @@ struct server_metrics {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_server_context
|
struct llama_server_context {
|
||||||
{
|
llama_model * model = nullptr;
|
||||||
llama_model *model = nullptr;
|
llama_context * ctx = nullptr;
|
||||||
llama_context *ctx = nullptr;
|
|
||||||
|
|
||||||
clip_ctx *clp_ctx = nullptr;
|
|
||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
llama_batch batch;
|
llama_batch batch;
|
||||||
|
|
||||||
bool multimodal = false;
|
|
||||||
bool clean_kv_cache = true;
|
bool clean_kv_cache = true;
|
||||||
bool all_slots_are_idle = false;
|
bool all_slots_are_idle = false;
|
||||||
bool add_bos_token = true;
|
bool add_bos_token = true;
|
||||||
|
@ -351,36 +325,20 @@ struct llama_server_context
|
||||||
|
|
||||||
server_metrics metrics;
|
server_metrics metrics;
|
||||||
|
|
||||||
~llama_server_context()
|
~llama_server_context() {
|
||||||
{
|
if (ctx) {
|
||||||
if (ctx)
|
|
||||||
{
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
ctx = nullptr;
|
ctx = nullptr;
|
||||||
}
|
}
|
||||||
if (model)
|
|
||||||
{
|
if (model) {
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
model = nullptr;
|
model = nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool load_model(const gpt_params ¶ms_)
|
bool load_model(const gpt_params & params_) {
|
||||||
{
|
|
||||||
params = params_;
|
params = params_;
|
||||||
if (!params.mmproj.empty()) {
|
|
||||||
multimodal = true;
|
|
||||||
LOG_INFO("Multi Modal Mode Enabled", {});
|
|
||||||
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
|
|
||||||
if(clp_ctx == nullptr) {
|
|
||||||
LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.n_ctx < 2048) { // request larger context for the image embedding
|
|
||||||
params.n_ctx = 2048;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
if (model == nullptr)
|
if (model == nullptr)
|
||||||
|
@ -389,17 +347,6 @@ struct llama_server_context
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (multimodal) {
|
|
||||||
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
|
|
||||||
const int n_embd_llm = llama_n_embd(model);
|
|
||||||
if (n_embd_clip != n_embd_llm) {
|
|
||||||
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
|
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
n_ctx = llama_n_ctx(ctx);
|
n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
add_bos_token = llama_should_add_bos_token(model);
|
add_bos_token = llama_should_add_bos_token(model);
|
||||||
|
@ -518,19 +465,16 @@ struct llama_server_context
|
||||||
return prompt_tokens;
|
return prompt_tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
server_slot* get_slot(int id) {
|
server_slot * get_slot(int id) {
|
||||||
int64_t t_last = ggml_time_us();
|
int64_t t_last = ggml_time_us();
|
||||||
server_slot *last_used = nullptr;
|
server_slot * last_used = nullptr;
|
||||||
|
|
||||||
for (server_slot & slot : slots)
|
for (server_slot & slot : slots) {
|
||||||
{
|
if (slot.id == id && slot.available()) {
|
||||||
if (slot.id == id && slot.available())
|
|
||||||
{
|
|
||||||
return &slot;
|
return &slot;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (slot.available() && slot.t_last_used < t_last)
|
if (slot.available() && slot.t_last_used < t_last) {
|
||||||
{
|
|
||||||
last_used = &slot;
|
last_used = &slot;
|
||||||
t_last = slot.t_last_used;
|
t_last = slot.t_last_used;
|
||||||
}
|
}
|
||||||
|
@ -539,82 +483,82 @@ struct llama_server_context
|
||||||
return last_used;
|
return last_used;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool launch_slot_with_data(server_slot* &slot, json data) {
|
bool launch_slot_with_data(server_slot & slot, json data) {
|
||||||
slot_params default_params;
|
slot_params default_params;
|
||||||
llama_sampling_params default_sparams;
|
llama_sampling_params default_sparams;
|
||||||
|
|
||||||
if (data.count("__oaicompat") != 0) {
|
if (data.count("__oaicompat") != 0) {
|
||||||
slot->oaicompat = true;
|
slot.oaicompat = true;
|
||||||
slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
|
slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
|
||||||
} else {
|
} else {
|
||||||
slot->oaicompat = false;
|
slot.oaicompat = false;
|
||||||
slot->oaicompat_model = "";
|
slot.oaicompat_model = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
slot->params.stream = json_value(data, "stream", false);
|
slot.params.stream = json_value(data, "stream", false);
|
||||||
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
slot.params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||||
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
slot.params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
||||||
slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||||
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||||
slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
||||||
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||||
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
slot.sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||||
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||||
slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
|
slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
|
||||||
slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
|
slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
|
||||||
slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
|
slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
|
||||||
slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
|
slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
|
||||||
slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
|
slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
|
||||||
slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
|
slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
|
||||||
slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
|
slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
|
||||||
slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
|
slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
|
||||||
slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
|
slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
|
||||||
slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
|
slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
|
||||||
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
|
slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
|
||||||
slot->params.seed = json_value(data, "seed", default_params.seed);
|
slot.params.seed = json_value(data, "seed", default_params.seed);
|
||||||
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||||
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||||
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
||||||
|
|
||||||
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
|
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
|
||||||
// Might be better to reject the request with a 400 ?
|
// Might be better to reject the request with a 400 ?
|
||||||
LOG_WARNING("Max tokens to predict exceeds server configuration", {
|
LOG_WARNING("Max tokens to predict exceeds server configuration", {
|
||||||
{"params.n_predict", slot->params.n_predict},
|
{"params.n_predict", slot.params.n_predict},
|
||||||
{"slot.n_predict", slot->n_predict},
|
{"slot.n_predict", slot.n_predict},
|
||||||
});
|
});
|
||||||
slot->params.n_predict = slot->n_predict;
|
slot.params.n_predict = slot.n_predict;
|
||||||
}
|
}
|
||||||
|
|
||||||
// infill
|
// infill
|
||||||
if (data.count("input_prefix") != 0)
|
if (data.count("input_prefix") != 0)
|
||||||
{
|
{
|
||||||
slot->params.input_prefix = data["input_prefix"];
|
slot.params.input_prefix = data["input_prefix"];
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
slot->params.input_prefix = "";
|
slot.params.input_prefix = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
if (data.count("input_suffix") != 0)
|
if (data.count("input_suffix") != 0)
|
||||||
{
|
{
|
||||||
slot->params.input_suffix = data["input_suffix"];
|
slot.params.input_suffix = data["input_suffix"];
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
slot->params.input_suffix = "";
|
slot.params.input_suffix = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
if (data.count("prompt") != 0)
|
if (data.count("prompt") != 0)
|
||||||
{
|
{
|
||||||
slot->prompt = data["prompt"];
|
slot.prompt = data["prompt"];
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
slot->prompt = "";
|
slot.prompt = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
slot->sparams.penalty_prompt_tokens.clear();
|
slot.sparams.penalty_prompt_tokens.clear();
|
||||||
slot->sparams.use_penalty_prompt_tokens = false;
|
slot.sparams.use_penalty_prompt_tokens = false;
|
||||||
const auto &penalty_prompt = data.find("penalty_prompt");
|
const auto &penalty_prompt = data.find("penalty_prompt");
|
||||||
if (penalty_prompt != data.end())
|
if (penalty_prompt != data.end())
|
||||||
{
|
{
|
||||||
|
@ -622,17 +566,17 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
const auto penalty_prompt_string = penalty_prompt->get<std::string>();
|
const auto penalty_prompt_string = penalty_prompt->get<std::string>();
|
||||||
auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
|
auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
|
||||||
slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
|
slot.sparams.penalty_prompt_tokens.swap(penalty_tokens);
|
||||||
if (slot->params.n_predict > 0)
|
if (slot.params.n_predict > 0)
|
||||||
{
|
{
|
||||||
slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
|
slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict);
|
||||||
}
|
}
|
||||||
slot->sparams.use_penalty_prompt_tokens = true;
|
slot.sparams.use_penalty_prompt_tokens = true;
|
||||||
}
|
}
|
||||||
else if (penalty_prompt->is_array())
|
else if (penalty_prompt->is_array())
|
||||||
{
|
{
|
||||||
const auto n_tokens = penalty_prompt->size();
|
const auto n_tokens = penalty_prompt->size();
|
||||||
slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
|
slot.sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot.params.n_predict));
|
||||||
const int n_vocab = llama_n_vocab(model);
|
const int n_vocab = llama_n_vocab(model);
|
||||||
for (const auto &penalty_token : *penalty_prompt)
|
for (const auto &penalty_token : *penalty_prompt)
|
||||||
{
|
{
|
||||||
|
@ -641,22 +585,22 @@ struct llama_server_context
|
||||||
const auto tok = penalty_token.get<llama_token>();
|
const auto tok = penalty_token.get<llama_token>();
|
||||||
if (tok >= 0 && tok < n_vocab)
|
if (tok >= 0 && tok < n_vocab)
|
||||||
{
|
{
|
||||||
slot->sparams.penalty_prompt_tokens.push_back(tok);
|
slot.sparams.penalty_prompt_tokens.push_back(tok);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
slot->sparams.use_penalty_prompt_tokens = true;
|
slot.sparams.use_penalty_prompt_tokens = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slot->sparams.logit_bias.clear();
|
slot.sparams.logit_bias.clear();
|
||||||
|
|
||||||
if (json_value(data, "ignore_eos", false))
|
if (json_value(data, "ignore_eos", false))
|
||||||
{
|
{
|
||||||
slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto &logit_bias = data.find("logit_bias");
|
const auto & logit_bias = data.find("logit_bias");
|
||||||
if (logit_bias != data.end() && logit_bias->is_array())
|
if (logit_bias != data.end() && logit_bias->is_array())
|
||||||
{
|
{
|
||||||
const int n_vocab = llama_n_vocab(model);
|
const int n_vocab = llama_n_vocab(model);
|
||||||
|
@ -683,7 +627,7 @@ struct llama_server_context
|
||||||
llama_token tok = el[0].get<llama_token>();
|
llama_token tok = el[0].get<llama_token>();
|
||||||
if (tok >= 0 && tok < n_vocab)
|
if (tok >= 0 && tok < n_vocab)
|
||||||
{
|
{
|
||||||
slot->sparams.logit_bias[tok] = bias;
|
slot.sparams.logit_bias[tok] = bias;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (el[0].is_string())
|
else if (el[0].is_string())
|
||||||
|
@ -691,14 +635,14 @@ struct llama_server_context
|
||||||
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
|
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
|
||||||
for (auto tok : toks)
|
for (auto tok : toks)
|
||||||
{
|
{
|
||||||
slot->sparams.logit_bias[tok] = bias;
|
slot.sparams.logit_bias[tok] = bias;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slot->params.antiprompt.clear();
|
slot.params.antiprompt.clear();
|
||||||
|
|
||||||
const auto &stop = data.find("stop");
|
const auto &stop = data.find("stop");
|
||||||
if (stop != data.end() && stop->is_array())
|
if (stop != data.end() && stop->is_array())
|
||||||
|
@ -707,7 +651,7 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
if (!word.empty())
|
if (!word.empty())
|
||||||
{
|
{
|
||||||
slot->params.antiprompt.push_back(word);
|
slot.params.antiprompt.push_back(word);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -723,99 +667,26 @@ struct llama_server_context
|
||||||
sampler_names.emplace_back(sampler_name);
|
sampler_names.emplace_back(sampler_name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
|
slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
|
slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (multimodal)
|
if (slot.ctx_sampling != nullptr)
|
||||||
{
|
{
|
||||||
const auto &images_data = data.find("image_data");
|
llama_sampling_free(slot.ctx_sampling);
|
||||||
if (images_data != data.end() && images_data->is_array())
|
|
||||||
{
|
|
||||||
for (const auto &img : *images_data)
|
|
||||||
{
|
|
||||||
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
|
|
||||||
|
|
||||||
slot_image img_sl;
|
|
||||||
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
|
|
||||||
img_sl.img_data = clip_image_u8_init();
|
|
||||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
|
||||||
{
|
|
||||||
LOG_ERROR("failed to load image", {
|
|
||||||
{"slot_id", slot->id},
|
|
||||||
{"img_sl_id", img_sl.id}
|
|
||||||
});
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
LOG_VERBOSE("image loaded", {
|
slot.ctx_sampling = llama_sampling_init(slot.sparams);
|
||||||
{"slot_id", slot->id},
|
llama_set_rng_seed(ctx, slot.params.seed);
|
||||||
{"img_sl_id", img_sl.id}
|
slot.command = LOAD_PROMPT;
|
||||||
});
|
|
||||||
img_sl.request_encode_image = true;
|
|
||||||
slot->images.push_back(img_sl);
|
|
||||||
}
|
|
||||||
// process prompt
|
|
||||||
// example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
|
|
||||||
if (slot->images.size() > 0 && !slot->prompt.is_array())
|
|
||||||
{
|
|
||||||
std::string prompt = slot->prompt.get<std::string>();
|
|
||||||
size_t pos = 0, begin_prefix = 0;
|
|
||||||
std::string pattern = "[img-";
|
|
||||||
while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
|
|
||||||
size_t end_prefix = pos;
|
|
||||||
pos += pattern.length();
|
|
||||||
size_t end_pos = prompt.find(']', pos);
|
|
||||||
if (end_pos != std::string::npos)
|
|
||||||
{
|
|
||||||
std::string image_id = prompt.substr(pos, end_pos - pos);
|
|
||||||
try
|
|
||||||
{
|
|
||||||
int img_id = std::stoi(image_id);
|
|
||||||
bool found = false;
|
|
||||||
for (slot_image &img : slot->images)
|
|
||||||
{
|
|
||||||
if (img.id == img_id) {
|
|
||||||
found = true;
|
|
||||||
img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
|
|
||||||
begin_prefix = end_pos + 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!found) {
|
|
||||||
LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
|
|
||||||
slot->images.clear();
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
} catch (const std::invalid_argument& e) {
|
|
||||||
LOG_TEE("Invalid image number id in prompt\n");
|
|
||||||
slot->images.clear();
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
slot->prompt = "";
|
|
||||||
slot->params.input_suffix = prompt.substr(begin_prefix);
|
|
||||||
slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (slot->ctx_sampling != nullptr)
|
|
||||||
{
|
|
||||||
llama_sampling_free(slot->ctx_sampling);
|
|
||||||
}
|
|
||||||
slot->ctx_sampling = llama_sampling_init(slot->sparams);
|
|
||||||
llama_set_rng_seed(ctx, slot->params.seed);
|
|
||||||
slot->command = LOAD_PROMPT;
|
|
||||||
|
|
||||||
all_slots_are_idle = false;
|
all_slots_are_idle = false;
|
||||||
|
|
||||||
LOG_INFO("slot is processing task", {
|
LOG_INFO("slot is processing task", {
|
||||||
{"slot_id", slot->id},
|
{"slot_id", slot.id},
|
||||||
{"task_id", slot->task_id},
|
{"task_id", slot.task_id},
|
||||||
});
|
});
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
@ -1038,27 +909,6 @@ struct llama_server_context
|
||||||
return slot.has_next_token; // continue
|
return slot.has_next_token; // continue
|
||||||
}
|
}
|
||||||
|
|
||||||
bool process_images(server_slot &slot) const
|
|
||||||
{
|
|
||||||
for (slot_image &img : slot.images)
|
|
||||||
{
|
|
||||||
if (!img.request_encode_image)
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
|
|
||||||
LOG_TEE("Error processing the given image");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
img.request_encode_image = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return slot.images.size() > 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void send_error(task_server& task, const std::string &error)
|
void send_error(task_server& task, const std::string &error)
|
||||||
{
|
{
|
||||||
LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
|
LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
|
||||||
|
@ -1074,11 +924,10 @@ struct llama_server_context
|
||||||
json get_formated_generation(server_slot &slot)
|
json get_formated_generation(server_slot &slot)
|
||||||
{
|
{
|
||||||
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
|
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
|
||||||
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
|
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
||||||
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
|
||||||
std::vector<std::string> samplers_sequence;
|
std::vector<std::string> samplers_sequence;
|
||||||
for (const auto &sampler_type : slot.sparams.samplers_sequence)
|
for (const auto & sampler_type : slot.sparams.samplers_sequence) {
|
||||||
{
|
|
||||||
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
|
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1131,7 +980,7 @@ struct llama_server_context
|
||||||
{"content", tkn.text_to_send},
|
{"content", tkn.text_to_send},
|
||||||
{"stop", false},
|
{"stop", false},
|
||||||
{"slot_id", slot.id},
|
{"slot_id", slot.id},
|
||||||
{"multimodal", multimodal}
|
{"multimodal", false}
|
||||||
};
|
};
|
||||||
|
|
||||||
if (slot.sparams.n_probs > 0)
|
if (slot.sparams.n_probs > 0)
|
||||||
|
@ -1299,84 +1148,6 @@ struct llama_server_context
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// for multiple images processing
|
|
||||||
bool ingest_images(server_slot &slot, int n_batch)
|
|
||||||
{
|
|
||||||
int image_idx = 0;
|
|
||||||
|
|
||||||
while (image_idx < (int) slot.images.size())
|
|
||||||
{
|
|
||||||
slot_image &img = slot.images[image_idx];
|
|
||||||
|
|
||||||
// process prefix prompt
|
|
||||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
|
|
||||||
{
|
|
||||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
|
||||||
llama_batch batch_view = {
|
|
||||||
n_tokens,
|
|
||||||
batch.token + i,
|
|
||||||
nullptr,
|
|
||||||
batch.pos + i,
|
|
||||||
batch.n_seq_id + i,
|
|
||||||
batch.seq_id + i,
|
|
||||||
batch.logits + i,
|
|
||||||
0, 0, 0, // unused
|
|
||||||
};
|
|
||||||
if (llama_decode(ctx, batch_view))
|
|
||||||
{
|
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// process image with llm
|
|
||||||
for (int i = 0; i < img.image_tokens; i += n_batch)
|
|
||||||
{
|
|
||||||
int n_eval = img.image_tokens - i;
|
|
||||||
if (n_eval > n_batch)
|
|
||||||
{
|
|
||||||
n_eval = n_batch;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int n_embd = llama_n_embd(model);
|
|
||||||
llama_batch batch_img = {
|
|
||||||
n_eval,
|
|
||||||
nullptr,
|
|
||||||
(img.image_embedding + i * n_embd),
|
|
||||||
nullptr,
|
|
||||||
nullptr,
|
|
||||||
nullptr,
|
|
||||||
nullptr,
|
|
||||||
slot.n_past,
|
|
||||||
1, 0
|
|
||||||
};
|
|
||||||
if (llama_decode(ctx, batch_img))
|
|
||||||
{
|
|
||||||
LOG_TEE("%s : failed to eval image\n", __func__);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
slot.n_past += n_eval;
|
|
||||||
}
|
|
||||||
image_idx++;
|
|
||||||
|
|
||||||
llama_batch_clear(batch);
|
|
||||||
|
|
||||||
// append prefix of next image
|
|
||||||
const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
|
|
||||||
slot.params.input_suffix : // no more images, then process suffix prompt
|
|
||||||
(json)(slot.images[image_idx].prefix_prompt);
|
|
||||||
|
|
||||||
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
|
|
||||||
for (int i = 0; i < (int) append_tokens.size(); ++i)
|
|
||||||
{
|
|
||||||
llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
|
|
||||||
slot.n_past += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void request_cancel(int task_id)
|
void request_cancel(int task_id)
|
||||||
{
|
{
|
||||||
task_server task;
|
task_server task;
|
||||||
|
@ -1419,7 +1190,7 @@ struct llama_server_context
|
||||||
switch (task.type)
|
switch (task.type)
|
||||||
{
|
{
|
||||||
case TASK_TYPE_COMPLETION: {
|
case TASK_TYPE_COMPLETION: {
|
||||||
server_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
|
server_slot * slot = get_slot(json_value(task.data, "slot_id", -1));
|
||||||
if (slot == nullptr)
|
if (slot == nullptr)
|
||||||
{
|
{
|
||||||
// if no slot is available, we defer this task for processing later
|
// if no slot is available, we defer this task for processing later
|
||||||
|
@ -1437,7 +1208,7 @@ struct llama_server_context
|
||||||
system_prompt_process(task.data["system_prompt"]);
|
system_prompt_process(task.data["system_prompt"]);
|
||||||
|
|
||||||
// reset cache_tokens for all slots
|
// reset cache_tokens for all slots
|
||||||
for (server_slot &slot : slots)
|
for (server_slot & slot : slots)
|
||||||
{
|
{
|
||||||
slot.cache_tokens.clear();
|
slot.cache_tokens.clear();
|
||||||
slot.n_past = 0;
|
slot.n_past = 0;
|
||||||
|
@ -1452,7 +1223,7 @@ struct llama_server_context
|
||||||
slot->task_id = task.id;
|
slot->task_id = task.id;
|
||||||
slot->multitask_id = task.multitask_id;
|
slot->multitask_id = task.multitask_id;
|
||||||
|
|
||||||
if (!launch_slot_with_data(slot, task.data))
|
if (!launch_slot_with_data(*slot, task.data))
|
||||||
{
|
{
|
||||||
// send error result
|
// send error result
|
||||||
send_error(task, "internal_error");
|
send_error(task, "internal_error");
|
||||||
|
@ -1583,7 +1354,7 @@ struct llama_server_context
|
||||||
task.target_id = -1;
|
task.target_id = -1;
|
||||||
queue_tasks.post(task);
|
queue_tasks.post(task);
|
||||||
|
|
||||||
for (server_slot &slot : slots)
|
for (server_slot & slot : slots)
|
||||||
{
|
{
|
||||||
if (slot.ga_n == 1)
|
if (slot.ga_n == 1)
|
||||||
{
|
{
|
||||||
|
@ -1670,7 +1441,7 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
for (auto & slot : slots)
|
for (auto & slot : slots)
|
||||||
{
|
{
|
||||||
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
|
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
|
||||||
|
|
||||||
// empty prompt passed -> release the slot and send empty response
|
// empty prompt passed -> release the slot and send empty response
|
||||||
// note: infill mode allows empty prompt
|
// note: infill mode allows empty prompt
|
||||||
|
@ -1737,6 +1508,7 @@ struct llama_server_context
|
||||||
std::vector<llama_token> new_tokens(
|
std::vector<llama_token> new_tokens(
|
||||||
prompt_tokens.begin(),
|
prompt_tokens.begin(),
|
||||||
prompt_tokens.begin() + slot.params.n_keep);
|
prompt_tokens.begin() + slot.params.n_keep);
|
||||||
|
|
||||||
new_tokens.insert(
|
new_tokens.insert(
|
||||||
new_tokens.end(),
|
new_tokens.end(),
|
||||||
prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
|
prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
|
||||||
|
@ -1762,12 +1534,13 @@ struct llama_server_context
|
||||||
slot.n_past = 0;
|
slot.n_past = 0;
|
||||||
slot.n_past_se = 0;
|
slot.n_past_se = 0;
|
||||||
slot.ga_i = 0;
|
slot.ga_i = 0;
|
||||||
|
|
||||||
slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
|
slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// push the prompt into the sampling context (do not apply grammar)
|
// push the prompt into the sampling context (do not apply grammar)
|
||||||
for (auto &token : prompt_tokens)
|
for (auto & token : prompt_tokens)
|
||||||
{
|
{
|
||||||
llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
|
llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
|
||||||
}
|
}
|
||||||
|
@ -1828,13 +1601,14 @@ struct llama_server_context
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int p0 = (int) system_tokens.size() + slot.n_past;
|
const int p0 = (int) system_tokens.size() + slot.n_past;
|
||||||
|
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
|
||||||
|
|
||||||
LOG_INFO("kv cache rm [p0, end)", {
|
LOG_INFO("kv cache rm [p0, end)", {
|
||||||
{ "slot_id", slot.id },
|
{ "slot_id", slot.id },
|
||||||
{ "task_id", slot.task_id },
|
{ "task_id", slot.task_id },
|
||||||
{ "p0", p0 }
|
{ "p0", p0 }
|
||||||
});
|
});
|
||||||
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
|
|
||||||
|
|
||||||
LOG_VERBOSE("prompt ingested", {
|
LOG_VERBOSE("prompt ingested", {
|
||||||
{"n_past", slot.n_past},
|
{"n_past", slot.n_past},
|
||||||
|
@ -1842,10 +1616,7 @@ struct llama_server_context
|
||||||
{"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
|
{"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
|
||||||
});
|
});
|
||||||
|
|
||||||
const bool has_images = process_images(slot);
|
std::vector<llama_token> prefix_tokens = prompt_tokens;
|
||||||
|
|
||||||
// process the prefix of first image
|
|
||||||
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
|
|
||||||
|
|
||||||
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
||||||
|
|
||||||
|
@ -1867,18 +1638,6 @@ struct llama_server_context
|
||||||
slot_npast++;
|
slot_npast++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (has_images && !ingest_images(slot, n_batch))
|
|
||||||
{
|
|
||||||
LOG_ERROR("failed processing images", {
|
|
||||||
{"slot_id", slot.id},
|
|
||||||
{"task_id", slot.task_id},
|
|
||||||
});
|
|
||||||
// FIXME @phymbert: to be properly tested
|
|
||||||
// early returning without changing the slot state will block the slot for ever
|
|
||||||
// no one at the moment is checking the return value
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// extract the logits only for the last token
|
// extract the logits only for the last token
|
||||||
if (batch.n_tokens > 0)
|
if (batch.n_tokens > 0)
|
||||||
{
|
{
|
||||||
|
@ -1918,8 +1677,8 @@ struct llama_server_context
|
||||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
||||||
|
|
||||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||||
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
|
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
|
||||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
|
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
|
||||||
|
|
||||||
slot.n_past_se -= bd;
|
slot.n_past_se -= bd;
|
||||||
|
|
||||||
|
@ -1964,7 +1723,7 @@ struct llama_server_context
|
||||||
|
|
||||||
for (auto & slot : slots)
|
for (auto & slot : slots)
|
||||||
{
|
{
|
||||||
if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
|
if (slot.state != PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -2023,7 +1782,7 @@ struct llama_server_context
|
||||||
}
|
}
|
||||||
|
|
||||||
json model_meta() {
|
json model_meta() {
|
||||||
return json{
|
return json {
|
||||||
{"vocab_type", llama_vocab_type(model)},
|
{"vocab_type", llama_vocab_type(model)},
|
||||||
{"n_vocab", llama_n_vocab(model)},
|
{"n_vocab", llama_n_vocab(model)},
|
||||||
{"n_ctx_train", llama_n_ctx_train(model)},
|
{"n_ctx_train", llama_n_ctx_train(model)},
|
||||||
|
@ -2105,7 +1864,6 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||||
printf(" KV cache data type for K (default: f16)\n");
|
printf(" KV cache data type for K (default: f16)\n");
|
||||||
printf(" -ctv TYPE, --cache-type-v TYPE\n");
|
printf(" -ctv TYPE, --cache-type-v TYPE\n");
|
||||||
printf(" KV cache data type for V (default: f16)\n");
|
printf(" KV cache data type for V (default: f16)\n");
|
||||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
|
||||||
printf(" --log-format log output format: json or text (default: json)\n");
|
printf(" --log-format log output format: json or text (default: json)\n");
|
||||||
printf(" --log-disable disables logging to a file.\n");
|
printf(" --log-disable disables logging to a file.\n");
|
||||||
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
|
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
|
||||||
|
@ -2563,15 +2321,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
else if (arg == "-ctv" || arg == "--cache-type-v") {
|
else if (arg == "-ctv" || arg == "--cache-type-v") {
|
||||||
params.cache_type_v = argv[++i];
|
params.cache_type_v = argv[++i];
|
||||||
}
|
}
|
||||||
else if(arg == "--mmproj")
|
|
||||||
{
|
|
||||||
if (++i >= argc)
|
|
||||||
{
|
|
||||||
invalid_param = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
params.mmproj = argv[i];
|
|
||||||
}
|
|
||||||
else if (arg == "--log-format")
|
else if (arg == "--log-format")
|
||||||
{
|
{
|
||||||
if (++i >= argc)
|
if (++i >= argc)
|
||||||
|
@ -2690,10 +2439,10 @@ static json format_partial_response(
|
||||||
) {
|
) {
|
||||||
json res = json
|
json res = json
|
||||||
{
|
{
|
||||||
{"content", content },
|
{"content", content},
|
||||||
{"stop", false},
|
{"stop", false},
|
||||||
{"slot_id", slot->id },
|
{"slot_id", slot->id},
|
||||||
{"multimodal", llama.multimodal }
|
{"multimodal", false},
|
||||||
};
|
};
|
||||||
|
|
||||||
if (slot->sparams.n_probs > 0)
|
if (slot->sparams.n_probs > 0)
|
||||||
|
@ -3405,19 +3154,10 @@ int main(int argc, char **argv)
|
||||||
prompt = "";
|
prompt = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
json image_data;
|
|
||||||
if (body.count("image_data") != 0) {
|
|
||||||
image_data = body["image_data"];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
image_data = "";
|
|
||||||
}
|
|
||||||
|
|
||||||
// create and queue the task
|
// create and queue the task
|
||||||
const int task_id = llama.queue_tasks.get_new_id();
|
const int task_id = llama.queue_tasks.get_new_id();
|
||||||
llama.queue_results.add_waiting_task_id(task_id);
|
llama.queue_results.add_waiting_task_id(task_id);
|
||||||
llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
|
llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
|
||||||
|
|
||||||
// get the result
|
// get the result
|
||||||
task_result result = llama.queue_results.recv(task_id);
|
task_result result = llama.queue_results.recv(task_id);
|
||||||
|
@ -3487,18 +3227,6 @@ int main(int argc, char **argv)
|
||||||
return res.set_content(root.dump(), "application/json; charset=utf-8");
|
return res.set_content(root.dump(), "application/json; charset=utf-8");
|
||||||
});
|
});
|
||||||
|
|
||||||
// GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
|
|
||||||
// "Bus error: 10" - this is on macOS, it does not crash on Linux
|
|
||||||
//std::thread t2([&]()
|
|
||||||
/*{
|
|
||||||
bool running = true;
|
|
||||||
while (running)
|
|
||||||
{
|
|
||||||
running = llama.update_slots();
|
|
||||||
}
|
|
||||||
}*/
|
|
||||||
//);
|
|
||||||
|
|
||||||
if (sparams.n_threads_http < 1) {
|
if (sparams.n_threads_http < 1) {
|
||||||
// +2 threads for monitoring endpoints
|
// +2 threads for monitoring endpoints
|
||||||
sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
|
sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
|
||||||
|
|
|
@ -1,5 +1,10 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include "llama.h"
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#include "json.hpp"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <set>
|
#include <set>
|
||||||
|
@ -7,10 +12,6 @@
|
||||||
#include <condition_variable>
|
#include <condition_variable>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
||||||
#include "json.hpp"
|
|
||||||
|
|
||||||
#include "../llava/clip.h"
|
|
||||||
|
|
||||||
using json = nlohmann::json;
|
using json = nlohmann::json;
|
||||||
|
|
||||||
extern bool server_verbose;
|
extern bool server_verbose;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue