Merge branch 'master' into server-oai-compat

Georgi Gerganov 2023-11-24 10:54:08 +02:00 committed by GitHub
commit 80724eb0e1
10 changed files with 285 additions and 11 deletions

examples/parallel/parallel.cpp

@@ -1,5 +1,5 @@
 // A basic application simulating a server with multiple clients.
-// The clients submite requests to the server and they are processed in parallel.
+// The clients submit requests to the server and they are processed in parallel.
 
 #include "common.h"
 #include "llama.h"
@@ -113,6 +113,8 @@ int main(int argc, char ** argv) {
     // insert new requests as soon as the previous one is done
     const bool cont_batching = params.cont_batching;
 
+    const bool dump_kv_cache = params.dump_kv_cache;
+
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("parallel", "log"));
     LOG_TEE("Log start\n");
@@ -172,6 +174,8 @@ int main(int argc, char ** argv) {
     int32_t n_total_gen  = 0;
     int32_t n_cache_miss = 0;
 
+    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
+
     const auto t_main_start = ggml_time_us();
 
     LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
@@ -201,6 +205,11 @@ int main(int argc, char ** argv) {
     LOG_TEE("Processing requests ...\n\n");
 
     while (true) {
+        if (dump_kv_cache) {
+            llama_kv_cache_view_update(ctx, &kvc_view);
+            dump_kv_cache_view_seqs(kvc_view, 40);
+        }
+
         llama_batch_clear(batch);
 
         // decode any currently ongoing sequences
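
For reference, a minimal standalone sketch of the KV cache view usage these hunks introduce, based on the calls visible above (llama_kv_cache_view_init, llama_kv_cache_view_update, dump_kv_cache_view_seqs); the llama_kv_cache_view_free call at the end is an assumed cleanup counterpart and does not appear in this diff:

#include "common.h"
#include "llama.h"

// Sketch only: periodically print which sequences occupy which KV cache cells.
// ctx is an already-initialized llama_context, n_clients the number of parallel
// sequences, n_steps how many decode iterations to observe.
static void inspect_kv_cache(llama_context * ctx, int n_clients, int n_steps) {
    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);

    for (int i = 0; i < n_steps; ++i) {
        // ... llama_decode() work for the parallel clients would happen here ...

        llama_kv_cache_view_update(ctx, &kvc_view); // refresh the snapshot of cell usage
        dump_kv_cache_view_seqs(kvc_view, 40);      // 40 cells per printed row, as above
    }

    llama_kv_cache_view_free(&kvc_view);            // assumed: releases the view's buffers
}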

examples/server/server.cpp

@@ -1112,6 +1112,7 @@ struct llama_server_context
         std::lock_guard<std::mutex> lock(mutex_results);
         task_result res;
         res.id = id;
+        res.stop = false;
         res.error = true;
         res.result_json = { { "content", error } };
         queue_results.push_back(res);
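
The fields assigned above suggest the shape of the result record queued here; the following is a rough sketch inferred only from this hunk, not the actual definition in server.cpp, and it assumes the nlohmann json alias the server example uses elsewhere:

// Inferred sketch: the result record implied by the assignments above.
struct task_result {
    int  id;           // id of the task this result belongs to
    bool stop;         // presumably marks the final chunk of a task's output; false here
    bool error;        // true when result_json carries an error message instead of content
    json result_json;  // payload, e.g. { { "content", error } } on the error path above
};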
@@ -1284,6 +1285,7 @@ struct llama_server_context
         std::lock_guard<std::mutex> lock(mutex_tasks);
         task_server task;
         task.id = id_gen++;
+        task.target_id = 0;
         task.data = std::move(data);
         task.infill_mode = infill;
         task.embedding_mode = embedding;
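
Likewise, the second hunk implies a queued task record along these lines; again a sketch inferred from the visible assignments rather than copied from server.cpp:

// Inferred sketch of the task record, based only on the fields set above.
struct task_server {
    int  id;              // assigned from id_gen++ under mutex_tasks
    int  target_id;       // id of another task this one refers to; defaulted to 0 here
    json data;            // request payload, moved in to avoid a copy
    bool infill_mode;     // request is an infill request
    bool embedding_mode;  // request is an embedding request
};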