From 5c1980d8d4c4e0c0af77359f81cc44d90b3f250b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jan 2024 09:10:34 +0200 Subject: [PATCH 01/11] server : fix build + rename enums (#4870) --- examples/server/server.cpp | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1cca634d5..4a0714997 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -147,15 +147,15 @@ static std::vector base64_decode(const std::string & encoded_string) // parallel // -enum ServerState { - LOADING_MODEL, // Server is starting up, model not fully loaded yet - READY, // Server is ready and model is loaded - ERROR // An error occurred, load_model failed +enum server_state { + SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet + SERVER_STATE_READY, // Server is ready and model is loaded + SERVER_STATE_ERROR // An error occurred, load_model failed }; enum task_type { - COMPLETION_TASK, - CANCEL_TASK + TASK_TYPE_COMPLETION, + TASK_TYPE_CANCEL, }; struct task_server { @@ -1402,7 +1402,7 @@ struct llama_server_context task.data = std::move(data); task.infill_mode = infill; task.embedding_mode = embedding; - task.type = COMPLETION_TASK; + task.type = TASK_TYPE_COMPLETION; task.multitask_id = multitask_id; // when a completion task's prompt array is not a singleton, we split it into multiple requests @@ -1524,7 +1524,7 @@ struct llama_server_context std::unique_lock lock(mutex_tasks); task_server task; task.id = id_gen++; - task.type = CANCEL_TASK; + task.type = TASK_TYPE_CANCEL; task.target_id = task_id; queue_tasks.push_back(task); condition_tasks.notify_one(); @@ -1560,7 +1560,7 @@ struct llama_server_context queue_tasks.erase(queue_tasks.begin()); switch (task.type) { - case COMPLETION_TASK: { + case TASK_TYPE_COMPLETION: { llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); if (slot == nullptr) { @@ 
-1589,7 +1589,7 @@ struct llama_server_context break; } } break; - case CANCEL_TASK: { // release slot linked with the task id + case TASK_TYPE_CANCEL: { // release slot linked with the task id for (auto & slot : slots) { if (slot.task_id == task.target_id) @@ -2798,24 +2798,24 @@ int main(int argc, char **argv) httplib::Server svr; - std::atomic server_state{LOADING_MODEL}; + std::atomic state{SERVER_STATE_LOADING_MODEL}; svr.set_default_headers({{"Server", "llama.cpp"}, {"Access-Control-Allow-Origin", "*"}, {"Access-Control-Allow-Headers", "content-type"}}); svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) { - ServerState current_state = server_state.load(); + server_state current_state = state.load(); switch(current_state) { - case READY: + case SERVER_STATE_READY: res.set_content(R"({"status": "ok"})", "application/json"); res.status = 200; // HTTP OK break; - case LOADING_MODEL: + case SERVER_STATE_LOADING_MODEL: res.set_content(R"({"status": "loading model"})", "application/json"); res.status = 503; // HTTP Service Unavailable break; - case ERROR: + case SERVER_STATE_ERROR: res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json"); res.status = 500; // HTTP Internal Server Error break; @@ -2891,7 +2891,7 @@ int main(int argc, char **argv) { if (!svr.listen_after_bind()) { - server_state.store(ERROR); + state.store(SERVER_STATE_ERROR); return 1; } @@ -2901,11 +2901,11 @@ int main(int argc, char **argv) // load the model if (!llama.load_model(params)) { - server_state.store(ERROR); + state.store(SERVER_STATE_ERROR); return 1; } else { llama.initialize(); - server_state.store(READY); + state.store(SERVER_STATE_READY); } // Middleware for API key validation From 7a9f75c38b5e62fe27b8a5a3ed823b4a3714024b Mon Sep 17 00:00:00 2001 From: Behnam M <58621210+ibehnam@users.noreply.github.com> Date: Thu, 11 Jan 2024 02:12:05 -0500 Subject: [PATCH 02/11] server : update readme to document the new `/health` 
endpoint (#4866) * added /health endpoint to the server * added comments on the additional /health endpoint * Better handling of server state When the model is being loaded, the server state is `LOADING_MODEL`. If model-loading fails, the server state becomes `ERROR`, otherwise it becomes `READY`. The `/health` endpoint provides more granular messages now according to the server_state value. * initialized server_state * fixed a typo * starting http server before initializing the model * Update server.cpp * Update server.cpp * fixes * fixes * fixes * made ServerState atomic and turned two-line spaces into one-line * updated `server` readme to document the `/health` endpoint too --- examples/server/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/server/README.md b/examples/server/README.md index d85a14f89..dc27e72b9 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -110,6 +110,10 @@ node index.js ``` ## API Endpoints +- **GET** `/health`: Returns the current state of the server: + - `{"status": "loading model"}` if the model is still being loaded. + - `{"status": "error", "error": "Model failed to load"}` if the model failed to load. + - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below. - **POST** `/completion`: Given a `prompt`, it returns the predicted completion. 
From f34432ca1e0b288129390c1db8296a82aaf1e632 Mon Sep 17 00:00:00 2001 From: Erik Scholz Date: Fri, 5 Jan 2024 16:00:00 +0100 Subject: [PATCH 03/11] fix : cuda order of synchronization when setting a buffer (ggml/679) * fix : cuda order of synchronization when setting a buffer * also sync before memcpy --------- Co-authored-by: slaren --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e26260a35..900f7ba4a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -10184,8 +10184,8 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaDeviceSynchronize()); } static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { From c910e3c28a1caee8cb1398143d582dd9ab697e68 Mon Sep 17 00:00:00 2001 From: Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com> Date: Tue, 9 Jan 2024 11:16:37 -0500 Subject: [PATCH 04/11] Fix execlp call (ggml/689) NULL can be an integer constant expression with the value zero; in that case the behavior would be undefined because an argument of the wrong type is passed to the variadic function. 
--- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index adb387100..4a0ec4c44 100644 --- a/ggml.c +++ b/ggml.c @@ -132,7 +132,7 @@ void ggml_print_backtrace(void) { "-ex", "bt -frame-info source-and-location", "-ex", "detach", "-ex", "quit", - NULL); + (char *) NULL); } else { waitpid(pid, NULL, 0); } From e739de790921e6abbc8c70398303cacd74913f61 Mon Sep 17 00:00:00 2001 From: leejet Date: Wed, 10 Jan 2024 21:13:42 +0800 Subject: [PATCH 05/11] ggml : change GGML_MAX_NAME at compile time (ggml/682) * change GGML_MAX_NAME to 128 * allow controlling the value of GGML_MAX_NAME through external macro definitions --- ggml.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml.h b/ggml.h index c55e598b4..b6cc85952 100644 --- a/ggml.h +++ b/ggml.h @@ -218,7 +218,9 @@ #define GGML_MAX_PARAMS 2048 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 10 +#ifndef GGML_MAX_NAME #define GGML_MAX_NAME 64 +#endif #define GGML_MAX_OP_PARAMS 64 #define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_GRAPH_SIZE 2048 From 5362e43962e84d61e20b91f34991d7ccaef4a7d5 Mon Sep 17 00:00:00 2001 From: Jack Mousseau Date: Wed, 10 Jan 2024 06:19:19 -0800 Subject: [PATCH 06/11] metal : wrap each operation in debug group (ggml/690) --- ggml-metal.m | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml-metal.m b/ggml-metal.m index 6c2a8d04e..161906824 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1067,6 +1067,8 @@ bool ggml_metal_graph_compute( GGML_ASSERT(!"unsupported op"); } + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst)]]; + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? 
src0->ne[2] : 0; @@ -2423,6 +2425,8 @@ bool ggml_metal_graph_compute( GGML_ASSERT(false); } } + + [encoder popDebugGroup]; } if (encoder != nil) { From f85a973aa139ae6f37e8b8e1966f1d278b5e0372 Mon Sep 17 00:00:00 2001 From: Timothy Cronin <40186632+4imothy@users.noreply.github.com> Date: Thu, 11 Jan 2024 02:27:48 -0500 Subject: [PATCH 07/11] ggml : remove ggml_cpy_inplace and ggml_cont_inplace (ggml/693) --- ggml.c | 30 ++++++++---------------------- ggml.h | 11 ----------- 2 files changed, 8 insertions(+), 33 deletions(-) diff --git a/ggml.c b/ggml.c index 4a0ec4c44..9c42a45e3 100644 --- a/ggml.c +++ b/ggml.c @@ -4311,13 +4311,13 @@ struct ggml_tensor * ggml_set_2d_inplace( static struct ggml_tensor * ggml_cpy_impl( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * b) { GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { + // inplace is false and either one have a grad is_node = true; } @@ -4341,29 +4341,21 @@ struct ggml_tensor * ggml_cpy( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - return ggml_cpy_impl(ctx, a, b, false); -} - -struct ggml_tensor * ggml_cpy_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_cpy_impl(ctx, a, b, true); + return ggml_cpy_impl(ctx, a, b); } // ggml_cont static struct ggml_tensor * ggml_cont_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { + struct ggml_tensor * a) { bool is_node = false; - if (!inplace && a->grad) { + if (a->grad) { is_node = true; } - struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); ggml_format_name(result, "%s (cont)", a->name); result->op = GGML_OP_CONT; @@ -4376,13 +4368,7 @@ static struct ggml_tensor * ggml_cont_impl( struct ggml_tensor * ggml_cont( struct ggml_context * ctx, struct ggml_tensor * a) { - return ggml_cont_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_cont_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_cont_impl(ctx, a, true); + return ggml_cont_impl(ctx, a); } // make contiguous, with new shape diff --git a/ggml.h b/ggml.h index b6cc85952..127dcef1d 100644 --- a/ggml.h +++ b/ggml.h @@ -1163,22 +1163,11 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); - // a -> b, in-place, return view(b) - GGML_API struct ggml_tensor * ggml_cpy_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - // make contiguous GGML_API struct ggml_tensor * ggml_cont( struct ggml_context * ctx, struct ggml_tensor * a); - // make contiguous, in-place - GGML_API struct ggml_tensor * ggml_cont_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - // make contiguous, with new shape GGML_API struct ggml_tensor * ggml_cont_1d( struct ggml_context * ctx, From 3267c2abc72e34608224408ace3c048831050f97 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jan 2024 09:34:59 +0200 Subject: [PATCH 08/11] metal : fix deprecation warning (ggml/690) --- ggml-metal.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-metal.m b/ggml-metal.m index 161906824..82d68cd1b 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1067,7 +1067,7 @@ bool ggml_metal_graph_compute( GGML_ASSERT(!"unsupported op"); } - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst)]]; + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; const int64_t ne00 = src0 ? 
src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; From 64802ec00d6383784a9dacf616095eaced16c3c3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jan 2024 09:39:08 +0200 Subject: [PATCH 09/11] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index fe7f3202f..3e2c579d5 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -f96711108d55bdbbd277e6be07204dce6a94fb93 +979cc23b345006504cfc1f67c0fdf627805e3319 From 2a7c94db5fb67b2f8882d2d16a11bf5d8d12d397 Mon Sep 17 00:00:00 2001 From: Paul Tsochantaris Date: Thu, 11 Jan 2024 14:31:52 +0000 Subject: [PATCH 10/11] metal : put encoder debug group behind a define (#4873) --- ggml-metal.m | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml-metal.m b/ggml-metal.m index 82d68cd1b..9698e5a79 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1067,7 +1067,9 @@ bool ggml_metal_graph_compute( GGML_ASSERT(!"unsupported op"); } +#ifndef GGML_METAL_NDEBUG [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; +#endif const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? 
src0->ne[1] : 0; @@ -2426,7 +2428,9 @@ bool ggml_metal_graph_compute( } } +#ifndef GGML_METAL_NDEBUG [encoder popDebugGroup]; +#endif } if (encoder != nil) { From 2f043328e3116724d15b915b5c6078e2df860a69 Mon Sep 17 00:00:00 2001 From: Isaac McFadyen Date: Thu, 11 Jan 2024 09:33:26 -0500 Subject: [PATCH 11/11] server : fix typo in model name (#4876) --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4a0714997..860e4e9ae 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2515,7 +2515,7 @@ json oaicompat_completion_params_parse( // // https://platform.openai.com/docs/api-reference/chat/create llama_sampling_params default_sparams; - llama_params["model"] = json_value(body, "model", std::string("uknown")); + llama_params["model"] = json_value(body, "model", std::string("unknown")); llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); llama_params["temperature"] = json_value(body, "temperature", 0.0);