Update server.cpp example with correct startup sequence

The HTTP listener start and the health API endpoint are moved before the model loading starts, hence the server can correctly report is loading the model
2024-04-17 18:55:09 +02:00 · 2024-04-17 18:55:09 +02:00 · d9157cdf34
commit d9157cdf34
parent 8dd1ec8b3f
1 changed files with 103 additions and 99 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -3009,6 +3009,108 @@ int main(int argc, char ** argv) {
        log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
    }

+    const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) {
+        server_state current_state = state.load();
+        switch (current_state) {
+            case SERVER_STATE_READY:
+                {
+                    // request slots data using task queue
+                    server_task task;
+                    task.id   = ctx_server.queue_tasks.get_new_id();
+                    task.type = SERVER_TASK_TYPE_METRICS;
+                    task.id_target = -1;
+
+                    ctx_server.queue_results.add_waiting_task_id(task.id);
+                    ctx_server.queue_tasks.post(task);
+
+                    // get the result
+                    server_task_result result = ctx_server.queue_results.recv(task.id);
+                    ctx_server.queue_results.remove_waiting_task_id(task.id);
+
+                    const int n_idle_slots       = result.data["idle"];
+                    const int n_processing_slots = result.data["processing"];
+
+                    json health = {
+                        {"status",           "ok"},
+                        {"slots_idle",       n_idle_slots},
+                        {"slots_processing", n_processing_slots}
+                    };
+
+                    res.status = 200; // HTTP OK
+                    if (sparams.slots_endpoint && req.has_param("include_slots")) {
+                        health["slots"] = result.data["slots"];
+                    }
+
+                    if (n_idle_slots == 0) {
+                        health["status"] = "no slot available";
+                        if (req.has_param("fail_on_no_slot")) {
+                            res.status = 503; // HTTP Service Unavailable
+                        }
+                    }
+
+                    res.set_content(health.dump(), "application/json");
+                    break;
+                }
+            case SERVER_STATE_LOADING_MODEL:
+                {
+                    res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+                } break;
+            case SERVER_STATE_ERROR:
+                {
+                    res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER));
+                } break;
+        }
+    };
+
+    // register Health API routes
+    svr->Get ("/health",              handle_health);
+
+    auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
+        return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
+            res.set_content(reinterpret_cast<const char*>(content), len, mime_type);
+            return false;
+        };
+    };
+
+    //
+    // Router
+    //
+
+    // register static assets routes
+    if (!sparams.public_path.empty()) {
+        // Set the base directory for serving static files
+        svr->set_base_dir(sparams.public_path);
+    }
+
+    // using embedded static files
+    svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+    svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
+    svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
+        json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+
+    //
+    // Start the server
+    //
+    if (sparams.n_threads_http < 1) {
+        // +2 threads for monitoring endpoints
+        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+    }
+    log_data["n_threads_http"] =  std::to_string(sparams.n_threads_http);
+    svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+
+    LOG_INFO("HTTP server listening", log_data);
+
+    // run the HTTP server in a thread - see comment below
+    std::thread t([&]() {
+        if (!svr->listen_after_bind()) {
+            state.store(SERVER_STATE_ERROR);
+            return 1;
+        }
+
+        return 0;
+    });
+
    // load the model
    if (!ctx_server.load_model(params)) {
        state.store(SERVER_STATE_ERROR);
@ -3110,59 +3212,6 @@ int main(int argc, char ** argv) {
    // Route handlers (or controllers)
    //

-    const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) {
-        server_state current_state = state.load();
-        switch (current_state) {
-            case SERVER_STATE_READY:
-                {
-                    // request slots data using task queue
-                    server_task task;
-                    task.id   = ctx_server.queue_tasks.get_new_id();
-                    task.type = SERVER_TASK_TYPE_METRICS;
-                    task.id_target = -1;
-
-                    ctx_server.queue_results.add_waiting_task_id(task.id);
-                    ctx_server.queue_tasks.post(task);
-
-                    // get the result
-                    server_task_result result = ctx_server.queue_results.recv(task.id);
-                    ctx_server.queue_results.remove_waiting_task_id(task.id);
-
-                    const int n_idle_slots       = result.data["idle"];
-                    const int n_processing_slots = result.data["processing"];
-
-                    json health = {
-                        {"status",           "ok"},
-                        {"slots_idle",       n_idle_slots},
-                        {"slots_processing", n_processing_slots}
-                    };
-
-                    res.status = 200; // HTTP OK
-                    if (sparams.slots_endpoint && req.has_param("include_slots")) {
-                        health["slots"] = result.data["slots"];
-                    }
-
-                    if (n_idle_slots == 0) {
-                        health["status"] = "no slot available";
-                        if (req.has_param("fail_on_no_slot")) {
-                            res.status = 503; // HTTP Service Unavailable
-                        }
-                    }
-
-                    res.set_content(health.dump(), "application/json");
-                    break;
-                }
-            case SERVER_STATE_LOADING_MODEL:
-                {
-                    res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
-                } break;
-            case SERVER_STATE_ERROR:
-                {
-                    res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER));
-                } break;
-        }
-    };
-
    const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
        if (!sparams.slots_endpoint) {
            res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED));
@ -3715,32 +3764,8 @@ int main(int argc, char ** argv) {
        return res.set_content(root.dump(), "application/json; charset=utf-8");
    };

-    auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
-        return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
-            res.set_content(reinterpret_cast<const char*>(content), len, mime_type);
-            return false;
-        };
-    };
-
-    //
-    // Router
-    //
-
-    // register static assets routes
-    if (!sparams.public_path.empty()) {
-        // Set the base directory for serving static files
-        svr->set_base_dir(sparams.public_path);
-    }
-
-    // using embedded static files
-    svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
-    svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
-        json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));

    // register API routes
-    svr->Get ("/health",              handle_health);
    svr->Get ("/slots",               handle_slots);
    svr->Get ("/metrics",             handle_metrics);
    svr->Get ("/props",               handle_props);
@ -3756,33 +3781,12 @@ int main(int argc, char ** argv) {
    svr->Post("/v1/embeddings",       handle_embeddings);
    svr->Post("/tokenize",            handle_tokenize);
    svr->Post("/detokenize",          handle_detokenize);
+
    if (!sparams.slot_save_path.empty()) {
        // only enable slot endpoints if slot_save_path is set
        svr->Post("/slots/:id_slot",  handle_slots_action);
    }

-    //
-    // Start the server
-    //
-    if (sparams.n_threads_http < 1) {
-        // +2 threads for monitoring endpoints
-        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
-    }
-    log_data["n_threads_http"] =  std::to_string(sparams.n_threads_http);
-    svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
-
-    LOG_INFO("HTTP server listening", log_data);
-
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]() {
-        if (!svr->listen_after_bind()) {
-            state.store(SERVER_STATE_ERROR);
-            return 1;
-        }
-
-        return 0;
-    });
-
    ctx_server.queue_tasks.on_new_task(std::bind(
        &server_context::process_single_task, &ctx_server, std::placeholders::_1));
    ctx_server.queue_tasks.on_finish_multitask(std::bind(