From 5475fc92d7b7ff4b7bb1c0c8bbdb2d7c9f2df8ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bjarke=20Viks=C3=B8e?= <bviksoe@hotmail.com>
Date: Sat, 6 Jul 2024 17:25:52 +0200
Subject: [PATCH] server: Retrieve prompt template in /props

This PR adds the following:
- Expose the model's Jinja2 prompt template in the /props endpoint.
- Change the log level from Error to Warning for the message about template mismatch.

The front-end stands a better chance of actually executing the Jinja template format correctly. The server is currently just guessing it.

Ideally this should have been inside a JSON block that exposes the same key/value pairs as those listed during startup in the "llm_load_print_meta" function.
---
 examples/server/server.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d7fb61812..c7b05cb16 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2605,7 +2605,7 @@ int main(int argc, char ** argv) {
     // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
     if (params.chat_template.empty()) {
         if (!ctx_server.validate_model_chat_template()) {
-            LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+            LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
             params.chat_template = "chatml";
         }
     }
@@ -2967,11 +2967,17 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
+        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+        std::string template_key = "tokenizer.chat_template", curr_tmpl;
+        if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), model_template.data(), model_template.size()) > 0) {
+            curr_tmpl = std::string(model_template.data(), model_template.size());
+        }
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
         json data = {
             { "system_prompt",               ctx_server.system_prompt.c_str() },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
-            { "total_slots",                 ctx_server.params.n_parallel }
+            { "total_slots",                 ctx_server.params.n_parallel },
+            { "model_template",              curr_tmpl.c_str() }
         };
 
         res.set_content(data.dump(), "application/json; charset=utf-8");