Merge pull request #4 from anon998/logging
Add the --verbose flag and request logging.
commit 9197674a6b
2 changed files with 147 additions and 14 deletions

CMakeLists.txt
@@ -1,6 +1,11 @@
 set(TARGET server)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp json.hpp httplib.h)
+target_compile_definitions(${TARGET} PRIVATE
+    $<$<CONFIG:Debug>:
+        CPPHTTPLIB_NO_EXCEPTIONS=1
+    >
+)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
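
The new target_compile_definitions block is gated on a CMake generator expression, so CPPHTTPLIB_NO_EXCEPTIONS=1 is defined only for Debug configurations; as the macro name suggests, this builds the bundled cpp-httplib without its exception handling, so failures surface directly when debugging instead of being caught internally.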

server.cpp
@@ -9,6 +9,7 @@ struct server_params
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
+    bool verbose = false;
 };
 
 struct llama_server_context

@@ -34,6 +35,9 @@ struct llama_server_context
 
     std::string stopping_word;
 
+    bool verbose = false;
+    int json_indent = -1;
+
     void rewind() {
         params.antiprompt.clear();
         num_tokens_predicted = 0;

@@ -239,6 +243,9 @@ struct llama_server_context
         if (!embd.empty() && embd.back() == llama_token_eos()) {
             stopping_word = llama_token_to_str(ctx, embd.back());
             has_next_token = false;
+            if (verbose) {
+                fprintf(stderr, "eos token found!\n");
+            }
             return result;
         }
 

@@ -256,6 +263,20 @@ struct llama_server_context
         std::string token_text = llama_token_to_str(ctx, token);
         generated_text += token_text;
 
+        if (verbose) {
+            fprintf(stderr,
+                    "next token: {\n"
+                    "   token: %d,\n"
+                    "   token_text: \"%s\",\n"
+                    "   has_next_token: %d,\n"
+                    "   n_remain: %ld,\n"
+                    "   num_tokens_predicted: %ld,\n"
+                    "   stopping_word: \"%s\",\n"
+                    "}\n",
+                    token, token_text.c_str(), has_next_token, n_remain, num_tokens_predicted,
+                    stopping_word.c_str());
+        }
+
         for (const std::string& word : params.antiprompt) {
             size_t i = generated_text.find(word, generated_text.size() - (word.size() + token_text.size()));
             if (i != std::string::npos) {
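
The `find` offset in the antiprompt loop restricts the scan to the tail of `generated_text`: a stop word can only have been completed by the newest token, so at most `word.size() + token_text.size()` characters from the end need re-checking. A self-contained sketch of the same idea (helper name and underflow guard are illustrative, not from the patch):

    #include <cstdio>
    #include <string>

    // A stop word can end at most (word.size() + token_text.size())
    // characters from the end of the text, so only that window is searched.
    static bool hit_stop_word(const std::string &generated, const std::string &word,
                              const std::string &token_text) {
        const size_t window = word.size() + token_text.size();
        const size_t from = generated.size() > window ? generated.size() - window : 0;
        return generated.find(word, from) != std::string::npos;
    }

    int main() {
        printf("%d\n", hit_stop_word("Hello User:", "User:", ":"));    // 1: ends inside the window
        printf("%d\n", hit_stop_word("User: hi there", "User:", "e")); // 0: match is too far back
    }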

@@ -298,6 +319,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params, const server_params &sparams)
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -v, --verbose         verbose output (default: false)\n");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);

@@ -454,6 +476,8 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
                 break;
             }
             params.lora_base = argv[i];
+        } else if (arg == "-v" || arg == "--verbose") {
+            sparams.verbose = true;
         }
         else
         {
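
For context, `server_params_parse` is a plain index-based loop over `argv`; a reduced sketch of the pattern the new branch slots into (only the verbose flag shown, all other options elided):

    #include <string>

    struct server_params { bool verbose = false; /* ... */ };

    // Reduced sketch of the argv loop in server.cpp's server_params_parse.
    static void parse_args(int argc, char **argv, server_params &sparams) {
        for (int i = 1; i < argc; i++) {
            const std::string arg = argv[i];
            if (arg == "-v" || arg == "--verbose") {
                sparams.verbose = true;
            }
            // ... remaining options handled as in the full parser ...
        }
    }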

@@ -627,7 +651,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &res)
         json data = {
             {"status", "error"},
             {"reason", "You need to pass the prompt"}};
-        res.set_content(data.dump(), "application/json");
+        res.set_content(data.dump(llama.json_indent), "application/json");
         res.status = 400;
         return false;
     }

@@ -639,6 +663,45 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &res)
     {
         llama.params.antiprompt.clear();
     }
+
+    if (llama.verbose) {
+        std::string tmp_stop =
+            std::accumulate(llama.params.antiprompt.begin(), llama.params.antiprompt.end(),
+                            std::string{}, [](std::string a, std::string b) {
+                                return a + (a != "" ? ", \"" : "") + b + "\"";
+                            });
+
+        fprintf(stderr,
+                "-------------------------\n"
+                "/completion parameters: {\n"
+                "    stream: %d,\n"
+                "    frequency_penalty: %f,\n"
+                "    mirostat: %d,\n"
+                "    mirostat_eta: %f,\n"
+                "    mirostat_tau: %f,\n"
+                "    n_keep: %d,\n"
+                "    n_predict: %d,\n"
+                "    penalize_nl: %d,\n"
+                "    presence_penalty: %f,\n"
+                "    repeat_last_n: %d,\n"
+                "    repeat_penalty: %f,\n"
+                "    seed: %d,\n"
+                "    stop: [%s],\n"
+                "    temperature: %f,\n"
+                "    tfs_z: %f,\n"
+                "    top_k: %d,\n"
+                "    top_p: %f,\n"
+                "    typical_p: %f,\n"
+                "}\nPROMPT[%s]\n",
+                llama.stream, llama.params.frequency_penalty, llama.params.mirostat,
+                llama.params.mirostat_eta, llama.params.mirostat_tau, llama.params.n_keep,
+                llama.params.n_predict, llama.params.penalize_nl,
+                llama.params.presence_penalty, llama.params.repeat_last_n,
+                llama.params.repeat_penalty, llama.params.seed, tmp_stop.c_str(),
+                llama.params.temp, llama.params.tfs_z, llama.params.top_k,
+                llama.params.top_p, llama.params.typical_p, llama.params.prompt.c_str());
+    }
 
     return true;
 }
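
The `std::accumulate` call folds the antiprompt list into a quoted, comma-separated string for the log line. The same join pattern in isolation (helper name hypothetical; unlike the patch, this variant also opens the quote on the first element):

    #include <numeric>
    #include <string>
    #include <vector>

    // Join stop words as: "User:", "###"
    static std::string join_quoted(const std::vector<std::string> &words) {
        return std::accumulate(words.begin(), words.end(), std::string{},
                               [](std::string acc, const std::string &w) {
                                   return acc + (acc.empty() ? "\"" : ", \"") + w + "\"";
                               });
    }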

@@ -661,6 +724,44 @@ json format_generation_settings(const llama_server_context& llama) {
     };
 }
 
+std::string log(const Request &req, const Response &res)
+{
+    std::string s;
+
+    s += "============ REQUEST ===========\n";
+    s += "< ";
+    s += req.method;
+    s += " ";
+    s += req.path;
+    s += " ";
+    s += req.version;
+    s += "\n";
+
+    if (!req.body.empty()) {
+        std::string line;
+        std::istringstream stream(req.body);
+        while (std::getline(stream, line)) {
+            s += "< " + line + "\n";
+        }
+    }
+
+    s += "------------ RESPONSE ------------\n> ";
+    s += res.version;
+    s += " ";
+    s += std::to_string(res.status);
+    s += "\n";
+
+    if (!res.body.empty()) {
+        std::string line;
+        std::istringstream stream(res.body);
+        while (std::getline(stream, line)) {
+            s += "> " + line + "\n";
+        }
+    }
+
+    return s;
+}
+
 int main(int argc, char **argv)
 {
     // own arguments required by this example
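
Given the `log()` helper above, a verbose run prints request/response pairs framed like this (method, path, and bodies illustrative; `<` marks request lines, `>` response lines):

    ============ REQUEST ===========
    < POST /completion HTTP/1.1
    < {"prompt": "Hello"}
    ------------ RESPONSE ------------
    > HTTP/1.1 200
    > {"content": " world", "stop": true}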

@@ -676,6 +777,9 @@ int main(int argc, char **argv)
         return 1;
     }
 
+    llama.verbose = sparams.verbose;
+    llama.json_indent = sparams.verbose ? 4 : -1;
+
     // load the model
     if (!llama.loadModel(params))
     {
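
`json_indent` is passed straight to nlohmann::json's `dump()`, where `-1` (the default) produces the compact single-line form and a non-negative value pretty-prints with that many spaces, so verbose mode simply makes every JSON payload human-readable. A minimal illustration:

    #include <string>
    #include "json.hpp"
    using json = nlohmann::json;

    int main() {
        json data = {{"status", "ok"}, {"tokens_predicted", 7}};
        std::string compact = data.dump(-1); // {"status":"ok","tokens_predicted":7}
        std::string pretty  = data.dump(4);  // multi-line, 4-space indent (verbose mode)
    }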

@@ -692,7 +796,8 @@ int main(int argc, char **argv)
             json data = {
                 {"status", "error"},
                 {"reason", "To use completion function, disable embedding mode"}};
-            res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
-                            "application/json");
+            res.set_content(
+                data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
+                "application/json");
             res.status = 400;
             return;

@@ -706,7 +811,8 @@ int main(int argc, char **argv)
 
         if (!llama.loadPrompt()) {
             json data = {{"status", "error"}, {"reason", "Context too long."}};
-            res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
-                            "application/json");
+            res.set_content(
+                data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
+                "application/json");
             res.status = 400;
             return;

@@ -721,12 +827,14 @@ int main(int argc, char **argv)
 
         json data = {{"content", llama.generated_text},
                      {"stop", true},
-                     {"model", llama.params.model_alias },
+                     {"model", llama.params.model_alias},
                      {"tokens_predicted", llama.num_tokens_predicted},
                      {"generation_settings", format_generation_settings(llama)},
                      {"prompt", llama.params.prompt},
                      {"stopping_word", llama.stopping_word}};
-        return res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
+        return res.set_content(
+            data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
+            "application/json");
     } else {
         const auto chunked_content_provider = [&](size_t, DataSink &sink) {
             while (llama.has_next_token) {

@@ -748,9 +856,9 @@ int main(int argc, char **argv)
                         {"generated_text", llama.generated_text}};
                 }
 
-                std::string str =
-                    "data: " +
-                    data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                std::string str = "data: " +
+                    data.dump(llama.json_indent, ' ', false,
+                              json::error_handler_t::replace) +
                     "\n\n";
                 sink.write(str.data(), str.size());
             }
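
The `"data: " ... "\n\n"` wrapper is standard server-sent-events framing: one `data:` field per event, terminated by a blank line. A small helper capturing the shape (name illustrative; assumes the `json` alias from server.cpp):

    // Frame one JSON payload as a server-sent event; the trailing
    // "\n\n" is what marks the end of the event for the client.
    static std::string sse_event(const json &data, int indent) {
        return "data: " +
               data.dump(indent, ' ', false, json::error_handler_t::replace) +
               "\n\n";
    }

Note that with a non-negative indent the payload spans several lines, which strict SSE parsers may treat as separate, unknown fields, so the compact form is the safer one for real clients.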

@@ -768,7 +876,7 @@ int main(int argc, char **argv)
             json body = json::parse(req.body);
             json data = {
                 {"tokens", ::llama_tokenize(llama.ctx, body["content"].get<std::string>(), false) } };
-            return res.set_content(data.dump(), "application/json");
+            return res.set_content(data.dump(llama.json_indent), "application/json");
         });
 
     svr.Post("/embedding", [&llama](const Request &req, Response &res)
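
As the handler shows, `/tokenize` expects a JSON body of the shape `{"content": "Hello world"}` (value illustrative) and answers with `{"tokens": [...]}`.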

@@ -778,14 +886,14 @@ int main(int argc, char **argv)
             json data = {
                 {"embedding", empty}};
             fprintf(stderr, "[llama-server] : You need enable embedding mode adding: --embedding option\n");
-            return res.set_content(data.dump(), "application/json");
+            return res.set_content(data.dump(llama.json_indent), "application/json");
         }
         json body = json::parse(req.body);
         std::string content = body["content"].get<std::string>();
         int threads = body["threads"].get<int>();
         json data = {
             {"embedding", llama.embedding(content, threads) } };
-        return res.set_content(data.dump(), "application/json");
+        return res.set_content(data.dump(llama.json_indent), "application/json");
         });
 
     fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port);

@@ -794,6 +902,26 @@ int main(int argc, char **argv)
         fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n");
     }
 
+    if (llama.verbose) {
+        svr.set_logger([](const Request &req, const Response &res) {
+            fprintf(stderr, "%s", log(req, res).c_str());
+        });
+    }
+
+    svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
+        auto fmt = "500 Internal Server Error\n%s";
+        char buf[BUFSIZ];
+        try {
+            std::rethrow_exception(ep);
+        } catch (std::exception &e) {
+            snprintf(buf, sizeof(buf), fmt, e.what());
+        } catch (...) {
+            snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
+        }
+        res.set_content(buf, "text/plain");
+        res.status = 500;
+    });
+
     // set timeouts and change hostname and port
     svr.set_read_timeout(sparams.read_timeout);
     svr.set_write_timeout(sparams.write_timeout);
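
With the exception handler registered above, any exception escaping a route handler is reported as a plain-text 500, the message truncated to `BUFSIZ` by `snprintf`. A hypothetical route to illustrate (not part of the patch; assumes the `svr`, `Request`, and `Response` already in scope in `main`):

    // Throwing inside a handler now yields the body:
    // "500 Internal Server Error\nmodel not loaded"
    svr.Get("/boom", [](const Request &, Response &) {
        throw std::runtime_error("model not loaded");
    });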