diff --git a/expose.cpp b/expose.cpp
index f85f411c4..6b2e36feb 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -220,12 +220,12 @@ extern "C"
         return generation_finished;
     }
 
-    float get_prompt_eval_time() {
-        return prompt_eval_time;
+    float get_last_eval_time() {
+        return last_eval_time;
     }
 
-    float get_prompt_process_time() {
-        return prompt_process_time;
+    float get_last_process_time() {
+        return last_process_time;
     }
 
     const char* get_pending_output() {
diff --git a/expose.h b/expose.h
index fc6949a52..2e88946d7 100644
--- a/expose.h
+++ b/expose.h
@@ -72,5 +72,5 @@ extern std::string lora_filename;
 extern std::string lora_base;
 extern std::vector<std::string> generated_tokens;
 extern bool generation_finished;
-extern float prompt_eval_time;
-extern float prompt_process_time;
+extern float last_eval_time;
+extern float last_process_time;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index ffc0017a2..a6d65133b 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -33,8 +33,8 @@ std::string executable_path = "";
 std::string lora_filename = "";
 std::string lora_base = "";
 bool generation_finished;
-float prompt_process_time;
-float prompt_eval_time;
+float last_process_time = 0;
+float last_eval_time = 0;
 std::vector<std::string> generated_tokens;
 
 //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
@@ -869,8 +869,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     bool stream_sse = inputs.stream_sse;
 
     generation_finished = false; // Set current generation status
-    prompt_eval_time = 0;
-    prompt_process_time = 0;
     generated_tokens.clear(); // New Generation, new tokens
 
     if (params.repeat_last_n < 1)
@@ -1449,8 +1447,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     fflush(stdout);
     output.status = 1;
     generation_finished = true;
-    prompt_eval_time = pt2;
-    prompt_process_time = pt1;
+    last_eval_time = pt2;
+    last_process_time = pt1;
     snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());
 
     return output;
diff --git a/koboldcpp.py b/koboldcpp.py
index fa23e2645..8ca30f9de 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -161,8 +161,8 @@ def init_library():
     handle.new_token.argtypes = [ctypes.c_int]
     handle.get_stream_count.restype = ctypes.c_int
     handle.has_finished.restype = ctypes.c_bool
-    handle.get_prompt_eval_time.restype = ctypes.c_float
-    handle.get_prompt_process_time.restype = ctypes.c_float
+    handle.get_last_eval_time.restype = ctypes.c_float
+    handle.get_last_process_time.restype = ctypes.c_float
     handle.abort_generate.restype = ctypes.c_bool
     handle.get_pending_output.restype = ctypes.c_char_p
 
@@ -455,6 +455,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         elif self.path.endswith(('/api/extra/version')):
            response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion}).encode())
 
+        elif self.path.endswith(('/api/extra/perf')):
+            lastp = handle.get_last_process_time()
+            laste = handle.get_last_eval_time()
+            response_body = (json.dumps({"last_process":lastp,"last_eval":laste}).encode())
+
         if response_body is None:
             self.send_response(404)
             self.end_headers()
@@ -532,8 +537,6 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             newprompt = fullprompt
 
         gen = asyncio.run(self.handle_request(genparams, newprompt, basic_api_flag, kai_sse_stream_flag))
-        gen['prompt_process_time'] = handle.get_prompt_process_time()
-        gen['prompt_eval_time'] = handle.get_prompt_eval_time()
         try:
             self.send_response(200)
             self.end_headers()
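
For reference, a minimal client sketch for the new /api/extra/perf endpoint introduced above. This is an illustration only; the host and port below are assumptions, not part of this change. The response fields last_process and last_eval come from the handler added in koboldcpp.py.

import json
import urllib.request

BASE_URL = "http://localhost:5001"  # assumed host/port; adjust for your setup

# Query the new endpoint; it reports the timings recorded by the most
# recent generation call (last_process_time / last_eval_time).
with urllib.request.urlopen(f"{BASE_URL}/api/extra/perf") as resp:
    perf = json.loads(resp.read().decode())

print("last_process:", perf["last_process"])
print("last_eval:", perf["last_eval"])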