expose timing info in web api

parent 7222877069
commit 1d1111e10f

4 changed files with 17 additions and 16 deletions
expose.cpp
@@ -220,12 +220,12 @@ extern "C"
         return generation_finished;
     }

-    float get_prompt_eval_time() {
-        return prompt_eval_time;
+    float get_last_eval_time() {
+        return last_eval_time;
     }

-    float get_prompt_process_time() {
-        return prompt_process_time;
+    float get_last_process_time() {
+        return last_process_time;
     }

     const char* get_pending_output() {
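These extern "C" getters are what the Python side binds through ctypes (see the koboldcpp.py hunk further down). A minimal sketch of calling the renamed exports directly, assuming the compiled shared library is ./koboldcpp.so (the actual name and path vary by platform and build; koboldcpp.py resolves this itself):

import ctypes
import os

# Minimal sketch, assuming the library was built as ./koboldcpp.so.
handle = ctypes.CDLL(os.path.abspath("koboldcpp.so"))

# ctypes assumes int return types by default; declare c_float so the
# timing values are not misread.
handle.get_last_eval_time.restype = ctypes.c_float
handle.get_last_process_time.restype = ctypes.c_float

print("last eval time:", handle.get_last_eval_time())
print("last process time:", handle.get_last_process_time())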
expose.h (4 changes)
@@ -72,5 +72,5 @@ extern std::string lora_filename;
 extern std::string lora_base;
 extern std::vector<std::string> generated_tokens;
 extern bool generation_finished;
-extern float prompt_eval_time;
-extern float prompt_process_time;
+extern float last_eval_time;
+extern float last_process_time;
gpttype_adapter.cpp
@@ -33,8 +33,8 @@ std::string executable_path = "";
 std::string lora_filename = "";
 std::string lora_base = "";
 bool generation_finished;
-float prompt_process_time;
-float prompt_eval_time;
+float last_process_time = 0;
+float last_eval_time = 0;
 std::vector<std::string> generated_tokens;

 //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
@@ -869,8 +869,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     bool stream_sse = inputs.stream_sse;

     generation_finished = false; // Set current generation status
-    prompt_eval_time = 0;
-    prompt_process_time = 0;
     generated_tokens.clear(); // New Generation, new tokens

     if (params.repeat_last_n < 1)
@@ -1449,8 +1447,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     fflush(stdout);
     output.status = 1;
     generation_finished = true;
-    prompt_eval_time = pt2;
-    prompt_process_time = pt1;
+    last_eval_time = pt2;
+    last_process_time = pt1;
     snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());

     return output;
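Note the effect of the -869,8 hunk: the timings are no longer zeroed at the start of each request, so the globals keep the last completed generation's values (initialized to 0 at startup) and can be read at any moment. A hedged sketch of polling them from the Python side (watch_perf is a hypothetical helper; handle is the ctypes library handle bound as in init_library(), and the interval is arbitrary):

import time

def watch_perf(handle, interval=1.0):
    # Hypothetical monitor loop: the values reflect the last completed
    # generation and stay readable between requests.
    while True:
        print("process:", handle.get_last_process_time(),
              "eval:", handle.get_last_eval_time())
        time.sleep(interval)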
koboldcpp.py (11 changes)
@@ -161,8 +161,8 @@ def init_library():
     handle.new_token.argtypes = [ctypes.c_int]
     handle.get_stream_count.restype = ctypes.c_int
     handle.has_finished.restype = ctypes.c_bool
-    handle.get_prompt_eval_time.restype = ctypes.c_float
-    handle.get_prompt_process_time.restype = ctypes.c_float
+    handle.get_last_eval_time.restype = ctypes.c_float
+    handle.get_last_process_time.restype = ctypes.c_float
     handle.abort_generate.restype = ctypes.c_bool
     handle.get_pending_output.restype = ctypes.c_char_p
@@ -455,6 +455,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         elif self.path.endswith(('/api/extra/version')):
             response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion}).encode())

+        elif self.path.endswith(('/api/extra/perf')):
+            lastp = handle.get_last_process_time()
+            laste = handle.get_last_eval_time()
+            response_body = (json.dumps({"last_process":lastp,"last_eval":laste}).encode())
+
         if response_body is None:
             self.send_response(404)
             self.end_headers()
@@ -532,8 +537,6 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             newprompt = fullprompt

         gen = asyncio.run(self.handle_request(genparams, newprompt, basic_api_flag, kai_sse_stream_flag))
-        gen['prompt_process_time'] = handle.get_prompt_process_time()
-        gen['prompt_eval_time'] = handle.get_prompt_eval_time()
         try:
             self.send_response(200)
             self.end_headers()
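With the /api/extra/perf endpoint in place, any HTTP client can read the timings instead of pulling them out of each generate response. A minimal sketch using only the standard library, assuming a KoboldCpp server running locally on its usual port (adjust host and port to your setup):

import json
import urllib.request

# Query the new perf endpoint added above; port 5001 is an assumption.
with urllib.request.urlopen("http://localhost:5001/api/extra/perf") as resp:
    perf = json.load(resp)

print("last_process:", perf["last_process"])
print("last_eval:", perf["last_eval"])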