Trying to understand server.cpp
This commit is contained in:
parent
5249985578
commit
242f0e1c1f
7 changed files with 52 additions and 10 deletions
|
@ -2,6 +2,7 @@ from queue import Queue
|
||||||
import threading
|
import threading
|
||||||
import requests
|
import requests
|
||||||
import json
|
import json
|
||||||
|
from time import sleep
|
||||||
|
|
||||||
def print_dict(data):
|
def print_dict(data):
|
||||||
for k, v in data.items():
|
for k, v in data.items():
|
||||||
|
@ -38,6 +39,8 @@ def make_progress_bar(bar, count, num_requests):
|
||||||
|
|
||||||
def send_request(q, question, event, count, num_requests):
|
def send_request(q, question, event, count, num_requests):
|
||||||
|
|
||||||
|
delay = 0.1
|
||||||
|
|
||||||
global bar
|
global bar
|
||||||
|
|
||||||
data = {'prompt': question}
|
data = {'prompt': question}
|
||||||
|
@ -58,8 +61,12 @@ def send_request(q, question, event, count, num_requests):
|
||||||
elif response.status_code == 429 and not q.empty():
|
elif response.status_code == 429 and not q.empty():
|
||||||
event.set()
|
event.set()
|
||||||
print("Server return too many requests; back off!! Reset event.")
|
print("Server return too many requests; back off!! Reset event.")
|
||||||
|
else:
|
||||||
|
print(f"Server responded with code {response.status_code}\n")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Server returned exception error {e}")
|
print(f"Server returned exception error {e}")
|
||||||
|
sleep(delay)
|
||||||
|
delay *= 2
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
@ -94,7 +101,6 @@ if __name__ == "__main__":
|
||||||
t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
|
t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
|
||||||
t.start()
|
t.start()
|
||||||
threads.append(t)
|
threads.append(t)
|
||||||
# input("Any key",)
|
|
||||||
|
|
||||||
for thread in threads:
|
for thread in threads:
|
||||||
thread.join() # wait for all threads to finish
|
thread.join() # wait for all threads to finish
|
||||||
|
|
14
common/log.h
14
common/log.h
|
@ -41,6 +41,18 @@
|
||||||
// log_set_target( FILE* )
|
// log_set_target( FILE* )
|
||||||
// allowing to point at stderr, stdout, or any valid FILE* file handler.
|
// allowing to point at stderr, stdout, or any valid FILE* file handler.
|
||||||
//
|
//
|
||||||
|
// One way to find log files is to give a parameter or a name a strange value and then search for it
|
||||||
|
// that is how I discovered the llama.log inside build and log.h inside common
|
||||||
|
// although I don't know why it isn't called server.log
|
||||||
|
// note that server has logging on by default and it can only be disabled, not enabled
|
||||||
|
//
|
||||||
|
// and this very useful and informative log.h file is inside common/.
|
||||||
|
//
|
||||||
|
// As appears below, the llama.log is overwritten on every run if the default is used
|
||||||
|
// so it would be preferable to be able to assign a log file when the server is loaded
|
||||||
|
// or to have a flag that allows timestamped logs to be generated when required.
|
||||||
|
// It isn't very satisfactory to have to add a new filename to the code.
|
||||||
|
//
|
||||||
// --------
|
// --------
|
||||||
//
|
//
|
||||||
// End of Basic usage.
|
// End of Basic usage.
|
||||||
|
@ -112,7 +124,7 @@ inline std::string log_get_pid()
|
||||||
static std::string pid;
|
static std::string pid;
|
||||||
if (pid.empty())
|
if (pid.empty())
|
||||||
{
|
{
|
||||||
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
|
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
|
||||||
// it's not the same as "pid" but is unique enough to solve multiple instances
|
// it's not the same as "pid" but is unique enough to solve multiple instances
|
||||||
// trying to write to the same log.
|
// trying to write to the same log.
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
|
|
|
@ -15,11 +15,11 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND
|
#ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND
|
||||||
#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5
|
#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5 // originally 5
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT
|
#ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT
|
||||||
#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5 // originally 5
|
#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 15 // originally 5
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND
|
#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND
|
||||||
|
@ -31,7 +31,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CPPHTTPLIB_READ_TIMEOUT_SECOND
|
#ifndef CPPHTTPLIB_READ_TIMEOUT_SECOND
|
||||||
#define CPPHTTPLIB_READ_TIMEOUT_SECOND 5
|
#define CPPHTTPLIB_READ_TIMEOUT_SECOND 15 // originally 5
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CPPHTTPLIB_READ_TIMEOUT_USECOND
|
#ifndef CPPHTTPLIB_READ_TIMEOUT_USECOND
|
||||||
|
@ -47,7 +47,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND
|
#ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND
|
||||||
#define CPPHTTPLIB_IDLE_INTERVAL_SECOND 0
|
#define CPPHTTPLIB_IDLE_INTERVAL_SECOND 5 // originally 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CPPHTTPLIB_IDLE_INTERVAL_USECOND
|
#ifndef CPPHTTPLIB_IDLE_INTERVAL_USECOND
|
||||||
|
|
|
@ -20312,7 +20312,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
|
||||||
basic_json(const basic_json& other)
|
basic_json(const basic_json& other)
|
||||||
: m_type(other.m_type)
|
: m_type(other.m_type)
|
||||||
{
|
{
|
||||||
// check of passed value is valid
|
// check if passed value is valid
|
||||||
other.assert_invariant();
|
other.assert_invariant();
|
||||||
|
|
||||||
switch (m_type)
|
switch (m_type)
|
||||||
|
|
|
@ -3080,7 +3080,7 @@ int main(int argc, char **argv)
|
||||||
|
|
||||||
std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
|
std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
|
||||||
|
|
||||||
svr.set_default_headers({{"Server", "PJllama.cpp"}});
|
svr.set_default_headers({{"Server", "llama.cpp"}});
|
||||||
|
|
||||||
// CORS preflight (Cross-Origin Resource Sharing)
|
// CORS preflight (Cross-Origin Resource Sharing)
|
||||||
svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res)
|
svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res)
|
||||||
|
@ -3566,6 +3566,22 @@ int main(int argc, char **argv)
|
||||||
}
|
}
|
||||||
); // get a SIGABRT error on exception here as GG says above when in Debug but not in Release
|
); // get a SIGABRT error on exception here as GG says above when in Debug but not in Release
|
||||||
|
|
||||||
|
// copied from a later server.cpp
|
||||||
|
llama.queue_tasks.on_new_task(std::bind(
|
||||||
|
&llama_server_context::process_single_task, &llama, std::placeholders::_1));
|
||||||
|
llama.queue_tasks.on_finish_multitask(std::bind(
|
||||||
|
&llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
|
||||||
|
llama.queue_tasks.on_all_tasks_finished(std::bind(
|
||||||
|
&llama_server_context::run_on_all_tasks_finished, &llama));
|
||||||
|
llama.queue_results.on_multitask_update(std::bind(
|
||||||
|
&llama_server_queue::update_multitask,
|
||||||
|
&llama.queue_tasks,
|
||||||
|
std::placeholders::_1,
|
||||||
|
std::placeholders::_2,
|
||||||
|
std::placeholders::_3
|
||||||
|
));
|
||||||
|
llama.queue_tasks.start_loop();
|
||||||
|
|
||||||
t2.join(); // was originally t.join() despite t2 in line 3533 above
|
t2.join(); // was originally t.join() despite t2 in line 3533 above
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
|
@ -6718,7 +6718,7 @@ static int llama_decode_internal(
|
||||||
// if we start defragmenting the cache, the benefit from this will be more important
|
// if we start defragmenting the cache, the benefit from this will be more important
|
||||||
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
|
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
|
||||||
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
||||||
|
// line above and below originally commented out
|
||||||
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
||||||
|
|
||||||
ggml_allocr_reset(lctx.alloc);
|
ggml_allocr_reset(lctx.alloc);
|
||||||
|
@ -10741,7 +10741,7 @@ void llama_batch_free(struct llama_batch batch) {
|
||||||
|
|
||||||
int32_t llama_decode(
|
int32_t llama_decode(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
struct llama_batch batch) {
|
struct llama_batch batch) {
|
||||||
const int ret = llama_decode_internal(*ctx, batch);
|
const int ret = llama_decode_internal(*ctx, batch);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
|
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
|
||||||
|
|
8
test.sh
Executable file
8
test.sh
Executable file
|
@ -0,0 +1,8 @@
|
||||||
|
make -j tests
|
||||||
|
export MTL_DEBUG_LAYER=1
|
||||||
|
export MTL_SHADER_VALIDATION=1
|
||||||
|
export MTL_SHADER_VALIDATION_REPORT_TO_STDERR=1
|
||||||
|
export MTL_SHADER_VALIDATION_FAIL_MODE=allow
|
||||||
|
export MTL_DEBUG_LAYER_VALIDATE_STORE_ACTIONS=1
|
||||||
|
export MTL_DEBUG_LAYER_VALIDATE_LOAD_ACTIONS=1
|
||||||
|
./tests/test-backend-ops -b Metal -o MUL_MAT
|
Loading…
Add table
Add a link
Reference in a new issue