diff --git a/Llamaserver.py b/Llamaserver.py
index 2b47b86f1..df4f9b8df 100644
--- a/Llamaserver.py
+++ b/Llamaserver.py
@@ -2,6 +2,7 @@ from queue import Queue
 import threading
 import requests
 import json
+from time import sleep
 
 def print_dict(data):
     for k, v in data.items():
@@ -38,6 +39,8 @@ def make_progress_bar(bar, count, num_requests):
 
 
 def send_request(q, question, event, count, num_requests):
+    delay = 0.1
+
     global bar
 
     data = {'prompt': question}
@@ -58,8 +61,12 @@ def send_request(q, question, event, count, num_requests):
         elif response.status_code == 429 and not q.empty():
             event.set()
             print("Server return too many requests; back off!! Reset event.")
+        else:
+            print(f"Server responded with code {response.status_code}\n")
     except Exception as e:
         print(f"Server returned exception error {e}")
+    sleep(delay)
+    delay *= 2
 
 
 if __name__ == "__main__":
@@ -94,7 +101,6 @@ if __name__ == "__main__":
         t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
         t.start()
         threads.append(t)
-        # input("Any key",)
 
     for thread in threads:
         thread.join()  # wait for all threads to finish
diff --git a/common/log.h b/common/log.h
index e4e1b9f4f..bd0cc549a 100644
--- a/common/log.h
+++ b/common/log.h
@@ -41,6 +41,18 @@
 //  log_set_target( FILE* )
 //   allowing to point at stderr, stdout, or any valid FILE* file handler.
 //
+// One way to find log files is to give a parameter or a name a strange value and then search for it;
+// that is how I discovered llama.log inside build/ and this log.h inside common/
+// (although I don't know why the log isn't called server.log).
+// Note that the server has logging on by default and it can only be disabled, not enabled.
+//
+// This log.h file, in common/, is very useful and informative.
+//
+// As noted below, llama.log is overwritten on every run if the default is used,
+// so it would be preferable to be able to assign a log file when the server is loaded,
+// or to have a flag that generates timestamped logs when required.
+// It isn't very satisfactory to have to add a new filename to the code.
+//
 // --------
 //
 // End of Basic usage.
@@ -112,7 +124,7 @@ inline std::string log_get_pid()
     static std::string pid;
     if (pid.empty())
     {
-        // std::this_thread::get_id() is the most portable way of obtaining a "process id"
+        // std::this_thread::get_id() is the most portable way of obtaining a "process id"
         //  it's not the same as "pid" but is unique enough to solve multiple instances
         //  trying to write to the same log.
         std::stringstream ss;
diff --git a/examples/server/httplib.h b/examples/server/httplib.h
index 4f08c3df9..72d806516 100644
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
@@ -15,11 +15,11 @@
  */
 
 #ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND
-#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5
+#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5 // originally 5
 #endif
 
 #ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT
-#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5 // originally 5
+#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 15 // originally 5
 #endif
 
 #ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND
@@ -31,7 +31,7 @@
 #endif
 
 #ifndef CPPHTTPLIB_READ_TIMEOUT_SECOND
-#define CPPHTTPLIB_READ_TIMEOUT_SECOND 5
+#define CPPHTTPLIB_READ_TIMEOUT_SECOND 15 // originally 5
 #endif
 
 #ifndef CPPHTTPLIB_READ_TIMEOUT_USECOND
@@ -47,7 +47,7 @@
 #endif
 
 #ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND
-#define CPPHTTPLIB_IDLE_INTERVAL_SECOND 0
+#define CPPHTTPLIB_IDLE_INTERVAL_SECOND 5 // originally 0
 #endif
 
 #ifndef CPPHTTPLIB_IDLE_INTERVAL_USECOND
diff --git a/examples/server/json.hpp b/examples/server/json.hpp
index ea945f346..e4c70510b 100644
--- a/examples/server/json.hpp
+++ b/examples/server/json.hpp
@@ -20312,7 +20312,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     basic_json(const basic_json& other)
         : m_type(other.m_type)
     {
-        // check of passed value is valid
+        // check if passed value is valid
        other.assert_invariant();
 
        switch (m_type)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index aa9a3b991..d09277686 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3080,7 +3080,7 @@ int main(int argc, char **argv)
     std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
 
-    svr.set_default_headers({{"Server", "PJllama.cpp"}});
+    svr.set_default_headers({{"Server", "llama.cpp"}});
 
     // CORS preflight (Cross-Origin Resource Sharing)
     svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res)
@@ -3566,6 +3566,22 @@ int main(int argc, char **argv)
         }
     );
     // get a SIGABORT error on exception here as GG says above when in Debug but not in Release
+    // copied from a later server.cpp
+    llama.queue_tasks.on_new_task(std::bind(
+        &llama_server_context::process_single_task, &llama, std::placeholders::_1));
+    llama.queue_tasks.on_finish_multitask(std::bind(
+        &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
+    llama.queue_tasks.on_all_tasks_finished(std::bind(
+        &llama_server_context::run_on_all_tasks_finished, &llama));
+    llama.queue_results.on_multitask_update(std::bind(
+        &llama_server_queue::update_multitask,
+        &llama.queue_tasks,
+        std::placeholders::_1,
+        std::placeholders::_2,
+        std::placeholders::_3
+    ));
+    llama.queue_tasks.start_loop();
+
     t2.join(); // was originally t.join() despite t2 in line 3533 above
 
     llama_backend_free();
diff --git a/llama.cpp b/llama.cpp
index d39ff94c7..2dfa9199d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6718,7 +6718,7 @@ static int llama_decode_internal(
     // if we start defragmenting the cache, the benefit from this will be more important
     kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
     //kv_self.n = llama_kv_cache_cell_max(kv_self);
-
+    // line above and below originally commented out
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
     ggml_allocr_reset(lctx.alloc);
@@ -10741,7 +10741,7 @@ void llama_batch_free(struct llama_batch batch) {
 int32_t llama_decode(
         struct llama_context * ctx,
-          struct llama_batch batch) {
+          struct llama_batch batch) {
     const int ret = llama_decode_internal(*ctx, batch);
     if (ret < 0) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
diff --git a/test.sh b/test.sh
new file mode 100755
index 000000000..3f94491de
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,8 @@
+make -j tests
+export MTL_DEBUG_LAYER=1
+export MTL_SHADER_VALIDATION=1
+export MTL_SHADER_VALIDATION_REPORT_TO_STDERR=1
+export MTL_SHADER_VALIDATION_FAIL_MODE=allow
+export MTL_DEBUG_LAYER_VALIDATE_STORE_ACTIONS=1
+export MTL_DEBUG_LAYER_VALIDATE_LOAD_ACTIONS=1
+./tests/test-backend-ops -b Metal -o MUL_MAT
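
Note on the Llamaserver.py hunks above: the added backoff sleeps for `delay` seconds after the request and then doubles `delay`, but the 429 branch itself does not retry the request. If the intent is to retry on 429, a per-request retry loop along the following lines may be closer to what is wanted. This is only a sketch, not part of the patch; the URL, payload shape, timeout, and retry limit are illustrative placeholders, not values taken from Llamaserver.py.

# Sketch only: retry with exponential backoff on HTTP 429.
import requests
from time import sleep

def post_with_backoff(url, payload, max_retries=5, delay=0.1):
    for _ in range(max_retries):
        try:
            response = requests.post(url, json=payload, timeout=30)
        except requests.RequestException as e:
            print(f"Request failed: {e}")
        else:
            if response.status_code != 429:
                return response   # success or a non-retryable status
            print("Server returned 429; backing off")
        sleep(delay)
        delay *= 2                # wait twice as long before the next attempt
    return None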