From e3e86419ef8d5351c213cc1aa7a1979539eaf434 Mon Sep 17 00:00:00 2001
From: Wenjing Yu
Date: Fri, 5 Jul 2024 15:58:54 -0700
Subject: [PATCH] goto production

---
 common/common.cpp           |  2 +-
 examples/main/main.cpp      | 32 +++++++++++++-----
 examples/rpc/rpc-server.cpp | 20 ++++++++++-
 ggml/src/ggml-cuda.cu       |  4 +--
 ggml/src/ggml-metal.m       | 12 +++----
 ggml/src/ggml-rpc.cpp       | 35 ++++++++++++++++++--
 src/llama.cpp               | 66 ++++++++++++++++++++-----------------
 7 files changed, 119 insertions(+), 52 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c548bcb28..9498ecf88 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1687,7 +1687,7 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
     }
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
 
-    return os.str();
+    return "";
 }
 
 //
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 4ef55c1e6..6dbb2188a 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -127,6 +127,21 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector 1)
@@ -981,7 +997,7 @@ int main(int argc, char ** argv) {
     llama_backend_free();
 
 #ifndef LOG_DISABLE_LOGS
-    LOG_TEE("Log end\n");
+    //LOG_TEE("Log end\n");
 #endif // LOG_DISABLE_LOGS
 
     return 0;
diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index 7c15d2aa4..243f7546f 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -14,6 +14,8 @@
 #endif
 #include
 #include
+#include <iostream>
+
 struct rpc_server_params {
     std::string host = "0.0.0.0";
@@ -65,8 +67,24 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
     return true;
 }
 
+void printAntigmaLogo() {
+    std::cout << R"(
+
+
+  _|_|    _|      _|  _|_|_|_|_|  _|_|_|    _|_|_|  _|      _|    _|_|
+_|    _|  _|_|    _|      _|        _|    _|        _|_|  _|_|  _|    _|
+_|_|_|_|  _|  _|  _|      _|        _|    _|  _|_|  _|  _|  _|  _|_|_|_|
+_|    _|  _|    _|_|      _|        _|    _|    _|  _|      _|  _|    _|
+_|    _|  _|      _|      _|      _|_|_|    _|_|_|  _|      _|  _|    _|
+
+
+
+    )" << '\n';
+}
+
 static ggml_backend_t create_backend() {
     ggml_backend_t backend = NULL;
+    printAntigmaLogo();
 #ifdef GGML_USE_CUDA
     fprintf(stderr, "%s: using CUDA backend\n", __func__);
     backend = ggml_backend_cuda_init(0); // init device 0
@@ -127,7 +145,7 @@ int main(int argc, char * argv[]) {
     } else {
         get_backend_memory(&free_mem, &total_mem);
     }
-    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
+    printf("\nStarting Antigma node on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
     start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 1c9ccc8a1..f074749ec 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -133,7 +133,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
 #endif
 }
 
-static ggml_cuda_device_info ggml_cuda_init() {
+static ggml_cuda_device_info cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
     // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
@@ -210,7 +210,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 }
 
 const ggml_cuda_device_info & ggml_cuda_info() {
-    static ggml_cuda_device_info info = ggml_cuda_init();
+    static ggml_cuda_device_info info = cuda_init();
     return info;
 }
 
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 79902c9a8..23b06fea7 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -289,7 +289,7 @@ static void * ggml_metal_host_malloc(size_t n) {
     return data;
 }
 
-static struct ggml_metal_context * ggml_metal_init(int n_cb) {
+static struct ggml_metal_context * metal_init(int n_cb) {
     GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
 #if TARGET_OS_OSX && !GGML_METAL_NDEBUG
@@ -669,7 +669,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 }
 
 static void ggml_metal_free(struct ggml_metal_context * ctx) {
-    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
+    //GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 
     for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
         [ctx->kernels[i].pipeline release];
@@ -2975,8 +2975,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
 #ifndef GGML_METAL_NDEBUG
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
-        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
-                __func__,
+        GGML_METAL_LOG_INFO("allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
                 size_aligned / 1024.0 / 1024.0,
                 device.currentAllocatedSize / 1024.0 / 1024.0,
                 device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@@ -2987,8 +2986,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
             GGML_METAL_LOG_INFO("\n");
         }
     } else {
-        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
-                __func__,
+        GGML_METAL_LOG_INFO("allocated buffer, size = %8.2f MiB, (%8.2f)\n",
                 size_aligned / 1024.0 / 1024.0,
                 device.currentAllocatedSize / 1024.0 / 1024.0);
     }
@@ -3219,7 +3217,7 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
 }
 
 ggml_backend_t ggml_backend_metal_init(void) {
-    struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
+    struct ggml_metal_context * ctx = metal_init(GGML_DEFAULT_N_THREADS);
 
     if (ctx == NULL) {
         return NULL;
diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp
index b01ad2674..7937e4839 100644
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -26,6 +26,10 @@
 #  include
 #endif
 #include
+#include <iostream>
+#include <chrono>
+#include <thread>
+#include <atomic>
 
 #define UNUSED GGML_UNUSED
 
@@ -1141,6 +1145,24 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
     }
 }
 
+// Function to update the loading bar
+void loading_bar(std::atomic<bool>& stop_loading) {
+    const char spinner[] = "|/-\\";
+    int pos = 0;
+
+    while (!stop_loading.load()) { // Keep running until the main thread signals to stop
+        std::cout << "\r" << spinner[pos] << " loading and computing tensor" << std::flush;
+        pos = (pos + 1) % 4;
+        std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Update every 100ms
+    }
+}
+
+// Function to simulate rpc_serve_client execution
+void mock_rpc_serve_client() {
+    // Simulate a long-running task
+    std::this_thread::sleep_for(std::chrono::seconds(10));
+}
+
 void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
     std::string host;
     int port;
@@ -1164,13 +1186,22 @@ void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free
     }
     while (true) {
         auto client_socket = socket_accept(server_socket->fd);
+        std::atomic<bool> stop_loading(false);
         if (client_socket == nullptr) {
            fprintf(stderr, "Failed to accept client connection\n");
            return;
         }
-        printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
+        printf("Accepted a new client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
+        // Create a thread to run the loading bar
+        std::thread loading_thread(loading_bar, std::ref(stop_loading));
         rpc_serve_client(backend, client_socket->fd, free_mem, total_mem);
-        printf("Client connection closed\n");
+        // mock_rpc_serve_client();
+        // Signal the loading bar thread to stop and wait for it to finish
+        stop_loading = true;
+        loading_thread.join();
+        printf("\n");
+        printf("Task is done!\n");
+        printf("Client connection closed\n\n");
     }
 #ifdef _WIN32
     WSACleanup();
diff --git a/src/llama.cpp b/src/llama.cpp
index b770ca5bc..e9fb3c256 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2999,7 +2999,7 @@ static bool llama_kv_cache_init(
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+        // LLAMA_LOG_INFO("%s: %10s KV cache size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         cache.bufs.push_back(buf);
     }
 
@@ -3709,8 +3709,8 @@ struct llama_model_loader {
             tensor_names.insert(name);
         }
 
-        LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
-                __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
+        //LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
+        //__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
 
         // determine file type based on the number of tensors for each quantization and print meta data
         // TODO: make optional
@@ -3777,7 +3777,7 @@ struct llama_model_loader {
                 }
             }
 
-            LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+            // LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
 
             for (int i = 0; i < n_kv; i++) {
                 const char * name = gguf_get_key(meta, i);
@@ -3794,7 +3794,7 @@ struct llama_model_loader {
                 }
                 replace_all(value, "\n", "\\n");
 
-                LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+                //LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
             }
 
             // print type counts
@@ -3803,7 +3803,7 @@ struct llama_model_loader {
                     continue;
                 }
 
-                LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+                //LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
             }
         }
 
@@ -5617,7 +5617,7 @@ static void llm_load_vocab(
             }
         );
 
-        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
+        // LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
     }
 
     // build token to piece cache
@@ -5634,7 +5634,7 @@ static void llm_load_vocab(
 
         std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
 
-        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("Token to piece cache size = %.4f MB\n", size_cache / 1024.0 / 1024.0);
     }
 
     // Handle per token attributes
@@ -5726,6 +5726,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         return ss.str();
     };
 
+    /*
     // hparams
     LLAMA_LOG_INFO("%s: format           = %s\n",     __func__, llama_file_version_name(ml.fver));
     LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch));
@@ -5820,10 +5821,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); } + */ } // Returns false if cancelled by progress_callback -static bool llm_load_tensors( +static bool antigma_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, @@ -7627,7 +7629,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam } #endif - if (!llm_load_tensors( + if (!antigma_load_tensors( ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data )) { @@ -18831,12 +18833,14 @@ struct llama_context * llama_new_context_with_model( params.seed = time(NULL); } + /* LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + */ ctx->abort_callback = params.abort_callback; ctx->abort_callback_data = params.abort_callback_data; @@ -19003,10 +19007,10 @@ struct llama_context * llama_new_context_with_model( memory_size_v += ggml_nbytes(v); } - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + // LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + //(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + //ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + //ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } // graph outputs buffer @@ -19018,9 +19022,9 @@ struct llama_context * llama_new_context_with_model( return nullptr; } - LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name(ctx->buf_output), - ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0); + //LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, + //ggml_backend_buffer_name(ctx->buf_output), + //ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0); } // scheduler and compute buffers @@ -19053,7 +19057,7 @@ struct llama_context * llama_new_context_with_model( ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel); if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched)); + //LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched)); } // build worst-case graph @@ -19074,16 +19078,16 @@ struct llama_context * llama_new_context_with_model( ggml_backend_buffer_type_t buft = backend_buft[i]; size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend); if (size > 1) { - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); + // LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + // ggml_backend_buft_name(buft), + // size / 
             }
         }
 
         // note: the number of splits during measure is higher than during inference due to the kv shift
         int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-        LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, gf->n_nodes);
-        LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
+        LLAMA_LOG_INFO("TENSORBLOCK graph nodes  = %d\n", gf->n_nodes);
+        LLAMA_LOG_INFO("TENSORBLOCK graph splits = %d\n", n_splits);
     }
 }
 
@@ -21418,14 +21422,14 @@ void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);
 
     LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
+    // LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    // LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    //         __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
+    // LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+    //         __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
+    // LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    //         __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
    LLAMA_LOG_INFO("Antigma timer: total time = %10.2f ms / %5d tokens\n", (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
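
For reference, the start_rpc_server change above follows a common spinner pattern: a worker thread animates a progress indicator until the main thread flips an std::atomic<bool> flag and joins the thread. The standalone sketch below illustrates that pattern only; it is not part of the patch, and spinner() and do_work() are hypothetical names, with do_work() standing in for the long-running rpc_serve_client() call.

#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

// Animate a spinner until the caller sets `stop` to true.
static void spinner(std::atomic<bool> & stop) {
    const char frames[] = "|/-\\";
    int pos = 0;
    while (!stop.load()) {
        std::cout << "\r" << frames[pos] << " working" << std::flush;
        pos = (pos + 1) % 4;
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
}

// Stand-in for the long-running request handler (e.g. rpc_serve_client).
static void do_work() {
    std::this_thread::sleep_for(std::chrono::seconds(2));
}

int main() {
    std::atomic<bool> stop(false);
    std::thread t(spinner, std::ref(stop)); // spinner runs concurrently with the work
    do_work();
    stop = true;                            // signal the spinner loop to exit
    t.join();                               // wait for the spinner thread to finish
    std::cout << "\ndone\n";
    return 0;
}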