From e3e86419ef8d5351c213cc1aa7a1979539eaf434 Mon Sep 17 00:00:00 2001
From: Wenjing Yu
Date: Fri, 5 Jul 2024 15:58:54 -0700
Subject: [PATCH] goto production

---
 common/common.cpp           |  2 +-
 examples/main/main.cpp      | 32 +++++++++++++-----
 examples/rpc/rpc-server.cpp | 20 ++++++++++-
 ggml/src/ggml-cuda.cu       |  4 +--
 ggml/src/ggml-metal.m       | 12 +++----
 ggml/src/ggml-rpc.cpp       | 35 ++++++++++++++++++--
 src/llama.cpp               | 66 ++++++++++++++++++++-----------------
 7 files changed, 119 insertions(+), 52 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c548bcb28..9498ecf88 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1687,7 +1687,7 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
     }
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
 
-    return os.str();
+    return "";
 }
 
 //
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 4ef55c1e6..6dbb2188a 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -127,6 +127,21 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector 1)
@@ -981,7 +997,7 @@ int main(int argc, char ** argv) {
     llama_backend_free();
 
 #ifndef LOG_DISABLE_LOGS
-    LOG_TEE("Log end\n");
+    //LOG_TEE("Log end\n");
 #endif // LOG_DISABLE_LOGS
 
     return 0;
diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index 7c15d2aa4..243f7546f 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -14,6 +14,8 @@
 #endif
 #include
 #include
+#include <iostream>
+
 struct rpc_server_params {
     std::string host = "0.0.0.0";
@@ -65,8 +67,24 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
     return true;
 }
 
+void printAntigmaLogo() {
+    std::cout << R"(
+
+
+  _|_|    _|      _|  _|_|_|_|_|  _|_|_|    _|_|_|  _|      _|    _|_|
+_|    _|  _|_|    _|      _|        _|    _|        _|_|  _|_|  _|    _|
+_|_|_|_|  _|  _|  _|      _|        _|    _|  _|_|  _|  _|  _|  _|_|_|_|
+_|    _|  _|    _|_|      _|        _|    _|    _|  _|      _|  _|    _|
+_|    _|  _|      _|      _|      _|_|_|    _|_|_|  _|      _|  _|    _|
+
+
+
+    )" << '\n';
+}
+
 static ggml_backend_t create_backend() {
     ggml_backend_t backend = NULL;
+    printAntigmaLogo();
 #ifdef GGML_USE_CUDA
     fprintf(stderr, "%s: using CUDA backend\n", __func__);
     backend = ggml_backend_cuda_init(0); // init device 0
@@ -127,7 +145,7 @@ int main(int argc, char * argv[]) {
     } else {
         get_backend_memory(&free_mem, &total_mem);
     }
-    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
+    printf("\nStarting Antigma node on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
     start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 1c9ccc8a1..f074749ec 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -133,7 +133,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
 #endif
 }
 
-static ggml_cuda_device_info ggml_cuda_init() {
+static ggml_cuda_device_info cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
     // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
@@ -210,7 +210,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 }
 
 const ggml_cuda_device_info & ggml_cuda_info() {
-    static ggml_cuda_device_info info = ggml_cuda_init();
+    static ggml_cuda_device_info info = cuda_init();
     return info;
 }
 
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 79902c9a8..23b06fea7 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -289,7 +289,7 @@ static void * ggml_metal_host_malloc(size_t n) {
     return data;
 }
 
-static struct ggml_metal_context * ggml_metal_init(int n_cb) {
+static struct ggml_metal_context * metal_init(int n_cb) {
     GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
 #if TARGET_OS_OSX && !GGML_METAL_NDEBUG
@@ -669,7 +669,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 }
 
 static void ggml_metal_free(struct ggml_metal_context * ctx) {
-    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
+    //GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 
     for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
         [ctx->kernels[i].pipeline release];
@@ -2975,8 +2975,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
 #ifndef GGML_METAL_NDEBUG
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
-        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
-                __func__,
+        GGML_METAL_LOG_INFO("allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
                 size_aligned / 1024.0 / 1024.0,
                 device.currentAllocatedSize / 1024.0 / 1024.0,
                 device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@@ -2987,8 +2986,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
             GGML_METAL_LOG_INFO("\n");
         }
     } else {
-        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
-                __func__,
+        GGML_METAL_LOG_INFO("allocated buffer, size = %8.2f MiB, (%8.2f)\n",
                 size_aligned / 1024.0 / 1024.0,
                 device.currentAllocatedSize / 1024.0 / 1024.0);
     }
@@ -3219,7 +3217,7 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
 }
 
 ggml_backend_t ggml_backend_metal_init(void) {
-    struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
+    struct ggml_metal_context * ctx = metal_init(GGML_DEFAULT_N_THREADS);
 
     if (ctx == NULL) {
         return NULL;
diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp
index b01ad2674..7937e4839 100644
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -26,6 +26,10 @@
 #  include
 #endif
 #include
+#include <iostream>
+#include <chrono>
+#include <thread>
+#include <atomic>
 
 #define UNUSED GGML_UNUSED
 
@@ -1141,6 +1145,24 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
     }
 }
 
+// Function to update the loading bar
+void loading_bar(std::atomic<bool>& stop_loading) {
+    const char spinner[] = "|/-\\";
+    int pos = 0;
+
+    while (!stop_loading.load()) { // Keep running until the main thread signals to stop
+        std::cout << "\r" << spinner[pos] << " loading and computing tensor" << std::flush;
+        pos = (pos + 1) % 4;
+        std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Update every 100ms
+    }
+}
+
+// Function to simulate rpc_serve_client execution
+void mock_rpc_serve_client() {
+    // Simulate a long-running task
+    std::this_thread::sleep_for(std::chrono::seconds(10));
+}
+
 void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
     std::string host;
     int port;
@@ -1164,13 +1186,22 @@ void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free
     }
     while (true) {
         auto client_socket = socket_accept(server_socket->fd);
+        std::atomic<bool> stop_loading(false);
         if (client_socket == nullptr) {
            fprintf(stderr, "Failed to accept client connection\n");
            return;
         }
-        printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
+        printf("Accepted a new client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
+        // Create a thread to run the loading bar
+        std::thread loading_thread(loading_bar, std::ref(stop_loading));
         rpc_serve_client(backend, client_socket->fd, free_mem, total_mem);
-        printf("Client connection closed\n");
+        // mock_rpc_serve_client();
+        // Signal the loading bar thread to stop and wait for it to finish
+        stop_loading = true;
+        loading_thread.join();
+        printf("\n");
+        printf("Task is done!\n");
+        printf("Client connection closed\n\n");
     }
 #ifdef _WIN32
     WSACleanup();
diff --git a/src/llama.cpp b/src/llama.cpp
index b770ca5bc..e9fb3c256 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2999,7 +2999,7 @@ static bool llama_kv_cache_init(
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+        // LLAMA_LOG_INFO("%s: %10s KV cache size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         cache.bufs.push_back(buf);
     }
 
@@ -3709,8 +3709,8 @@ struct llama_model_loader {
             tensor_names.insert(name);
         }
 
-        LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
-                __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
+        //LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
+        //__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
 
         // determine file type based on the number of tensors for each quantization and print meta data
         // TODO: make optional
@@ -3777,7 +3777,7 @@ struct llama_model_loader {
                 }
             }
 
-            LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+            // LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
 
             for (int i = 0; i < n_kv; i++) {
                 const char * name = gguf_get_key(meta, i);
@@ -3794,7 +3794,7 @@ struct llama_model_loader {
                 }
                 replace_all(value, "\n", "\\n");
 
-                LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+                //LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
             }
 
             // print type counts
@@ -3803,7 +3803,7 @@ struct llama_model_loader {
                     continue;
                 }
 
-                LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+                //LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
             }
         }
 
@@ -5617,7 +5617,7 @@ static void llm_load_vocab(
             }
         );
 
-        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
+        // LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
     }
 
     // build token to piece cache
@@ -5634,7 +5634,7 @@ static void llm_load_vocab(
 
         std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
 
-        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("Token to piece cache size = %.4f MB\n", size_cache / 1024.0 / 1024.0);
     }
 
     // Handle per token attributes
@@ -5726,6 +5726,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         return ss.str();
     };
 
+    /*
     // hparams
     LLAMA_LOG_INFO("%s: format           = %s\n",     __func__, llama_file_version_name(ml.fver));
     LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch));
@@ -5820,10 +5821,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); } + */ } // Returns false if cancelled by progress_callback -static bool llm_load_tensors( +static bool antigma_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, @@ -7627,7 +7629,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam } #endif - if (!llm_load_tensors( + if (!antigma_load_tensors( ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data )) { @@ -18831,12 +18833,14 @@ struct llama_context * llama_new_context_with_model( params.seed = time(NULL); } + /* LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + */ ctx->abort_callback = params.abort_callback; ctx->abort_callback_data = params.abort_callback_data; @@ -19003,10 +19007,10 @@ struct llama_context * llama_new_context_with_model( memory_size_v += ggml_nbytes(v); } - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + // LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + //(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + //ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + //ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } // graph outputs buffer @@ -19018,9 +19022,9 @@ struct llama_context * llama_new_context_with_model( return nullptr; } - LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name(ctx->buf_output), - ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0); + //LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, + //ggml_backend_buffer_name(ctx->buf_output), + //ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0); } // scheduler and compute buffers @@ -19053,7 +19057,7 @@ struct llama_context * llama_new_context_with_model( ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel); if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched)); + //LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched)); } // build worst-case graph @@ -19074,16 +19078,16 @@ struct llama_context * llama_new_context_with_model( ggml_backend_buffer_type_t buft = backend_buft[i]; size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend); if (size > 1) { - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); + // LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + // ggml_backend_buft_name(buft), + // size / 
             }
         }
 
         // note: the number of splits during measure is higher than during inference due to the kv shift
         int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-        LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, gf->n_nodes);
-        LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
+        LLAMA_LOG_INFO("TENSORBLOCK graph nodes  = %d\n", gf->n_nodes);
+        LLAMA_LOG_INFO("TENSORBLOCK graph splits = %d\n", n_splits);
     }
 }
 
@@ -21418,14 +21422,14 @@ void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);
 
     LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
+    // LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    // LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    //         __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
+    // LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+    //         __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
+    // LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    //         __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
    LLAMA_LOG_INFO("Antigma timer: total time = %10.2f ms / %5d tokens\n", (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
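
For reference, the start_rpc_server change above follows a common spinner pattern: a worker thread animates a progress indicator until the main thread flips an std::atomic<bool> flag and joins the thread. The standalone sketch below illustrates that pattern only; it is not part of the patch, and spinner() and do_work() are hypothetical names, with do_work() standing in for the long-running rpc_serve_client() call.

#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

// Animate a spinner until the caller sets `stop` to true.
static void spinner(std::atomic<bool> & stop) {
    const char frames[] = "|/-\\";
    int pos = 0;
    while (!stop.load()) {
        std::cout << "\r" << frames[pos] << " working" << std::flush;
        pos = (pos + 1) % 4;
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
}

// Stand-in for the long-running request handler (e.g. rpc_serve_client).
static void do_work() {
    std::this_thread::sleep_for(std::chrono::seconds(2));
}

int main() {
    std::atomic<bool> stop(false);
    std::thread t(spinner, std::ref(stop)); // spinner runs concurrently with the work
    do_work();
    stop = true;                            // signal the spinner loop to exit
    t.join();                               // wait for the spinner thread to finish
    std::cout << "\ndone\n";
    return 0;
}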