goto production

Wenjing Yu 2024-07-05 15:58:54 -07:00
parent 213701b51a
commit e3e86419ef
7 changed files with 119 additions and 52 deletions


@@ -1687,7 +1687,7 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
     }
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
-    return os.str();
+    return "";
 }
 //


@@ -127,6 +127,21 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
     return formatted;
 }
+void printAntigmaLogo() {
+    std::cout << R"(
+_|_| _| _| _|_|_|_|_| _|_|_| _|_|_| _| _| _|_|
+_| _| _|_| _| _| _| _| _|_| _|_| _| _|
+_|_|_|_| _| _| _| _| _| _| _|_| _| _| _| _|_|_|_|
+_| _| _| _|_| _| _| _| _| _| _| _| _|
+_| _| _| _| _| _|_|_| _|_|_| _| _| _| _|
+)" << '\n';
+}
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
@@ -140,7 +155,7 @@ int main(int argc, char ** argv) {
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("main", "log"));
-    LOG_TEE("Log start\n");
+    // LOG_TEE("Log start\n");
     log_dump_cmdline(argc, argv);
     llama_log_set(llama_log_callback_logTee, nullptr);
 #endif // LOG_DISABLE_LOGS
@@ -182,14 +197,15 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
-    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+    // LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+    printAntigmaLogo();
+    LOG_TEE("Starting with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
-    LOG_TEE("%s: seed = %u\n", __func__, params.seed);
+    // LOG_TEE("%s: seed = %u\n", __func__, params.seed);
     std::mt19937 rng(params.seed);
@@ -452,9 +468,9 @@ int main(int argc, char ** argv) {
             }
         }
     }
-    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
-    LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
-    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+    // LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
+    // LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
+    // LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     // group-attention state
     // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@@ -981,7 +997,7 @@ int main(int argc, char ** argv) {
     llama_backend_free();
 #ifndef LOG_DISABLE_LOGS
-    LOG_TEE("Log end\n");
+    //LOG_TEE("Log end\n");
 #endif // LOG_DISABLE_LOGS
     return 0;
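The main.cpp changes above replace the build/seed log lines with a printAntigmaLogo() banner built from a C++ raw string literal. As a rough illustration only (nothing below is from the commit; the banner art is a placeholder), a raw string keeps a multi-line banner readable because nothing inside R"( ... )" needs escaping:

    // Minimal standalone sketch of the raw-string banner pattern (placeholder art).
    #include <iostream>

    static void print_banner() {
        std::cout << R"(
      +--------------------+
      |   example banner   |
      +--------------------+
    )" << '\n';
    }

    int main() {
        print_banner();   // printed once at startup, before any other output
        return 0;
    }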


@@ -14,6 +14,8 @@
 #endif
 #include <string>
 #include <stdio.h>
+#include <iostream>
 struct rpc_server_params {
     std::string host = "0.0.0.0";
@@ -65,8 +67,24 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
     return true;
 }
+void printAntigmaLogo() {
+    std::cout << R"(
+_|_| _| _| _|_|_|_|_| _|_|_| _|_|_| _| _| _|_|
+_| _| _|_| _| _| _| _| _|_| _|_| _| _|
+_|_|_|_| _| _| _| _| _| _| _|_| _| _| _| _|_|_|_|
+_| _| _| _|_| _| _| _| _| _| _| _| _|
+_| _| _| _| _| _|_|_| _|_|_| _| _| _| _|
+)" << '\n';
+}
 static ggml_backend_t create_backend() {
     ggml_backend_t backend = NULL;
+    printAntigmaLogo();
 #ifdef GGML_USE_CUDA
     fprintf(stderr, "%s: using CUDA backend\n", __func__);
     backend = ggml_backend_cuda_init(0); // init device 0
@@ -127,7 +145,7 @@ int main(int argc, char * argv[]) {
     } else {
         get_backend_memory(&free_mem, &total_mem);
     }
-    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
+    printf("\nStarting Antigma node on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
     start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;


@@ -133,7 +133,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
 #endif
 }
-static ggml_cuda_device_info ggml_cuda_init() {
+static ggml_cuda_device_info cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
     // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
@@ -210,7 +210,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 }
 const ggml_cuda_device_info & ggml_cuda_info() {
-    static ggml_cuda_device_info info = ggml_cuda_init();
+    static ggml_cuda_device_info info = cuda_init();
     return info;
 }
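The rename above leaves the lazy-initialization structure of ggml_cuda_info() untouched: the device info is still held in a function-local static, so cuda_init() runs exactly once, on first use, and C++11 guarantees that initialization is thread-safe. A minimal sketch of that pattern, with made-up names standing in for the ggml types:

    // Sketch of the function-local static ("initialize once on first call") pattern.
    #include <iostream>

    struct device_info {
        int device_count = 0;
    };

    static device_info expensive_init() {
        std::cout << "initializing once\n";   // placeholder for real device discovery
        return device_info{1};
    }

    const device_info & get_info() {
        static device_info info = expensive_init();   // runs only on the first call
        return info;
    }

    int main() {
        get_info();
        get_info();   // no second "initializing once" line
        return 0;
    }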


@@ -289,7 +289,7 @@ static void * ggml_metal_host_malloc(size_t n) {
     return data;
 }
-static struct ggml_metal_context * ggml_metal_init(int n_cb) {
+static struct ggml_metal_context * metal_init(int n_cb) {
     GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 #if TARGET_OS_OSX && !GGML_METAL_NDEBUG
@@ -669,7 +669,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 }
 static void ggml_metal_free(struct ggml_metal_context * ctx) {
-    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
+    //GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
     for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
         [ctx->kernels[i].pipeline release];
@@ -2975,8 +2975,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
 #ifndef GGML_METAL_NDEBUG
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
-        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
-            __func__,
+        GGML_METAL_LOG_INFO("allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
             size_aligned / 1024.0 / 1024.0,
             device.currentAllocatedSize / 1024.0 / 1024.0,
             device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@@ -2987,8 +2986,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
             GGML_METAL_LOG_INFO("\n");
         }
     } else {
-        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
-            __func__,
+        GGML_METAL_LOG_INFO("allocated buffer, size = %8.2f MiB, (%8.2f)\n",
             size_aligned / 1024.0 / 1024.0,
             device.currentAllocatedSize / 1024.0 / 1024.0);
     }
@@ -3219,7 +3217,7 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
 }
 ggml_backend_t ggml_backend_metal_init(void) {
-    struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
+    struct ggml_metal_context * ctx = metal_init(GGML_DEFAULT_N_THREADS);
     if (ctx == NULL) {
         return NULL;


@@ -26,6 +26,10 @@
 # include <unistd.h>
 #endif
 #include <string.h>
+#include <iostream>
+#include <thread>
+#include <atomic>
+#include <chrono>
 #define UNUSED GGML_UNUSED
@@ -1141,6 +1145,24 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
         }
     }
 }
+// Function to update the loading bar
+void loading_bar(std::atomic<bool>& stop_loading) {
+    const char spinner[] = "|/-\\";
+    int pos = 0;
+    while (!stop_loading.load()) { // Keep running until the main thread signals to stop
+        std::cout << "\r" << spinner[pos] << " loading and computing tensor" << std::flush;
+        pos = (pos + 1) % 4;
+        std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Update every 100ms
+    }
+}
+// Function to simulate rpc_serve_client execution
+void mock_rpc_serve_client() {
+    // Simulate a long-running task
+    std::this_thread::sleep_for(std::chrono::seconds(10));
+}
 void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
     std::string host;
     int port;
@@ -1164,13 +1186,22 @@ void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free
     }
     while (true) {
         auto client_socket = socket_accept(server_socket->fd);
+        std::atomic<bool> stop_loading(false);
         if (client_socket == nullptr) {
             fprintf(stderr, "Failed to accept client connection\n");
             return;
         }
-        printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
+        printf("Incoming a new accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
+        // Create a thread to run the loading bar
+        std::thread loading_thread(loading_bar, std::ref(stop_loading));
         rpc_serve_client(backend, client_socket->fd, free_mem, total_mem);
-        printf("Client connection closed\n");
+        // mock_rpc_serve_client();
+        // Signal the loading bar thread to stop and wait for it to finish
+        stop_loading = true;
+        loading_thread.join();
+        printf("\n");
+        printf("Task is done!\n");
+        printf("Client connection closed\n\n");
     }
 #ifdef _WIN32
     WSACleanup();
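The server loop above wraps the blocking rpc_serve_client() call in a spinner thread: an std::atomic<bool> flag is created per connection, a thread runs loading_bar() until the flag flips, and the main thread joins it once the client has been served. A stripped-down sketch of just that pattern (no sockets or ggml; the short sleep stands in for the real work):

    // Sketch of the spinner-thread pattern: atomic stop flag, worker thread, join.
    #include <atomic>
    #include <chrono>
    #include <iostream>
    #include <thread>

    static void spinner(std::atomic<bool>& stop) {
        const char frames[] = "|/-\\";
        int pos = 0;
        while (!stop.load()) {
            std::cout << "\r" << frames[pos] << " working" << std::flush;
            pos = (pos + 1) % 4;
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
        }
    }

    int main() {
        std::atomic<bool> stop(false);
        std::thread t(spinner, std::ref(stop));                 // start spinner before the blocking work
        std::this_thread::sleep_for(std::chrono::seconds(2));   // stands in for rpc_serve_client(...)
        stop = true;                                            // signal the spinner to exit
        t.join();                                               // then wait for it
        std::cout << "\ndone\n";
        return 0;
    }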


@@ -2999,7 +2999,7 @@ static bool llama_kv_cache_init(
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+        // LLAMA_LOG_INFO("%s: %10s KV cache size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         cache.bufs.push_back(buf);
     }
@@ -3709,8 +3709,8 @@ struct llama_model_loader {
             tensor_names.insert(name);
         }
-        LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
-                __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
+        //LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
+                //__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
         // determine file type based on the number of tensors for each quantization and print meta data
         // TODO: make optional
@@ -3777,7 +3777,7 @@ struct llama_model_loader {
             }
         }
-        LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        // LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
         for (int i = 0; i < n_kv; i++) {
             const char * name = gguf_get_key(meta, i);
@@ -3794,7 +3794,7 @@ struct llama_model_loader {
             }
             replace_all(value, "\n", "\\n");
-            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+            //LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }
         // print type counts
@@ -3803,7 +3803,7 @@ struct llama_model_loader {
                 continue;
             }
-            LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            //LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
         }
     }
@@ -5617,7 +5617,7 @@ static void llm_load_vocab(
             }
         );
-        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
+        // LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
     }
     // build token to piece cache
@@ -5634,7 +5634,7 @@ static void llm_load_vocab(
         std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
-        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("Token to piece cache size = %.4f MB\n", size_cache / 1024.0 / 1024.0);
     }
     // Handle per token attributes
@@ -5726,6 +5726,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         return ss.str();
     };
+    /*
     // hparams
     LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
@@ -5820,10 +5821,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
+    */
 }
 // Returns false if cancelled by progress_callback
-static bool llm_load_tensors(
+static bool antigma_load_tensors(
         llama_model_loader & ml,
         llama_model & model,
         int n_gpu_layers,
@@ -7627,7 +7629,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
     }
 #endif
-    if (!llm_load_tensors(
+    if (!antigma_load_tensors(
         ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
         params.progress_callback, params.progress_callback_user_data
     )) {
@@ -18831,12 +18833,14 @@ struct llama_context * llama_new_context_with_model(
         params.seed = time(NULL);
     }
+    /*
     LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
     LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
     LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
+    */
     ctx->abort_callback      = params.abort_callback;
     ctx->abort_callback_data = params.abort_callback_data;
@@ -19003,10 +19007,10 @@ struct llama_context * llama_new_context_with_model(
             memory_size_v += ggml_nbytes(v);
         }
-        LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
-            (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
-            ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
-            ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
+        // LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+            //(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+            //ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
+            //ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
     }
     // graph outputs buffer
@@ -19018,9 +19022,9 @@ struct llama_context * llama_new_context_with_model(
             return nullptr;
         }
-        LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
-            ggml_backend_buffer_name(ctx->buf_output),
-            ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0);
+        //LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
+            //ggml_backend_buffer_name(ctx->buf_output),
+            //ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0);
     }
     // scheduler and compute buffers
@@ -19053,7 +19057,7 @@ struct llama_context * llama_new_context_with_model(
         ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
         if (pipeline_parallel) {
-            LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
+            //LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
        }
         // build worst-case graph
@@ -19074,16 +19078,16 @@ struct llama_context * llama_new_context_with_model(
             ggml_backend_buffer_type_t buft = backend_buft[i];
             size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
             if (size > 1) {
-                LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
-                    ggml_backend_buft_name(buft),
-                    size / 1024.0 / 1024.0);
+                // LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                    // ggml_backend_buft_name(buft),
+                    // size / 1024.0 / 1024.0);
             }
         }
         // note: the number of splits during measure is higher than during inference due to the kv shift
         int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-        LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
-        LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
+        LLAMA_LOG_INFO("TENSORBLOCK graph nodes = %d\n", gf->n_nodes);
+        LLAMA_LOG_INFO("TENSORBLOCK graph splits = %d\n", n_splits);
     }
 }
@@ -21418,14 +21422,14 @@ void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);
     LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
+    // LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    // LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+    //         __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
+    // LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+    //         __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
+    // LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+    //         __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+    LLAMA_LOG_INFO("Antigma timer: total time = %10.2f ms / %5d tokens\n", (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }
 void llama_reset_timings(struct llama_context * ctx) {