common : reimplement logging (#9418)
https://github.com/ggerganov/llama.cpp/pull/9418
This commit is contained in:
parent
e6deac31f7
commit
6262d13e0b
54 changed files with 2092 additions and 2419 deletions
|
@ -4,6 +4,7 @@
|
|||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
|
@ -83,7 +84,9 @@ static void print_date_time() {
|
|||
char buffer[80];
|
||||
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
|
||||
|
||||
printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
|
||||
LOG_INF("\n");
|
||||
LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
// Define a split string function to ...
|
||||
|
@ -106,6 +109,8 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
// number of simultaneous "clients" to simulate
|
||||
const int32_t n_clients = params.n_parallel;
|
||||
|
||||
|
@ -120,12 +125,6 @@ int main(int argc, char ** argv) {
|
|||
|
||||
const bool dump_kv_cache = params.dump_kv_cache;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("parallel", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
@ -138,23 +137,22 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// load the prompts from an external file if there are any
|
||||
if (params.prompt.empty()) {
|
||||
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
||||
LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
||||
} else {
|
||||
// Output each line of the input params.prompts vector and copy to k_prompts
|
||||
int index = 0;
|
||||
printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
|
||||
LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
|
||||
|
||||
std::vector<std::string> prompts = split_string(params.prompt, '\n');
|
||||
for (const auto& prompt : prompts) {
|
||||
k_prompts.resize(index + 1);
|
||||
k_prompts[index] = prompt;
|
||||
index++;
|
||||
printf("%3d prompt: %s\n", index, prompt.c_str());
|
||||
LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
fflush(stderr);
|
||||
LOG_INF("\n\n");
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
|
@ -183,19 +181,19 @@ int main(int argc, char ** argv) {
|
|||
|
||||
const auto t_main_start = ggml_time_us();
|
||||
|
||||
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
|
||||
LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||
LOG_TEE("\n");
|
||||
LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
|
||||
LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||
LOG_INF("\n");
|
||||
|
||||
{
|
||||
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
|
||||
LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
|
||||
|
||||
for (int32_t i = 0; i < n_tokens_system; ++i) {
|
||||
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -204,10 +202,10 @@ int main(int argc, char ** argv) {
|
|||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
LOG_TEE("Processing requests ...\n\n");
|
||||
LOG_INF("Processing requests ...\n\n");
|
||||
|
||||
while (true) {
|
||||
if (dump_kv_cache) {
|
||||
|
@ -238,7 +236,7 @@ int main(int argc, char ** argv) {
|
|||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: clearing the KV cache\n", __func__);
|
||||
LOG_INF("%s: clearing the KV cache\n", __func__);
|
||||
}
|
||||
|
||||
// insert new sequences for decoding
|
||||
|
@ -273,7 +271,7 @@ int main(int argc, char ** argv) {
|
|||
client.n_decoded = 0;
|
||||
client.i_batch = batch.n_tokens - 1;
|
||||
|
||||
LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
|
||||
LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
|
||||
|
||||
g_seq_id += 1;
|
||||
|
||||
|
@ -317,11 +315,11 @@ int main(int argc, char ** argv) {
|
|||
if (ret != 0) {
|
||||
if (n_batch == 1 || ret < 0) {
|
||||
// if you get here, it means the KV cache is full - try increasing it via the context size
|
||||
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
|
||||
LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
|
||||
LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
|
||||
|
||||
n_cache_miss += 1;
|
||||
|
||||
|
@ -332,7 +330,7 @@ int main(int argc, char ** argv) {
|
|||
continue;
|
||||
}
|
||||
|
||||
LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
|
||||
LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
|
||||
|
||||
for (auto & client : clients) {
|
||||
if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
|
||||
|
@ -377,7 +375,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
|
||||
LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
|
||||
client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
|
||||
(t_main_end - client.t_start_prompt) / 1e6,
|
||||
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
|
||||
|
@ -400,19 +398,19 @@ int main(int argc, char ** argv) {
|
|||
|
||||
print_date_time();
|
||||
|
||||
LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||
LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||
if (params.prompt_file.empty()) {
|
||||
params.prompt_file = "used built-in defaults";
|
||||
}
|
||||
LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
|
||||
LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
|
||||
LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
|
||||
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
|
||||
|
||||
LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_TEE("Cache misses: %6d\n", n_cache_miss);
|
||||
LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_INF("Cache misses: %6d\n", n_cache_miss);
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_INF("\n");
|
||||
|
||||
// TODO: print sampling/grammar timings for all clients
|
||||
llama_perf_context_print(ctx);
|
||||
|
@ -424,7 +422,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue