common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
2024-09-15 20:46:12 +03:00 · 2024-09-15 20:46:12 +03:00 · 6262d13e0b
commit 6262d13e0b
parent e6deac31f7
54 changed files with 2092 additions and 2419 deletions
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -1,7 +1,9 @@
 #include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

+#include <algorithm>
 #include <array>
 #include <atomic>
 #include <cmath>
@ -41,7 +43,7 @@ static void write_logfile(
    }

    if (params.hellaswag) {
-        fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
+        LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
        return;
    }

@ -49,7 +51,7 @@ static void write_logfile(

    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
-        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+        LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
        return;
    }
@ -58,7 +60,7 @@ static void write_logfile(
    FILE * logfile = fopen(logfile_path.c_str(), "w");

    if (logfile == NULL) {
-        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }

@ -344,16 +346,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

-    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+    LOG_INF("%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    const int n_ctx = llama_n_ctx(ctx);

    if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+        LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
                n_ctx);
-        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return {std::move(tokens), 0., {}, {}};
    }

@ -364,16 +366,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    prob_history.resize(tokens.size());

    if (params.ppl_stride <= 0) {
-        fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+        LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
        return {tokens, -1, logit_history, prob_history};
    }

    const int calc_chunk = n_ctx;

-    fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
+    LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);

    if (int(tokens.size()) <= calc_chunk) {
-        fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+        LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
                tokens.size(), n_ctx, params.ppl_stride);
        return {tokens, -1, logit_history, prob_history};
    }
@ -387,14 +389,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    int count = 0;
    double nll = 0.0;

-    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+    LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);

    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * params.ppl_stride;
        const int end   = start + calc_chunk;

        const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
-        //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
+        //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);

        std::vector<float> logits;

@ -407,10 +409,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

-            //fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+            //LOG_DBG("    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
            // TODO: use llama_batch.logits instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                //fprintf(stderr, "%s : failed to eval\n", __func__);
+                //LOG_ERR("%s : failed to eval\n", __func__);
                return {tokens, -1, logit_history, prob_history};
            }

@ -434,16 +436,17 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

        if (i == 0) {
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+            LOG("%.2f minutes\n", total_seconds / 60.0);
        }
+        LOG("\n");

-        //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
+        //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
        for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {

            // Calculate probability of next token, given the previous ones.
@ -460,13 +463,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
        }
        // perplexity is e^(average negative log-likelihood)
        if (params.ppl_output_type == 0) {
-            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
        } else {
-            printf("%8d  %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+            LOG("%8d  %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
        }
-        fflush(stdout);
    }
-    printf("\n");
+    LOG("\n");

    return {tokens, std::exp(nll / count), logit_history, prob_history};
 }
@ -488,26 +490,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    if (!params.logits_file.empty()) {
        logits_stream.open(params.logits_file.c_str(), std::ios::binary);
        if (!logits_stream.is_open()) {
-            fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
+            LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
            return {};
        }
-        fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
+        LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
        logits_stream.write("_logits_", 8);
        logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
    }

    auto tim1 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+    LOG_INF("%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    auto tim2 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

    if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+        LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
                n_ctx);
-        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return {std::move(tokens), 0., {}, {}};
    }

@ -540,7 +542,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        logits.reserve((size_t)n_ctx * n_vocab);
    }

-    fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
+    LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

@ -613,7 +615,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            }

            if (llama_decode(ctx, batch)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
+                LOG_INF("%s : failed to eval\n", __func__);
                return {tokens, -1, logit_history, prob_history};
            }

@ -628,14 +630,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            llama_synchronize(ctx);
            const auto t_end = std::chrono::high_resolution_clock::now();
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total*n_chunk/n_seq);
            if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+            LOG("%.2f minutes\n", total_seconds / 60.0);
        }
+        LOG("\n");

        for (int seq = 0; seq < n_seq_batch; seq++) {
            const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
@ -656,19 +659,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

            // perplexity is e^(average negative log-likelihood)
            if (params.ppl_output_type == 0) {
-                printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
+                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
            } else {
                double av = nll/count;
                double av2 = nll2/count - av*av;
                if (av2 > 0) av2 = sqrt(av2/(count-1));
-                printf("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+                LOG("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
            }
        }
-        fflush(stdout);

        logits.clear();
    }
-    printf("\n");
+    LOG("\n");

    nll2 /= count;
    nll /= count;
@ -676,9 +678,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    nll2 -= nll * nll;
    if (nll2 > 0) {
        nll2 = sqrt(nll2/(count-1));
-        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+        LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
    } else {
-        printf("Unexpected negative standard deviation of log(prob)\n");
+        LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
    }

    llama_batch_free(batch);
@ -704,7 +706,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<

        const int ret = llama_decode(ctx, batch_view);
        if (ret != 0) {
-            LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+            LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
            return false;
        }

@ -790,15 +792,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    }

    if (prompt_lines.size() % 6 != 0) {
-        fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
+        LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
        return;
    }

    size_t hs_task_count = prompt_lines.size()/6;
-    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
+    LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);

    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-    fprintf(stderr, "================================= is_spm = %d\n", is_spm);
+    LOG_INF("================================= is_spm = %d\n", is_spm);

    // The tasks should be randomized so the score stabilizes quickly.
    bool randomize_tasks = true;
@ -825,7 +827,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        std::vector<llama_token> seq_tokens[4];
    };

-    fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );
+    LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );

    // Select and read data from prompt lines
    std::vector<hs_data_t> hs_data(hs_task_count);
@ -871,9 +873,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }
    }

-    fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
+    LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);

-    printf("\ntask\tacc_norm\n");
+    LOG("\ntask\tacc_norm\n");

    double acc = 0.0f;

@ -941,7 +943,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }

        if (i0 == i1) {
-            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }

@ -949,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
            return;
        }

@ -999,7 +1001,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
                }
            }

-            //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
+            //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);

            // If the gold ending got the maximum logprobe add one accuracy point
            if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
@ -1007,8 +1009,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            }

            // Print the accumulated accuracy mean x 100
-            printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
-            fflush(stdout);
+            LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
        }

        i0 = i1 - 1;
@ -1016,7 +1017,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

    llama_batch_free(batch);

-    printf("\n");
+    LOG("\n");
 }

 struct winogrande_entry {
@ -1060,7 +1061,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
            }
        }
        if (ipos != 4) {
-            printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
+            LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
            continue;
        }
        auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
@ -1074,13 +1075,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
            if (sentence[where] == '_') break;
        }
        if (where == int(sentence.size())) {
-            printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
+            LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
            continue;
        }
        std::istringstream stream(answer.c_str());
        int i_answer; stream >> i_answer;
        if (stream.fail() || i_answer < 1 || i_answer > 2) {
-            printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
+            LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
            continue;
        }
        result.emplace_back();
@ -1109,14 +1110,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

    auto data = load_winogrande_from_csv(params.prompt);
    if (data.empty()) {
-        fprintf(stderr, "%s: no tasks\n", __func__);
+        LOG_ERR("%s: no tasks\n", __func__);
        return;
    }

-    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
+    LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());

    if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
-        fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
+        LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
        std::mt19937 rng(1);
        std::vector<int> aux(data.size());
        for (int i = 0; i < int(data.size()); ++i) {
@ -1134,7 +1135,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        data = std::move(selected);
    }

-    fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
+    LOG_INF("%s : tokenizing selected tasks\n", __func__);

    for (auto & task : data) {
        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
@ -1157,7 +1158,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
    }

-    fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
+    LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_ctx   = llama_n_ctx(ctx);
@ -1218,7 +1219,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        }

        if (i0 == i1) {
-            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }

@ -1226,7 +1227,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
            return;
        }

@ -1286,20 +1287,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
            ++n_done;

            // print the accumulated accuracy mean x 100
-            printf("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
-            fflush(stdout);
+            LOG("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
        }

        i0 = i1 - 1;
    }

-    printf("\n");
+    LOG("\n");

    if (n_done < 100) return;

    const float p = 1.f*n_correct/n_done;
    const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
-    printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
+
+    LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
 }

 static bool deserialize_string(std::istream & in, std::string & str) {
@ -1348,7 +1349,7 @@ struct multiple_choice_task {
 static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
    if (task.question.empty() || task.mc1.answers.empty()) {
        if (log_error) {
-            printf("%s: found bad task with empty question and/or answers\n", __func__);
+            LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
        }
        return false;
    }
@ -1356,7 +1357,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
    for (auto& answer : task.mc1.answers) {
        if (answer.empty()) {
            if (log_error) {
-                printf("%s: found empty answer\n", __func__);
+                LOG_ERR("%s: found empty answer\n", __func__);
            }
            return false;
        }
@ -1410,14 +1411,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    uint32_t n_task;
    strstream.read((char *)&n_task, sizeof(n_task));
    if (strstream.fail() || n_task == 0) {
-        printf("%s: no tasks\n", __func__);
+        LOG_ERR("%s: no tasks\n", __func__);
        return;
    }
-    printf("%s: there are %u tasks in prompt\n", __func__, n_task);
+    LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
    std::vector<uint32_t> task_pos(n_task);
    strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
    if (strstream.fail()) {
-        printf("%s: failed to read task positions from prompt\n", __func__);
+        LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
        return;
    }

@ -1425,21 +1426,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
        // Use all tasks
        tasks.resize(n_task);
-        printf("%s: reading tasks", __func__);
+        LOG_INF("%s: reading tasks", __func__);
        int n_dot = std::max((int) n_task/100, 1);
        int i = 0;
        for (auto& task : tasks) {
            ++i;
            if (!task.deserialize(strstream)) {
-                printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
+                LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
                return;
            }
-            if (i%n_dot == 0) printf(".");
+            if (i%n_dot == 0) LOG(".");
        }
-        printf("done\n");
+        LOG("done\n");
    }
    else {
-        printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
+        LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
        std::mt19937 rng(1);
        std::vector<int> aux(n_task);
        for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
@ -1452,18 +1453,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            aux.pop_back();
            strstream.seekg(task_pos[idx], std::ios::beg);
            if (!task.deserialize(strstream)) {
-                printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
+                LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
                return;
            }
        }
        n_task = params.multiple_choice_tasks;
    }

-    printf("%s: preparing task data", __func__);
-    fflush(stdout);
+    LOG_INF("%s: preparing task data", __func__);
    if (n_task > 500) {
-        printf("...");
-        fflush(stdout);
+        LOG("...");
        std::atomic<int> counter(0);
        std::atomic<int> n_bad(0);
        auto prepare = [&counter, &n_bad, &tasks, ctx] () {
@ -1487,11 +1486,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        for (auto& w : workers) w = std::thread(prepare);
        prepare();
        for (auto& w : workers) w.join();
-        printf("done\n");
-        fflush(stdout);
+        LOG("done\n");
        int nbad = n_bad;
        if (nbad > 0) {
-            printf("%s: found %d malformed tasks\n", __func__, nbad);
+            LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
            return;
        }
    } else {
@ -1503,16 +1501,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                return;
            }
            if (i_task%n_dot == 0) {
-                printf(".");
-                fflush(stdout);
+                LOG(".");
            }
        }
-        printf("done\n");
+        LOG("done\n");
    }

-    printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
+    LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());

-    printf("\ntask\tacc_norm\n");
+    LOG("\ntask\tacc_norm\n");

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_ctx   = llama_n_ctx(ctx);
@ -1591,7 +1588,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        }

        if (i0 == i1) {
-            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }

@ -1599,7 +1596,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
            return;
        }

@ -1623,13 +1620,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        // compute the logprobs for each ending of the decoded tasks
        for (size_t i = i0; i < i1; ++i) {
            auto & cur_task = tasks[i];
-            //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
+            //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
            //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
            //    if (cur_task.mc1.labels[j] == 1) {
-            //        printf("%d", j+1);
+            //        LOG("%d", j+1);
            //    }
            //}
-            //printf("\n    common_prefix: %zu\n", cur_task.common_prefix);
+            //LOG("\n    common_prefix: %zu\n", cur_task.common_prefix);

            // get the logits of the last token of the common prefix
            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
@ -1641,13 +1638,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                size_t count = 1;
                float  log_prob  = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-                    //printf("        %zu  %g\n", ir, eval_results[ir]);
+                    //LOG("        %zu  %g\n", ir, eval_results[ir]);
                    ++count;
                    log_prob += eval_results[ir++];
                }
                cur_task.log_probs[s] = log_prob / count;
-                //printf("        Final: %g\n", log_prob / count);
-                //printf("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
+                //LOG("        Final: %g\n", log_prob / count);
+                //LOG("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
            }

            // Find the ending with maximum logprob
@ -1667,8 +1664,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            ++n_done;

            // Print the accumulated accuracy mean x 100
-            printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
-            fflush(stdout);
+            LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
        }

        i0 = i1 - 1;
@ -1680,29 +1676,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

    float p = 1.f*n_correct/n_done;
    float sigma = sqrt(p*(1-p)/(n_done-1));
-    printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+    LOG("\n");
+    LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
    p = 1.f*n_done/n_tot_answers;
    sigma = sqrt(p*(1-p)/(n_done-1));
-    printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+    LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);

-    printf("\n");
+    LOG_INF("\n");
 }

 static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    if (params.logits_file.empty()) {
-        fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
+        LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
        return;
    }
    std::ifstream in(params.logits_file.c_str(), std::ios::binary);
    if (!in) {
-        fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
+        LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
        return;
    }
    {
        char check[9]; check[8] = 0;
        in.read(check, 8);
        if (in.fail() || strncmp("_logits_", check, 8) != 0) {
-            fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
+            LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
            return;
        }
    }
@ -1710,7 +1707,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    uint32_t n_ctx;
    in.read((char *)&n_ctx, sizeof(n_ctx));
    if (n_ctx > llama_n_ctx(ctx)) {
-        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
+        LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
                __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
    }

@ -1718,16 +1715,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    in.read((char *)&n_vocab, sizeof(n_vocab));
    in.read((char *)&n_chunk, sizeof(n_chunk));
    if (in.fail()) {
-        fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
+        LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
        return;
    }
    if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
-        fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+        LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
    }

    std::vector<llama_token> tokens(n_ctx * n_chunk);
    if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
-        fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
+        LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
        return;
    }

@ -1776,7 +1773,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        const auto t_start = std::chrono::high_resolution_clock::now();

        if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
-            fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
+            LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
            return;
        }

@ -1797,7 +1794,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

            // TODO: use llama_batch.logits instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
+                LOG_ERR("%s : failed to eval\n", __func__);
                return;
            }

@ -1814,16 +1811,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

        if (i == 0) {
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
-
-            printf("\nchunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");
+            LOG("%.2f minutes\n", total_seconds / 60.0);
        }
+        LOG("\n");
+        LOG("chunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");

        const int first = n_ctx/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
@ -1832,79 +1829,77 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        p_diff_ptr += n_ctx - 1 - first;
        kld_ptr    += n_ctx - 1 - first;

-        printf("%4d", i+1);
+        LOG("%4d", i+1);

        auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
        const double ppl_val = exp(log_ppl.first);
        const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-        printf("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);
+        LOG("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);

        auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
        const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
        const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
        const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-        printf("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
+        LOG("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);

        auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-        printf("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
+        LOG("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);

        auto p_diff_mse   = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
        const double p_diff_rms_val = sqrt(p_diff_mse.first);
        const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-        printf("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+        LOG("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);

        double p_top_val = 1.*kld.n_same_top/kld.count;
        double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
-        printf("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
+        LOG("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);

-        printf("\n");
-
-        fflush(stdout);
+        LOG("\n");

        logits.clear();
    }
-    printf("\n");
+    LOG("\n");

    if (kld.count < 100) return; // we do not wish to do statistics on so few values

    std::sort(kld_values.begin(), kld_values.end());
    std::sort(p_diff_values.begin(), p_diff_values.end());

-    printf("====== Perplexity statistics ======\n");
+    LOG("====== Perplexity statistics ======\n");

    auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
    const double ppl_val = exp(log_ppl.first);
    const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-    printf("Mean PPL(Q)                   : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
+    LOG("Mean PPL(Q)                   : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);

    auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
    const double ppl_base_val = exp(log_ppl_base.first);
    const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
-    printf("Mean PPL(base)                : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
+    LOG("Mean PPL(base)                : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);

    const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
-    // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
+    // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
    const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
-    printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
+    LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);

    const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
    const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-    printf("Mean ln(PPL(Q)/PPL(base))     : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
+    LOG("Mean ln(PPL(Q)/PPL(base))     : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);

    const double ppl_ratio_val = exp(log_ppl_ratio_val);
    const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
-    printf("Mean PPL(Q)/PPL(base)         : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
+    LOG("Mean PPL(Q)/PPL(base)         : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);

    const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
    const double ppl_diff_val = ppl_val - ppl_base_val;
    const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
-    printf("Mean PPL(Q)-PPL(base)         : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
+    LOG("Mean PPL(Q)-PPL(base)         : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);

-    printf("\n");
+    LOG("\n");

-    printf("====== KL divergence statistics ======\n");
+    LOG("====== KL divergence statistics ======\n");
    auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-    printf("Mean    KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
+    LOG("Mean    KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
    auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
                                               : kld_values[kld_values.size()/2];

@ -1916,50 +1911,49 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
    };

-    printf("Maximum KLD: %10.6f\n", kld_values.back());
-    printf("99.9%%   KLD: %10.6f\n", percentile(kld_values, 0.999f));
-    printf("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
-    printf("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
-    printf("Median  KLD: %10.6f\n", kld_median);
-    printf("10.0%%   KLD: %10.6f\n", percentile(kld_values, 0.100f));
-    printf(" 5.0%%   KLD: %10.6f\n", percentile(kld_values, 0.050f));
-    printf(" 1.0%%   KLD: %10.6f\n", percentile(kld_values, 0.010f));
-    printf("Minimum KLD: %10.6f\n", kld_values.front());
+    LOG("Maximum KLD: %10.6f\n", kld_values.back());
+    LOG("99.9%%   KLD: %10.6f\n", percentile(kld_values, 0.999f));
+    LOG("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    LOG("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    LOG("Median  KLD: %10.6f\n", kld_median);
+    LOG("10.0%%   KLD: %10.6f\n", percentile(kld_values, 0.100f));
+    LOG(" 5.0%%   KLD: %10.6f\n", percentile(kld_values, 0.050f));
+    LOG(" 1.0%%   KLD: %10.6f\n", percentile(kld_values, 0.010f));
+    LOG("Minimum KLD: %10.6f\n", kld_values.front());

-    printf("\n");
+    LOG("\n");

-    printf("====== Token probability statistics ======\n");
+    LOG("====== Token probability statistics ======\n");

    auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
-    printf("Mean    Δp: %6.3lf ± %5.3lf %%\n",  100.0*p_diff.first, 100.0*p_diff.second);
+    LOG("Mean    Δp: %6.3lf ± %5.3lf %%\n",  100.0*p_diff.first, 100.0*p_diff.second);

    auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
                                               : p_diff_values[p_diff_values.size()/2];

-    printf("Maximum Δp: %6.3lf%%\n",  100.0*p_diff_values.back());
-    printf("99.9%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
-    printf("99.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
-    printf("95.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
-    printf("90.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
-    printf("75.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
-    printf("Median  Δp: %6.3lf%%\n",  100.0*p_diff_median);
-    printf("25.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
-    printf("10.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
-    printf(" 5.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
-    printf(" 1.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
-    printf(" 0.1%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
-    printf("Minimum Δp: %6.3lf%%\n",  100.0*p_diff_values.front());
+    LOG("Maximum Δp: %6.3lf%%\n",  100.0*p_diff_values.back());
+    LOG("99.9%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
+    LOG("99.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
+    LOG("95.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
+    LOG("90.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
+    LOG("75.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
+    LOG("Median  Δp: %6.3lf%%\n",  100.0*p_diff_median);
+    LOG("25.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
+    LOG("10.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
+    LOG(" 5.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
+    LOG(" 1.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
+    LOG(" 0.1%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
+    LOG("Minimum Δp: %6.3lf%%\n",  100.0*p_diff_values.front());

    auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
-    // printf("MSE Δp    : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
+    // LOG("MSE Δp    : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);

    const double p_diff_rms_val = sqrt(p_diff_mse.first);
    const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-    printf("RMS Δp    : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+    LOG("RMS Δp    : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);

    const double same_top_p = 1.0*kld.n_same_top/kld.count;
-    printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
-
+    LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
 }

 int main(int argc, char ** argv) {
@ -1972,10 +1966,12 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    gpt_init();
+
    const int32_t n_ctx = params.n_ctx;

    if (n_ctx <= 0) {
-        fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
+        LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
        return 1;
    }

@ -2000,13 +1996,11 @@ int main(int argc, char ** argv) {
    }

    if (params.ppl_stride > 0) {
-        fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
+        LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
                params.n_ctx, params.n_ctx + params.ppl_stride/2);
        params.n_ctx += params.ppl_stride/2;
    }

-    print_build_info();
-
    llama_backend_init();
    llama_numa_init(params.numa);

@ -2016,21 +2010,21 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);

    if (params.n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, params.n_ctx);
    }

    // print system information
    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
    }

    struct results_perplexity results;
@ -2046,8 +2040,9 @@ int main(int argc, char ** argv) {
        results = perplexity(ctx, params, n_ctx);
    }

-    LOG_TEE("\n");
+    LOG("\n");
    llama_perf_context_print(ctx);
+
    write_logfile(ctx, params, model, results);

    llama_free(ctx);