Add comment for llama_log_callback and replace remaining printf calls
commit 98369f62c5
parent c857a33b19
2 changed files with 27 additions and 21 deletions
llama.cpp (42 changes)
@@ -1766,7 +1766,7 @@ static struct ggml_cgraph * llama_build_graph(
 }

 #if 0
-printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+LLAMA_LOG_INFO("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
 ggml_used_mem(ctx0)/1024.0/1024.0,
 lctx.get_buf_max_mem(0)/1024.0/1024.0,
 lctx.get_buf_max_mem(1)/1024.0/1024.0,
@@ -1827,7 +1827,7 @@ static bool llama_eval_internal(
 ggml_allocr_alloc_graph(lctx.alloc, gf);
 #endif

-// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

 // for big prompts, if BLAS is enabled, it is better to use only one thread
 // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
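The two context comments above capture a scheduling detail in llama_eval_internal: when a large batch is going to be handled by BLAS, extra threads only spin-wait on the BLAS calls and hurt throughput. The thread-count selection that the comment refers to looks roughly like the sketch below — an assumption here, since the line itself is outside this hunk:

// Sketch only: fall back to a single thread when a big batch (N >= 32) will be
// computed by BLAS, so worker threads do not spin-lock waiting on the BLAS calls.
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;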
@@ -2014,7 +2014,7 @@ struct llama_tokenizer {
 left_sym.n += right_sym.n;
 right_sym.n = 0;

-//printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
+//LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);

 // remove the right sym from the chain
 left_sym.next = right_sym.next;
@@ -3022,7 +3022,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 tensor.data = read_data.addr;
 model_loader->load_data_for(tensor);

-printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
+LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
 ++idx, model_loader->tensors_map.tensors.size(),
 tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
 ggml_type_name(tensor.type));
@@ -3044,7 +3044,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 new_type = tensor.type;
 new_data = tensor.data;
 new_size = tensor.size;
-printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
+LLAMA_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
 } else {
 new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
@@ -3109,7 +3109,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 f32_data = (float *) f32_conv_buf.addr;
 }

-printf("quantizing to %s .. ", ggml_type_name(new_type));
+LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
 fflush(stdout);

 work.resize(nelements * 4); // upper bound on size
@@ -3159,7 +3159,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 }

-printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
 int64_t tot_count = 0;
 for (size_t i = 0; i < hist_cur.size(); i++) {
 hist_all[i] += hist_cur[i];
@@ -3168,18 +3168,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

 if (tot_count > 0) {
 for (size_t i = 0; i < hist_cur.size(); i++) {
-printf("%5.3f ", hist_cur[i] / float(nelements));
+LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
 }
 }
-printf("\n");
+LLAMA_LOG_INFO("\n");
 }
 total_size_org += tensor.size;
 total_size_new += new_size;
 file_saver.write_tensor(tensor, new_type, new_data, new_size);
 }

-printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
+LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);

 {
 int64_t sum_all = 0;
@@ -3188,11 +3188,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }

 if (sum_all > 0) {
-printf("%s: hist: ", __func__);
+LLAMA_LOG_INFO("%s: hist: ", __func__);
 for (size_t i = 0; i < hist_all.size(); i++) {
-printf("%5.3f ", hist_all[i] / float(sum_all));
+LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
 }
-printf("\n");
+LLAMA_LOG_INFO("\n");
 }
 }
 }
@@ -3250,10 +3250,9 @@ struct llama_context * llama_new_context_with_model(
 unsigned percentage = (unsigned) (100 * progress);
 while (percentage > *cur_percentage_p) {
 *cur_percentage_p = percentage;
-fprintf(stderr, ".");
-fflush(stderr);
+LLAMA_LOG_INFO(".");
 if (percentage >= 100) {
-fprintf(stderr, "\n");
+LLAMA_LOG_INFO("\n");
 }
 }
 };
@@ -3308,14 +3307,14 @@ struct llama_context * llama_new_context_with_model(
 // measure memory requirements for the graph
 size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

-fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);

 // debug - for comparison with scratch buffer
 //size_t prev_req =
 // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
 // MEM_REQ_SCRATCH1().at(ctx->model.type) +
 // MEM_REQ_EVAL().at(ctx->model.type);
-//fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+//LLAMA_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);

 // recreate allocator with exact memory requirements
 ggml_allocr_free(ctx->alloc);
@@ -3679,7 +3678,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

 n_tensors++;
 if (n_tensors % 4 == 0) {
-fprintf(stderr, ".");
+LLAMA_LOG_INFO(".");
 }
 }
 }
@@ -4316,5 +4315,6 @@ static void llama_log_internal(llama_log_level level, const char * format, ...)
 static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
 (void) level;
 (void) user_data;
-fprintf(stderr, "%s", text);
+fputs(text, stderr);
+fflush(stderr);
 }
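Every printf/fprintf replaced above now goes through the LLAMA_LOG_INFO macro, which forwards to llama_log_internal (seen in the hunk header) and from there to whichever llama_log_callback is registered — llama_log_callback_default unless the application overrides it. A minimal sketch of that macro layer, stated as an assumption because the definition is not part of this diff:

// Assumed expansion of the logging macro used throughout the replacements above.
// llama_log_internal formats the message and hands it to the active llama_log_callback.
#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO, __VA_ARGS__)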
llama.h (6 changes)
@@ -91,6 +91,12 @@ extern "C" {
 LLAMA_LOG_LEVEL_WARN = 3,
 LLAMA_LOG_LEVEL_INFO = 4
 };

+// Signature for logging events
+// Note that text includes the new line character at the end for most events.
+// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+// if it exists.
+// It might not exist for progress report where '.' is output repeatedly.
 typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data);

 struct llama_context_params {
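The new comment documents the callback contract: most log messages arrive with a trailing '\n', while the progress-report dots do not. Below is a minimal user-side sketch that honors that contract; it assumes the llama_log_set() registration function from the same logging API, and names such as my_log_callback are placeholders:

#include <stdio.h>
#include <string.h>
#include "llama.h"

// Hypothetical callback: prefix complete lines, pass newline-less progress dots through as-is.
static void my_log_callback(llama_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    const size_t len = strlen(text);
    if (len > 0 && text[len - 1] == '\n') {
        fprintf(stderr, "[llama] %.*s\n", (int) (len - 1), text); // strip the newline, re-add it after the prefix
    } else {
        fputs(text, stderr); // progress report: '.' emitted repeatedly without a newline
    }
    fflush(stderr);
}

// Registration (assumes llama_log_set from the logging API):
// llama_log_set(my_log_callback, NULL);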