diff --git a/llama.cpp b/llama.cpp
index 0de5e47b2..975ab0e4b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1061,7 +1061,7 @@ static void llama_model_load_internal(
         LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
         hparams.n_head_kv = hparams.n_head / n_gqa;
         if (model.type == e_model::MODEL_65B && n_gqa == 8) {
-            fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
+            LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d", __func__, n_gqa);
             model.type = e_model::MODEL_70B;
             hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
         }
@@ -1077,21 +1077,21 @@ static void llama_model_load_internal(
     //const uint32_t n_ff = 28672;

     {
-        fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
-        fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
-        fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
-        fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
-        fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
-        fprintf(stderr, "%s: n_head_kv  = %u\n",  __func__, hparams.n_head_kv);
-        fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
-        fprintf(stderr, "%s: n_gqa      = %u\n",  __func__, hparams.n_gqa());
-        fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
-        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
-        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
-        fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
+        LLAMA_LOG_INFO("%s: format     = %s",  __func__, llama_file_version_name(file_version));
+        LLAMA_LOG_INFO("%s: n_vocab    = %u",  __func__, hparams.n_vocab);
+        LLAMA_LOG_INFO("%s: n_ctx      = %u",  __func__, hparams.n_ctx);
+        LLAMA_LOG_INFO("%s: n_embd     = %u",  __func__, hparams.n_embd);
+        LLAMA_LOG_INFO("%s: n_mult     = %u",  __func__, hparams.n_mult);
+        LLAMA_LOG_INFO("%s: n_head     = %u",  __func__, hparams.n_head);
+        LLAMA_LOG_INFO("%s: n_head_kv  = %u",  __func__, hparams.n_head_kv);
+        LLAMA_LOG_INFO("%s: n_layer    = %u",  __func__, hparams.n_layer);
+        LLAMA_LOG_INFO("%s: n_rot      = %u",  __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+        LLAMA_LOG_INFO("%s: n_gqa      = %u",  __func__, hparams.n_gqa());
+        LLAMA_LOG_INFO("%s: n_ff       = %u",  __func__, n_ff);
+        LLAMA_LOG_INFO("%s: freq_base  = %.1f", __func__, hparams.rope_freq_base);
+        LLAMA_LOG_INFO("%s: freq_scale = %g",   __func__, hparams.rope_freq_scale);
+        LLAMA_LOG_INFO("%s: ftype      = %u (%s)", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
+        LLAMA_LOG_INFO("%s: model size = %s",  __func__, llama_model_type_name(model.type));
     }

     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -2629,7 +2629,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
                 if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.",nx,ny,QK_K);
                     convert_incompatible_tensor = true;
                 }
             }
@@ -2864,7 +2864,7 @@ struct llama_context * llama_new_context_with_model(

         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-        fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);

 #define LLAMA_METAL_CHECK_BUF(result) \
         if (!(result)) { \
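
The LLAMA_LOG_WARN / LLAMA_LOG_INFO macros themselves are not defined in these hunks. For reference, below is a minimal sketch of the kind of logging layer the conversion assumes. The names follow llama.cpp's LLAMA_LOG_* convention, but the level values, the stderr default sink, and in particular the appended newline are assumptions made for illustration, not the project's actual implementation; the appended newline is what would justify most converted calls dropping their trailing "\n".

#include <cstdarg>
#include <cstdio>

// Hypothetical log levels for the sketch (higher severity = lower value).
enum llama_log_level {
    LLAMA_LOG_LEVEL_ERROR = 2,
    LLAMA_LOG_LEVEL_WARN  = 3,
    LLAMA_LOG_LEVEL_INFO  = 4,
};

// Hypothetical sink: writes to stderr, the same destination as the
// fprintf calls being replaced.
static void llama_log_internal(llama_log_level level, const char * format, ...) {
    (void) level; // a real implementation would filter or route on the level
    va_list args;
    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);
    fputc('\n', stderr); // assumption: the macro layer appends the newline,
                         // which is why the converted calls drop their "\n"
}

#define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...)  llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)

The practical benefit of routing through such macros rather than hard-wired fprintf(stderr, ...) calls is that the sink can be replaced at runtime with a user-installed callback (in llama.cpp this is exposed as llama_log_set). Note that under the appended-newline assumption above, the one converted call that keeps its trailing "\n" (the max tensor size line) would emit an extra blank line.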