Add verbose flag to control console output about model information on load.
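A minimal usage sketch (not part of this commit) of the new field through the llama.h API used in this revision; the model path is a placeholder.

// Sketch only: silence model-info output at load time via the new context parameter.
// Assumes the llama.h API of this revision; "models/7B/ggml-model.bin" is a placeholder path.
#include <cstdio>
#include "llama.h"

int main() {
    llama_context_params lparams = llama_context_default_params();
    lparams.verbose = false;   // added by this commit: suppress model info on stderr during load

    llama_model * model = llama_load_model_from_file("models/7B/ggml-model.bin", lparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");   // error messages are unaffected by the flag
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, lparams);
    // ... run inference ...
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}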
parent 447ccbe8c3
commit 75031d5c23

4 changed files with 51 additions and 31 deletions
@@ -395,6 +395,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.input_suffix = argv[i];
+        } else if (arg == "--no-verbose") {
+            params.verbose = false;
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, default_params);
@@ -501,6 +503,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stderr, "  --no-verbose          do not print model info on startup\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
     fprintf(stderr, "\n");
@@ -551,6 +554,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     lparams.use_mlock  = params.use_mlock;
     lparams.logits_all = params.perplexity;
     lparams.embedding  = params.embedding;
+    lparams.verbose    = params.verbose;

     llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
     if (model == NULL) {
@@ -78,6 +78,7 @@ struct gpt_params {
     bool mem_test       = false; // compute maximum memory usage
     bool export_cgraph  = false; // export the computation graph
     bool verbose_prompt = false; // print prompt tokens before generation
+    bool verbose        = true;  // print model info on load
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
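For orientation, a short sketch of how an example program could consume the flag after this change; gpt_params_parse, llama_init_from_gpt_params, and params.verbose come from the hunks above, while the surrounding main() and the "common.h" include path are assumptions, not code from this commit.

// Hypothetical caller of the new flag -- a sketch, not part of the diff.
#include <cstdio>
#include <tuple>
#include "common.h"   // gpt_params, gpt_params_parse, llama_init_from_gpt_params (header path assumed)
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;                        // params.verbose defaults to true (see hunk above)
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;                             // e.g. invoked as: ./main -m model.bin --no-verbose
    }

    llama_model *   model;
    llama_context * ctx;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);   // forwards params.verbose to lparams.verbose
    if (model == NULL) {
        return 1;                             // load errors are still printed even with --no-verbose
    }

    // ... generation would go here ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}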

llama.cpp (58)

@@ -472,9 +472,11 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;

-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map, bool verbose)
         : file(fname, "rb") {
+        if (verbose) {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
+        }
         read_magic();
         read_hparams();
         read_vocab();
@@ -662,13 +664,13 @@ struct llama_model_loader {
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;

-    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only, bool verbose) {
+        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map, verbose);
         file_loaders.emplace_back(first_file);
         uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
         for (uint32_t i = 1; i < n_parts; i++) {
             std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map, verbose);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
                 throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
@@ -949,6 +951,7 @@ struct llama_context_params llama_context_default_params() {
         /*.use_mmap  =*/ true,
         /*.use_mlock =*/ false,
         /*.embedding =*/ false,
+        /*.verbose   =*/ true,
     };

     return result;
@@ -1055,11 +1058,12 @@ static void llama_model_load_internal(
         bool use_mlock,
         bool vocab_only,
         llama_progress_callback progress_callback,
-        void * progress_callback_user_data) {
+        void * progress_callback_user_data,
+        bool verbose) {

     model.t_start_us = ggml_time_us();

-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only, verbose));

     vocab = std::move(ml->file_loaders.at(0)->vocab);
     model.hparams = ml->file_loaders.at(0)->hparams;
@@ -1087,7 +1091,7 @@ static void llama_model_load_internal(

     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

-    {
+    if (verbose) {
         fprintf(stderr, "%s: format  = %s\n", __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
         fprintf(stderr, "%s: n_ctx   = %u\n", __func__, hparams.n_ctx);
@@ -1127,7 +1131,9 @@ static void llama_model_load_internal(
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
+    if (verbose) {
     fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    }

     // create the ggml context
     {
@@ -1151,12 +1157,16 @@ static void llama_model_load_internal(

     (void) main_gpu;
 #if defined(GGML_USE_CUBLAS)
+    if (verbose) {
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+    }
     ggml_cuda_set_main_device(main_gpu);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
+    if (verbose) {
     fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+    }
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
@@ -1256,20 +1266,20 @@ static void llama_model_load_internal(
         const size_t mem_required_state =
             scale*MEM_REQ_KV_SELF().at(model.type);

-        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+        if (verbose) fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

         (void) vram_scratch;
         (void) n_batch;
 #ifdef GGML_USE_CUBLAS
         if (low_vram) {
-            fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+            if (verbose) fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
             ggml_cuda_set_scratch_size(0); // disable scratch
         } else {
             vram_scratch = n_batch * MB;
             ggml_cuda_set_scratch_size(vram_scratch);
             if (n_gpu_layers > 0) {
-                fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
+                if (verbose) fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
                         __func__, vram_scratch / MB);
             }
         }
@@ -1277,32 +1287,34 @@ static void llama_model_load_internal(
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

-    fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
-    if (n_gpu_layers > (int) hparams.n_layer) {
+    if (verbose) fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+    if (verbose && n_gpu_layers > (int) hparams.n_layer) {
         fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
     }
     size_t vram_kv_cache = 0;
     if (n_gpu_layers > (int) hparams.n_layer + 1) {
         if (low_vram) {
-            fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+            if (verbose) fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
         } else {
-            fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+            if (verbose) fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
         }
     }
     if (n_gpu_layers > (int) hparams.n_layer + 2) {
         if (low_vram) {
-            fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+            if (verbose) fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
         } else {
-            fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+            if (verbose) fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
         }
     }
+    if (verbose) {
     const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
     fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
             __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
     fprintf(stderr, "%s: total VRAM used: %zu MB\n",
             __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
+    }
 #else
     (void) n_gpu_layers;
 #endif
@@ -1348,10 +1360,11 @@ static bool llama_model_load(
         bool use_mlock,
         bool vocab_only,
         llama_progress_callback progress_callback,
-        void *progress_callback_user_data) {
+        void *progress_callback_user_data,
+        bool verbose) {
     try {
         llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
-                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
+                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data, verbose);
         return true;
     } catch (const std::exception & err) {
         fprintf(stderr, "error loading model: %s\n", err.what());
@@ -2444,7 +2457,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }

     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
-                                                                            /*vocab_only*/ false));
+                                                                            /*vocab_only*/ false,
+                                                                            /*verbose*/ true));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);

 #ifdef GGML_USE_K_QUANTS
@@ -2656,7 +2670,7 @@ struct llama_model * llama_load_model_from_file(

     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data, params.verbose)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
@@ -2713,7 +2727,7 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }

-    {
+    if (params.verbose) {
         const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
         fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
@@ -2872,7 +2886,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false, /*verbose*/ true));

         size_t ctx_size;
         size_t mmapped_size;

llama.h (1)

@@ -100,6 +100,7 @@ extern "C" {
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
+        bool verbose;    // show information on stderr on model load. This doesn't affect error messages.
     };
     // model file types
     enum llama_ftype {