llama-bench : keep the same model between tests when possible
commit 96f6dcdeae
parent f28e4953a8
2 changed files with 78 additions and 9 deletions
examples/llama-bench/llama-bench.cpp

@@ -367,6 +367,13 @@ struct cmd_params_instance {
         return mparams;
     }

+    bool equal_mparams(const cmd_params_instance & other) const {
+        return n_gpu_layers == other.n_gpu_layers &&
+               main_gpu == other.main_gpu &&
+               low_vram == other.low_vram &&
+               tensor_split == other.tensor_split;
+    }
+
     llama_context_params to_llama_cparams() const {
         llama_context_params cparams = llama_context_default_params();
@@ -384,13 +391,13 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
     std::vector<cmd_params_instance> instances;

     for (const auto & m : params.model)
-    for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & mg : params.main_gpu)
-    for (const auto & mmq : params.mul_mat_q)
     for (const auto & lv : params.low_vram)
     for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
     for (const auto & nt : params.n_threads) {
         cmd_params_instance instance = {
             /* .model = */ m,
@@ -413,6 +420,53 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_p
 static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
     std::vector<cmd_params_instance> instances;

+#if 1
+    // this ordering minimizes the number of times that each model needs to be reloaded
+    for (const auto & m : params.model)
+    for (const auto & nl : params.n_gpu_layers)
+    for (const auto & mg : params.main_gpu)
+    for (const auto & lv : params.low_vram)
+    for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
+    for (const auto & nt : params.n_threads) {
+        for (const auto & n_prompt : params.n_prompt) {
+            cmd_params_instance instance = {
+                /* .model = */ m,
+                /* .n_prompt = */ n_prompt,
+                /* .n_gen = */ 0,
+                /* .n_batch = */ nb,
+                /* .f32_kv = */ fk,
+                /* .n_threads = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .main_gpu = */ mg,
+                /* .mul_mat_q = */ mmq,
+                /* .low_vram = */ lv,
+                /* .tensor_split = */ ts,
+            };
+            instances.push_back(instance);
+        }
+
+        for (const auto & n_gen : params.n_gen) {
+            cmd_params_instance instance = {
+                /* .model = */ m,
+                /* .n_prompt = */ 0,
+                /* .n_gen = */ n_gen,
+                /* .n_batch = */ nb,
+                /* .f32_kv = */ fk,
+                /* .n_threads = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .main_gpu = */ mg,
+                /* .mul_mat_q = */ mmq,
+                /* .low_vram = */ lv,
+                /* .tensor_split = */ ts,
+            };
+            instances.push_back(instance);
+        }
+    }
+#else
+    // this ordering separates the prompt and generation tests
     for (const auto & n_prompt : params.n_prompt) {
         if (n_prompt == 0) {
             continue;

@@ -428,6 +482,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
         instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
     }
+#endif

     return instances;
 }
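To see why this ordering helps, take a hypothetical sweep over two n_gpu_layers values and three n_batch values for a single model (numbers chosen purely for illustration). With the previous ordering, where n_batch and f32_kv were iterated before n_gpu_layers, consecutive instances alternate between the two n_gpu_layers values, so the model-level parameters change on every step: 6 loads for 6 tests. With the new ordering, the model-level parameters (n_gpu_layers, main_gpu, low_vram, tensor_split) only vary in the outer loops, equal_mparams stays true across each inner sweep, and the same 6 tests need only 2 loads.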
@@ -967,12 +1022,22 @@ int main(int argc, char ** argv) {

     std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);

+    llama_model * lmodel = nullptr;
+    const cmd_params_instance * prev_inst = nullptr;
+
     for (const auto & inst : params_instances) {
-        // TODO: keep the model between tests when possible
-        llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
-        if (lmodel == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
-            return 1;
+        // keep the same model between tests when possible
+        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
+            if (lmodel) {
+                llama_free_model(lmodel);
+            }
+
+            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            if (lmodel == NULL) {
+                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
+                return 1;
+            }
+            prev_inst = &inst;
         }

         llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
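The loop above is the heart of the change: a model stays loaded as long as the next instance's model parameters compare equal to the previous one's. The stand-alone sketch below restates that pattern outside the diff; Params, Model, load_model and free_model are made-up stand-ins for cmd_params_instance, llama_model and the llama.cpp load/free calls, used only to show the reload-on-change invariant.

#include <cassert>
#include <vector>

// hypothetical stand-ins for cmd_params_instance / llama_model
struct Params {
    int n_gpu_layers;
    bool equal_mparams(const Params & other) const { return n_gpu_layers == other.n_gpu_layers; }
};
struct Model { Params built_with; };

Model * load_model(const Params & p) { return new Model{p}; } // stands in for llama_load_model_from_file
void    free_model(Model * m)        { delete m; }            // stands in for llama_free_model

int main() {
    // four "tests": the first two and the last two share the same model parameters
    std::vector<Params> tests = {{0}, {0}, {35}, {35}};

    Model * lmodel = nullptr;
    const Params * prev_inst = nullptr;
    int n_loads = 0;

    for (const auto & inst : tests) {
        // reload only when the model-affecting parameters changed
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
            if (lmodel) {
                free_model(lmodel);
            }
            lmodel = load_model(inst);
            n_loads++;
            prev_inst = &inst;
        }
        // ... create a context from lmodel and run the benchmark here ...
    }
    free_model(lmodel);

    assert(n_loads == 2); // grouped parameters -> the model is loaded twice, not four times
    return 0;
}

In the real tool the context-level parameters (n_batch, f32_kv, n_threads, n_prompt/n_gen) can still change on every iteration; only the model construction is skipped when nothing model-related changed.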
@@ -1009,9 +1074,10 @@ int main(int argc, char ** argv) {
         llama_print_timings(ctx);

         llama_free(ctx);
-        llama_free_model(lmodel);
     }

+    llama_free_model(lmodel);
+
     p->print_footer();

     llama_backend_free();
ggml-cuda.cu

@@ -7075,6 +7075,9 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
 }

 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
+    if (scratch_size > g_scratch_size) {
+        ggml_cuda_free_scratch();
+    }
     g_scratch_size = scratch_size;
 }
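Presumably, with the model and CUDA state now surviving across tests, a later test can request a larger scratch size than the buffer that was first allocated; the added guard frees the old buffer so it can be reallocated lazily at the new size (this rationale is inferred, not stated in the commit). A generic grow-on-demand sketch of that pattern follows, with made-up names (g_buf, set_buffer_size, get_buffer are not ggml API):

#include <cstdlib>

// hypothetical globals mirroring the CUDA scratch buffer and g_scratch_size
static void * g_buf      = nullptr;
static size_t g_buf_size = 0;

static void free_buffer() {
    std::free(g_buf);
    g_buf = nullptr;
}

// same shape as ggml_cuda_set_scratch_size after this commit:
// drop the buffer when a larger size is requested, otherwise keep it
void set_buffer_size(size_t size) {
    if (size > g_buf_size) {
        free_buffer(); // reallocated lazily at the new, larger size
    }
    g_buf_size = size;
}

// lazy allocation at first use, at whatever size was last requested
void * get_buffer() {
    if (g_buf == nullptr && g_buf_size > 0) {
        g_buf = std::malloc(g_buf_size);
    }
    return g_buf;
}

int main() {
    set_buffer_size(1024);
    void * a = get_buffer(); // allocated at 1024 bytes
    set_buffer_size(512);    // smaller request: existing buffer is kept
    set_buffer_size(4096);   // larger request: old buffer freed here
    void * b = get_buffer(); // reallocated at 4096 bytes
    (void)a; (void)b;
    free_buffer();
    return 0;
}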