llama : add new llama_perf API
ggml-ci
parent 5ab52c1f64
commit 757a9bf868
25 changed files with 149 additions and 101 deletions
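In short, the commit removes the old llama_timings struct together with llama_get_timings, llama_print_timings, llama_reset_timings and llama_dump_timing_info_yaml, and replaces them with llama_perf_print, llama_perf_reset and llama_perf_dump_yaml selected by a llama_perf_type tag; the sampler chain gains a no_perf parameter and common gains a gpt_perf_print helper. A minimal reporting sketch, assuming ctx is a llama_context * and smpl is a llama_sampler chain as in the examples below:

    // report and reset timings with the new per-object calls
    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); // sampling timings of the chain
    llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);       // load / prompt eval / eval timings
    llama_perf_reset(ctx,  LLAMA_PERF_TYPE_CONTEXT);       // start a fresh measurement window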
@@ -2533,7 +2533,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_reset_timings(lctx, nullptr);
+        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
     }
 
     iparams.model = model;
@@ -153,7 +153,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
-    lparams.no_timing = false;
+    lparams.no_perf = false; // TODO: control via params
 
     auto * result = new gpt_sampler {
         /* .params = */ params,
@@ -270,8 +270,15 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
     return gsmpl->prev.rat(0);
 }
 
-void gpt_print_timings(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
-    llama_print_timings(ctx, gsmpl ? gsmpl->chain : nullptr);
+void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
+    // TODO: measure grammar performance
+
+    if (gsmpl) {
+        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    }
+    if (ctx) {
+        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    }
 }
 
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
@@ -62,6 +62,8 @@ struct gpt_sampler_params {
 // - grammar support
 // - custom sampler logic based on the parameters
 //
+// TODO: measure grammar performance
+//
 struct gpt_sampler;
 
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
@@ -75,11 +77,9 @@ void gpt_sampler_reset (struct gpt_sampler * gsmpl);
 
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
 
-//llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_token_data_array * cur_p);
-
 llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
 
-void gpt_print_timings(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
+void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
 
 // extended sampling implementation:
 //
@@ -210,7 +210,8 @@ int main(int argc, char ** argv) {
         }
     }
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_batch_free(batch);
 
@@ -231,7 +231,9 @@ int main(int argc, char ** argv) {
     LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     fprintf(stderr, "\n");
 
@@ -307,8 +307,10 @@ int main(int argc, char ** argv) {
         if (notArray) fprintf(stdout, "\n}\n");
     }
 
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+
     // clean up
-    llama_print_timings(ctx, nullptr);
     llama_batch_free(batch);
     llama_free(ctx);
     llama_free_model(model);
@@ -181,7 +181,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_free(ctx);
     llama_free_model(model);
@@ -171,8 +171,12 @@ int main(int argc, char * argv[]) {
 
     auto sparams = llama_sampler_chain_default_params();
 
+    sparams.no_perf = false;
+
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
     // ### Embedding/Representation ###
     // samples taken from: https://github.com/ContextualAI/gritlm#basic
     {
@@ -638,7 +638,8 @@ int main(int argc, char ** argv) {
 
     g_collector.save_imatrix();
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_free(ctx);
     llama_free_model(model);
@@ -81,7 +81,7 @@ static void write_logfile(
     yaml_dump_string_multiline(logfile, "output", output.c_str());
     yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
-    llama_dump_timing_info_yaml(logfile, ctx);
+    llama_perf_dump_yaml(logfile, ctx);
     fclose(logfile);
 }
 
@@ -93,7 +93,7 @@ static void sigint_handler(int signo) {
         } else {
             console::cleanup();
             printf("\n");
-            gpt_print_timings(*g_ctx, *g_smpl);
+            gpt_perf_print(*g_ctx, *g_smpl);
             write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
             _exit(130);
         }
@@ -634,7 +634,8 @@ int main(int argc, char ** argv) {
         fflush(stdout);
     }
 
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    gpt_perf_print(ctx, smpl);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
     llama_free(ctx);
@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
         fflush(p_err->fout);
     }
 
-    llama_print_timings(ctx, nullptr);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_free(ctx);
 
@@ -310,7 +310,7 @@ int main(int argc, char ** argv) {
         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-        llama_print_timings(ctx_llava->ctx_llama, nullptr);
+        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
@@ -327,7 +327,7 @@ int main(int argc, char ** argv) {
             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-            llama_print_timings(ctx_llava->ctx_llama, nullptr);
+            llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
             llava_image_embed_free(image_embed);
             ctx_llava->model = NULL;
             llava_free(ctx_llava);
@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
         }
     }
     printf("\n");
-    llama_print_timings(ctx_llava->ctx_llama, nullptr);
+    llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
 
     ctx_llava->model = NULL;
     llava_free(ctx_llava);
@@ -467,7 +467,8 @@ int main(int argc, char ** argv) {
     LOG_TEE("n_predict = %d\n", n_predict);
     LOG_TEE("n_accept = %d\n", n_accept);
 
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    gpt_perf_print(ctx, smpl);
 
     gpt_sampler_free(smpl);
 
@@ -238,8 +238,9 @@ int main(int argc, char ** argv){
     LOG_TEE("n_accept = %d\n", n_accept);
     LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
-    LOG_TEE("\ntarget:\n");
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\ntarget:\n\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     gpt_sampler_free(smpl);
 
@@ -93,7 +93,7 @@ static void write_logfile(
     yaml_dump_string_multiline(logfile, "output", output.c_str());
     yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
-    llama_dump_timing_info_yaml(logfile, ctx);
+    llama_perf_dump_yaml(logfile, ctx);
     fclose(logfile);
 }
 
@@ -106,7 +106,7 @@ static void sigint_handler(int signo) {
         } else {
             console::cleanup();
             printf("\n");
-            gpt_print_timings(*g_ctx, *g_smpl);
+            gpt_perf_print(*g_ctx, *g_smpl);
             write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
             _exit(130);
         }
@@ -929,7 +929,8 @@ int main(int argc, char ** argv) {
         llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
 
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    gpt_perf_print(ctx, smpl);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
     gpt_sampler_free(smpl);
@@ -414,7 +414,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("\n");
 
     // TODO: print sampling/grammar timings for all clients
-    llama_print_timings(ctx, nullptr);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_batch_free(batch);
 
@@ -259,7 +259,8 @@ int main(int argc, char ** argv) {
     LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     fprintf(stderr, "\n");
 
@@ -76,7 +76,7 @@ static void write_logfile(
     fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
     yaml_dump_vector_float(logfile, "probs", results.probs);
 
-    llama_dump_timing_info_yaml(logfile, ctx);
+    llama_perf_dump_yaml(logfile, ctx);
     fclose(logfile);
 }
 
@@ -2048,7 +2048,8 @@ int main(int argc, char ** argv) {
         results = perplexity(ctx, params, n_ctx);
     }
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
     write_logfile(ctx, params, model, results);
 
     llama_free(ctx);
@@ -293,8 +293,10 @@ int main(int argc, char ** argv) {
         }
     }
 
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+
     // clean up
-    llama_print_timings(ctx, nullptr);
     llama_batch_free(query_batch);
     llama_free(ctx);
     llama_free_model(model);
@@ -57,8 +57,12 @@ int main(int argc, char ** argv) {
 
     auto sparams = llama_sampler_chain_default_params();
 
+    sparams.no_perf = false;
+
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
@@ -153,7 +157,9 @@ int main(int argc, char ** argv) {
     LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     fprintf(stderr, "\n");
 
@@ -613,12 +613,12 @@ int main(int argc, char ** argv) {
     LOG_TEE("n_accept = %d\n", n_accept);
     LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
-    LOG_TEE("\ndraft:\n");
+    LOG_TEE("\ndraft:\n\n");
     // TODO: print sampling/grammar timings for all drafts
-    gpt_print_timings(ctx_dft, nullptr);
+    llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
 
-    LOG_TEE("\ntarget:\n");
-    gpt_print_timings(ctx_tgt, smpl);
+    LOG_TEE("\ntarget:\n\n");
+    gpt_perf_print(ctx_tgt, smpl);
 
     gpt_sampler_free(smpl);
     for (int s = 0; s < n_seq_dft; ++s) {
@@ -342,6 +342,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        //bool no_perf;   // whether to measure performance timings, TODO: implement
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -371,23 +372,9 @@ extern "C" {
     } llama_logit_bias;
 
     typedef struct llama_sampler_chain_params {
-        bool no_timing; // whether to measure performance timings
+        bool no_perf; // whether to measure performance timings
     } llama_sampler_chain_params;
 
-    // performance timing information
-    struct llama_timings {
-        double t_start_ms;
-        double t_end_ms;
-        double t_load_ms;
-        double t_sampler_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_sampler;
-        int32_t n_p_eval;
-        int32_t n_eval;
-    };
-
     // used in chat template
     typedef struct llama_chat_message {
         const char * role;
@@ -1121,13 +1108,6 @@ extern "C" {
     // Returns the split_prefix length.
     LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
 
-    // Performance information
-    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
-
-    // note: requires llama_sampler_chain. how to prevent misuse?
-    LLAMA_API void llama_print_timings(const struct llama_context * ctx, const struct llama_sampler * chain);
-    LLAMA_API void llama_reset_timings(struct llama_context * ctx, struct llama_sampler * chain);
-
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
 
@@ -1135,7 +1115,21 @@ extern "C" {
     // If this is not called, or NULL is supplied, everything is output on stderr.
    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
-    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+    //
+    // Performance utils
+    //
+    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    //
+
+    enum llama_perf_type {
+        LLAMA_PERF_TYPE_CONTEXT       = 0,
+        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
+    };
+
+    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
+    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
+
+    LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 
 #ifdef __cplusplus
 }
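The new entry points take a type-erased pointer plus an explicit llama_perf_type, so one pair of functions serves both llama_context and sampler chains; the NOTE above keeps them an example-facing utility rather than a stable third-party API, since a non-chain sampler passed as LLAMA_PERF_TYPE_SAMPLER_CHAIN is not checked. A small sketch of the YAML path, mirroring write_logfile() in the examples (the output path here is made up for illustration):

    FILE * logfile = fopen("perf.yaml", "w"); // hypothetical output file
    if (logfile) {
        llama_perf_dump_yaml(logfile, ctx);   // same call the examples use in write_logfile()
        fclose(logfile);
    }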
@@ -511,7 +511,7 @@ static struct llama_sampler_i llama_sampler_chain_i = {
     /* .accept = */ [](struct llama_sampler * smpl, llama_token token) {
         auto * chain = (llama_sampler_chain *) smpl->ctx;
 
-        time_meas tm(chain->t_sample_us, chain->params.no_timing);
+        time_meas tm(chain->t_sample_us, chain->params.no_perf);
 
         for (auto * smpl : chain->samplers) {
             llama_sampler_accept(smpl, token);
@@ -522,7 +522,7 @@ static struct llama_sampler_i llama_sampler_chain_i = {
     /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
         auto * chain = (llama_sampler_chain *) smpl->ctx;
 
-        time_meas tm(chain->t_sample_us, chain->params.no_timing);
+        time_meas tm(chain->t_sample_us, chain->params.no_perf);
 
         for (auto * smpl : chain->samplers) {
             llama_sampler_apply(smpl, cur_p);
 
src/llama.cpp (104 changed lines)
@@ -17924,7 +17924,7 @@ struct llama_context_params llama_context_default_params() {
 
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
     struct llama_sampler_chain_params result = {
-        /*.no_timing =*/ false, // TODO: change to true and set explicitly in examples
+        /*.no_perf =*/ true,
     };
 
     return result;
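Note that the default flips here: with no_perf now true out of the box, sampler-chain timing is off unless a caller opts in, which is why the examples in this commit set the flag explicitly. A minimal opt-in, using only calls present in this diff:

    auto sparams = llama_sampler_chain_default_params();        // no_perf == true after this change
    sparams.no_perf = false;                                     // opt back in to sampling timings
    llama_sampler * smpl = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());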
@@ -20650,45 +20650,6 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
     return 0;
 }
 
-void llama_print_timings(const struct llama_context * ctx, const struct llama_sampler * chain) {
-    auto * smpl = chain ? (const struct llama_sampler_chain *) chain->ctx : nullptr;
-
-    const llama_timings timings = {
-        /*.t_start_ms   =*/ 1e-3 * ctx->t_start_us,
-        /*.t_end_ms     =*/ 1.00 * ggml_time_ms(),
-        /*.t_load_ms    =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sampler_ms =*/ 1e-3 * (smpl ? smpl->t_sample_us : 0.0),
-        /*.t_p_eval_ms  =*/ 1e-3 * ctx->t_p_eval_us,
-        /*.t_eval_ms    =*/ 1e-3 * ctx->t_eval_us,
-
-        /*.n_sampler    =*/ std::max(0, smpl ? smpl->n_sample : 0),
-        /*.n_p_eval     =*/ std::max(0, ctx->n_p_eval),
-        /*.n_eval       =*/ std::max(1, ctx->n_eval),
-    };
-
-    LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_sampler_ms, timings.n_sampler, timings.t_sampler_ms / timings.n_sampler, 1e3 / timings.t_sampler_ms * timings.n_sampler);
-    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
-}
-
-void llama_reset_timings(struct llama_context * ctx, struct llama_sampler * chain) {
-    ctx->t_start_us = ggml_time_us();
-    ctx->t_eval_us = ctx->n_eval = 0;
-    ctx->t_p_eval_us = ctx->n_p_eval = 0;
-
-    if (chain) {
-        auto * smpl = (struct llama_sampler_chain *) chain->ctx;
-
-        smpl->t_sample_us = smpl->n_sample = 0;
-    }
-}
-
 const char * llama_print_system_info(void) {
     static std::string s;
 
@@ -20717,7 +20678,68 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                const auto * p = (const struct llama_context *) ctx;
+
+                const double t_start_ms  = 1e-3 * p->t_start_us;
+                const double t_end_ms    = 1.00 * ggml_time_ms();
+                const double t_load_ms   = 1e-3 * p->t_load_us;
+                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
+                const double t_eval_ms   = 1e-3 * p->t_eval_us;
+
+                const int32_t n_p_eval = std::max(0, p->n_p_eval);
+                const int32_t n_eval   = std::max(1, p->n_eval);
+
+                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
+                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
+                LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
+                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                const auto * smpl = (const struct llama_sampler *) ctx;
+                const auto * p    = (const struct llama_sampler_chain *) smpl->ctx;
+
+                const double t_sampler_ms = 1e-3 * p->t_sample_us;
+
+                const int32_t n_sampler = std::max(0, p->n_sample);
+
+                LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
+    }
+}
+
+void llama_perf_reset(void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                auto * p = (struct llama_context *) ctx;
+
+                p->t_start_us  = ggml_time_us();
+                p->t_eval_us   = p->n_eval   = 0;
+                p->t_p_eval_us = p->n_p_eval = 0;
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                auto * smpl = (struct llama_sampler *) ctx;
+                auto * p    = (struct llama_sampler_chain *) smpl->ctx;
+
+                p->t_sample_us = p->n_sample = 0;
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
+    }
+}
+
+void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
     fprintf(stream, "\n");
     fprintf(stream, "###########\n");
     fprintf(stream, "# Timings #\n");