improve finetune time measurement

fix printf warnings on system where int64_t is (long int).
change time datatypes to double because values get big with long training times.
exclude file saving from time measurement.
converge faster to actual time per iteration by removing very small first duration before first iteration was performed.
fix bug in output of total training time, the reported value was 1000 times to small.
This commit is contained in:
xaedes 2023-09-06 18:06:24 +02:00
parent 867e7c2255
commit 8c2d7e37f9
No known key found for this signature in database
GPG key ID: 30030EDD817EA2B1

View file

@ -2283,12 +2283,12 @@ struct opt_callback_data {
struct ggml_tensor * target_probs; struct ggml_tensor * target_probs;
int first_iter; int first_iter;
int64_t last_time; int64_t last_time;
float time_per_iter; double millis_per_iter;
}; };
void print_duration(float fmillis) { void print_duration(double fmillis) {
if (fmillis < 1000.0f) { if (fmillis < 1000.0f) {
printf("%.1fms", fmillis); printf("%.1fms", (float) fmillis);
return; return;
} }
const int64_t one_sec = 1000; const int64_t one_sec = 1000;
@ -2296,16 +2296,17 @@ void print_duration(float fmillis) {
const int64_t one_hour = one_min * 60; const int64_t one_hour = one_min * 60;
const int64_t one_day = one_hour * 24; const int64_t one_day = one_hour * 24;
int64_t millis = fmillis; int64_t millis = (int64_t) fmillis;
int64_t days = millis/one_day; int64_t days = millis/one_day;
int64_t hours = (millis - days*one_day)/one_hour; int64_t hours = (millis - days*one_day)/one_hour;
int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min; int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min;
int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec; int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec;
// to print int64_t either cast to (long long int) or use macro PRId64 from <inttypes.h>
if (days > 0) { if (days > 0) {
printf("%lldd ", days); printf("%lldd ", (long long int) days);
} }
printf("%02lld:%02lld:%02lld", hours, minutes, seconds); printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds);
} }
void opt_callback(void * vdata, int accum_step, float * sched) { void opt_callback(void * vdata, int accum_step, float * sched) {
@ -2316,25 +2317,27 @@ void opt_callback(void * vdata, int accum_step, float * sched) {
int n_ctx = params->n_ctx; int n_ctx = params->n_ctx;
if (accum_step == 0) { if (accum_step == 0) {
// time measurement
int64_t now = ggml_time_ms(); int64_t now = ggml_time_ms();
if (now > data->last_time) { if (now > data->last_time && opt->iter > data->first_iter) {
float dt = now - data->last_time; double dt = now - data->last_time;
if (data->time_per_iter == 0) { if (data->millis_per_iter == 0.0) {
data->time_per_iter = dt; data->millis_per_iter = dt;
} else { } else {
const float gain = 0.7f; const double gain = 0.7;
data->time_per_iter = data->time_per_iter*(1.0f-gain) + dt*gain; data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain;
} }
} }
data->last_time = now;
float remaining_time = 0; double remaining_millis = 0.0;
if (data->time_per_iter > 0) { if (data->millis_per_iter > 0.0) {
const int n_iter = params->use_adam ? params->adam_n_iter : params->lbfgs_n_iter; const int n_iter = params->use_adam ? params->adam_n_iter : params->lbfgs_n_iter;
const int done_iter = opt->iter - data->first_iter; const int done_iter = opt->iter - data->first_iter;
const int remaining_iter = n_iter - done_iter; const int remaining_iter = n_iter - done_iter;
remaining_time = remaining_iter * data->time_per_iter; remaining_millis = remaining_iter * data->millis_per_iter;
} }
// file saving
const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
if (save_now) { if (save_now) {
int new_iters = opt->iter - data->last_save_iter; int new_iters = opt->iter - data->last_save_iter;
@ -2353,6 +2356,9 @@ void opt_callback(void * vdata, int accum_step, float * sched) {
data->last_save_iter = opt->iter; data->last_save_iter = opt->iter;
} }
// exclude file saving from time measurement, by measuring last_time after saving
data->last_time = ggml_time_ms();
*sched = (opt->iter < params->warmup) *sched = (opt->iter < params->warmup)
? (float) opt->iter / (float) params->warmup ? (float) opt->iter / (float) params->warmup
: cosine_decay_restart( : cosine_decay_restart(
@ -2369,11 +2375,13 @@ void opt_callback(void * vdata, int accum_step, float * sched) {
if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0; if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
printf("%s: iter=%*d sched=%f loss=%f", printf("%s: iter=%*d sched=%f loss=%f",
__func__, 6, opt->iter, *sched, opt->loss_after); __func__, 6, opt->iter, *sched, opt->loss_after);
if (data->time_per_iter > 0) {
if (data->millis_per_iter > 0) {
printf(" dt="); printf(" dt=");
print_duration(data->time_per_iter); print_duration(data->millis_per_iter);
printf(" eta="); printf(" eta=");
print_duration(remaining_time); print_duration(remaining_millis);
} }
float improvement = opt->loss_before - opt->loss_after; float improvement = opt->loss_before - opt->loss_after;
@ -2747,7 +2755,7 @@ int main(int argc, char ** argv) {
opt_cb_data.target_probs = target_probs; opt_cb_data.target_probs = target_probs;
opt_cb_data.first_iter = opt->iter; opt_cb_data.first_iter = opt->iter;
opt_cb_data.last_time = ggml_time_ms(); opt_cb_data.last_time = ggml_time_ms();
opt_cb_data.time_per_iter = 0; opt_cb_data.millis_per_iter = 0.0;
// measure required memory for work buffer // measure required memory for work buffer
size_t max_work_size = ggml_graph_plan(gb, params.n_threads).work_size + GGML_OBJECT_SIZE; size_t max_work_size = ggml_graph_plan(gb, params.n_threads).work_size + GGML_OBJECT_SIZE;
@ -2770,10 +2778,8 @@ int main(int argc, char ** argv) {
ggml_free(ctx_input); ggml_free(ctx_input);
int64_t t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
int64_t d = t1-t0; printf("%s: total training time: ", __func__);
float fd = (float) d * 1e-3; print_duration((double) (t1 - t0));
printf("%s: total training ", __func__);
print_duration(fd);
printf("\n"); printf("\n");
int new_iters = opt->iter - opt_cb_data.last_save_iter; int new_iters = opt->iter - opt_cb_data.last_save_iter;