cancel optimization when specified number of epochs is completed
parent 9145c87acc
commit da05205af6

6 changed files with 69 additions and 11 deletions
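This change threads a `bool * cancel` flag from the training callback through ggml's Adam and L-BFGS optimizers, so optimization stops once the requested number of epochs has been processed. As a rough illustration of the new callback contract (a sketch only; the struct and field names below are hypothetical — the commit's real counterpart is train_opt_callback with struct train_opt_callback_data), a cancel-aware callback looks roughly like this:

#include <stdbool.h>

// Hypothetical callback data for illustration; the commit's real counterpart is
// struct train_opt_callback_data with first_epoch / iter_at_last_epoch.
struct my_cb_data {
    int n_epochs;           // maximum epochs, <= 0 disables the limit
    int first_epoch;        // epoch counter value when optimization started
    int current_epoch;      // updated elsewhere as the sample shuffling wraps around
    int iter;               // current optimizer iteration, updated elsewhere
    int iter_at_last_epoch; // iteration observed when the last epoch began, -1 = unset
};

// Matches the updated ggml_opt_callback signature: the optimizer passes a
// cancel flag that the callback may set to stop optimization cleanly.
static void my_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) {
    (void) accum_step;
    struct my_cb_data * data = (struct my_cb_data *) vdata;
    *sched = 1.0f; // constant learning-rate schedule in this sketch

    const bool last_epoch_reached =
        data->n_epochs > 0 && data->current_epoch - data->first_epoch >= data->n_epochs;
    if (last_epoch_reached) {
        // allow the optimization iteration at the last epoch to finish before canceling
        if (data->iter_at_last_epoch < 0) {
            data->iter_at_last_epoch = data->iter;
        } else if (data->iter > data->iter_at_last_epoch) {
            *cancel = true;
        }
    }
}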
@@ -1044,6 +1044,7 @@ struct train_params_common get_default_train_params_common() {
     params.n_threads = 6;
     params.n_batch = 8;
     params.n_gradient_accumulation = 1;
+    params.n_epochs = -1;

     params.custom_n_ctx = false;

@@ -1122,7 +1123,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
     fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past);
     fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta);
     fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement);
-    fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
+    fprintf(stderr, " --epochs N Maximum number epochs to process. (default %d)\n", params->n_epochs);
     fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter);
     fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha);
     fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha);
@@ -1131,6 +1132,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
     fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1);
     fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
     fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
+    fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
     fprintf(stderr, "\n");
 }

@@ -1296,6 +1298,12 @@ bool consume_common_train_arg(
             return true;
         }
         params->adam_eps_f = std::stof(argv[i]);
+    } else if (arg == "--epochs") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->n_epochs = std::stoi(argv[i]);
     } else if (arg == "--adam-iter") {
         if (++i >= argc) {
             *invalid_param = true;
@@ -1359,7 +1367,7 @@ void finish_processing_train_args(struct train_params_common * params) {
     }
 }

-void train_opt_callback(void * vdata, int accum_step, float * sched) {
+void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) {
     struct train_opt_callback_data * data = (struct train_opt_callback_data *) vdata;
     struct train_params_common * params = data->params;
     struct train_state * train = data->train;
@@ -1475,4 +1483,14 @@ void train_opt_callback(void * vdata, int accum_step, float * sched) {
             data->samples_count);
         train->shuffle_next_sample = 0;
     }
+
+    const bool last_epoch_reached = (params->n_epochs > 0 && train->train_epochs - data->first_epoch >= params->n_epochs);
+    if (last_epoch_reached) {
+        // allow optimization iteration at last epoch to be completed before canceling
+        if (data->iter_at_last_epoch < 0) {
+            data->iter_at_last_epoch = opt->iter;
+        } else if (opt->iter > data->iter_at_last_epoch) {
+            *cancel = true;
+        }
+    }
 }
@@ -43,6 +43,7 @@ struct train_params_common {
     int n_threads;
     int n_batch;
     int n_gradient_accumulation;
+    int n_epochs;

     bool custom_n_ctx;

@@ -101,6 +102,8 @@ struct train_opt_callback_data {
     struct ggml_tensor * tokens_input;
     struct ggml_tensor * target_probs;
     int first_iter;
+    int first_epoch;
+    int iter_at_last_epoch;
     int64_t last_time;
     double millis_per_iter;
 };
@@ -224,4 +227,4 @@ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * trai

 std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);

-void train_opt_callback(void * vdata, int accum_step, float * sched);
+void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
@@ -1881,6 +1881,8 @@ int main(int argc, char ** argv) {
     opt_cb_data.tokens_input = tokens_input;
     opt_cb_data.target_probs = target_probs;
     opt_cb_data.first_iter = opt->iter;
+    opt_cb_data.first_epoch = train->train_epochs;
+    opt_cb_data.iter_at_last_epoch = -1;
     opt_cb_data.last_time = ggml_time_ms();
     opt_cb_data.millis_per_iter = 0.0;

@@ -1244,6 +1244,8 @@ int main(int argc, char ** argv) {
     opt_cb_data.tokens_input = tokens_input;
     opt_cb_data.target_probs = target_probs;
     opt_cb_data.first_iter = opt->iter;
+    opt_cb_data.first_epoch = train->train_epochs;
+    opt_cb_data.iter_at_last_epoch = -1;
     opt_cb_data.last_time = ggml_time_ms();
     opt_cb_data.millis_per_iter = 0.0;

ggml.c (47 changed lines)
@@ -19268,14 +19268,17 @@ static enum ggml_opt_result ggml_opt_adam(
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;

+    bool cancel = false;
+
     // compute the function value

     float fx = 0;
     ggml_set_zero(opt->adam.g);
     for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
         if (callback) {
-            callback(callback_data, accum_step, &sched);
+            callback(callback_data, accum_step, &sched, &cancel);
+            if (cancel) {
+                break;
+            }
         }
         // ggml_graph_reset (gf);
         ggml_set_f32 (f->grad, 1.0f);
@@ -19283,6 +19286,9 @@ static enum ggml_opt_result ggml_opt_adam(
         ggml_opt_acc_grad(np, ps, g, accum_norm);
         fx += ggml_get_f32_1d(f, 0);
     }
+    if (cancel) {
+        return GGML_OPT_DID_NOT_CONVERGE;
+    }
     fx *= accum_norm;

     opt->adam.fx_prev = fx;
@@ -19308,6 +19314,9 @@ static enum ggml_opt_result ggml_opt_adam(

     // run the optimizer
     for (int t = 0; t < params.adam.n_iter; ++t) {
+        if (cancel) {
+            break;
+        }
         opt->iter = iter0 + t + 1;
         GGML_PRINT_DEBUG ("=== iter %d ===\n", t);

@@ -19363,7 +19372,10 @@ static enum ggml_opt_result ggml_opt_adam(
         ggml_set_zero(opt->adam.g);
         for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
             if (callback) {
-                callback(callback_data, accum_step, &sched);
+                callback(callback_data, accum_step, &sched, &cancel);
+                if (cancel) {
+                    break;
+                }
             }
             // ggml_graph_reset (gf);
             ggml_set_f32 (f->grad, 1.0f);
@@ -19371,6 +19383,9 @@ static enum ggml_opt_result ggml_opt_adam(
             ggml_opt_acc_grad(np, ps, g, accum_norm);
             fx += ggml_get_f32_1d(f, 0);
         }
+        if (cancel) {
+            break;
+        }
         fx *= accum_norm;

         opt->loss_after = fx;
@@ -19456,6 +19471,7 @@ static enum ggml_opt_result linesearch_backtracking(
         struct ggml_cplan * cplan,
         const int np,
         struct ggml_tensor * ps[],
+        bool * cancel,
         ggml_opt_callback callback,
         void * callback_data) {
     int count = 0;
@@ -19488,7 +19504,7 @@ static enum ggml_opt_result linesearch_backtracking(
     finit = *fx;
     dgtest = params->lbfgs.ftol*dginit;

-    while (true) {
+    while (!*cancel) {
         ggml_vec_cpy_f32(nx, x, xp);
         ggml_vec_mad_f32(nx, x, d, *step);

@@ -19502,7 +19518,10 @@ static enum ggml_opt_result linesearch_backtracking(
             if (callback) {
                 // LBFG-S does not support learning rate -> ignore learning schedule
                 float sched = 0;
-                callback(callback_data, accum_step, &sched);
+                callback(callback_data, accum_step, &sched, cancel);
+                if (*cancel) {
+                    break;
+                }
             }
             // ggml_graph_reset (gf);
             ggml_set_f32 (f->grad, 1.0f);
@@ -19510,6 +19529,9 @@ static enum ggml_opt_result linesearch_backtracking(
             ggml_opt_acc_grad(np, ps, g, accum_norm);
             *fx += ggml_get_f32_1d(f, 0);
         }
+        if (*cancel) {
+            break;
+        }
         *fx *= accum_norm;

     }
@@ -19628,6 +19650,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
     float * lm_s = opt->lbfgs.lms->data;
     float * lm_y = opt->lbfgs.lmy->data;

+    bool cancel = false;
+
     // evaluate the function value and its gradient
     {
         ggml_opt_set_params(np, ps, x);
@@ -19638,7 +19662,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
             if (callback) {
                 // LBFG-S does not support learning rate -> ignore learning schedule
                 float sched = 0;
-                callback(callback_data, accum_step, &sched);
+                callback(callback_data, accum_step, &sched, &cancel);
+                if (cancel) {
+                    break;
+                }
             }
             // ggml_graph_reset (gf);
             ggml_set_f32 (f->grad, 1.0f);
@@ -19646,6 +19673,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
             ggml_opt_acc_grad(np, ps, g, accum_norm);
             fx += ggml_get_f32_1d(f, 0);
         }
+        if (cancel) {
+            return GGML_OPT_DID_NOT_CONVERGE;
+        }
         fx *= accum_norm;

         opt->loss_before = fx;
@@ -19704,7 +19734,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         ggml_vec_cpy_f32(nx, xp, x);
         ggml_vec_cpy_f32(nx, gp, g);

-        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, callback, callback_data);
+        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
+        if (!cancel) {
+            break;
+        }

         if (ls < 0) {
             // linesearch failed - go back to the previous point and return
ggml.h (2 changed lines)
@@ -1726,7 +1726,7 @@ extern "C" {
         GGML_LINESEARCH_INVALID_PARAMETERS,
     };

-    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched);
+    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);

     // optimization parameters
     //