cancel optimization when specified number of epochs is completed

commit da05205af6 (parent 9145c87acc)
Author: xaedes
Date:   2023-09-22 21:00:46 +02:00
6 changed files with 69 additions and 11 deletions

@@ -1044,6 +1044,7 @@ struct train_params_common get_default_train_params_common() {
     params.n_threads = 6;
     params.n_batch = 8;
     params.n_gradient_accumulation = 1;
+    params.n_epochs = -1;
     params.custom_n_ctx = false;
@@ -1122,7 +1123,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
     fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past);
     fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta);
     fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement);
-    fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
+    fprintf(stderr, " --epochs N Maximum number epochs to process. (default %d)\n", params->n_epochs);
     fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter);
     fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha);
     fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha);
@@ -1131,6 +1132,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
     fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1);
     fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
     fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
+    fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
     fprintf(stderr, "\n");
 }
@@ -1296,6 +1298,12 @@ bool consume_common_train_arg(
             return true;
         }
         params->adam_eps_f = std::stof(argv[i]);
+    } else if (arg == "--epochs") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->n_epochs = std::stoi(argv[i]);
     } else if (arg == "--adam-iter") {
         if (++i >= argc) {
             *invalid_param = true;
@@ -1359,7 +1367,7 @@ void finish_processing_train_args(struct train_params_common * params) {
     }
 }
-void train_opt_callback(void * vdata, int accum_step, float * sched) {
+void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) {
     struct train_opt_callback_data * data = (struct train_opt_callback_data *) vdata;
     struct train_params_common * params = data->params;
     struct train_state * train = data->train;
@@ -1475,4 +1483,14 @@ void train_opt_callback(void * vdata, int accum_step, float * sched) {
             data->samples_count);
         train->shuffle_next_sample = 0;
     }
+    const bool last_epoch_reached = (params->n_epochs > 0 && train->train_epochs - data->first_epoch >= params->n_epochs);
+    if (last_epoch_reached) {
+        // allow optimization iteration at last epoch to be completed before canceling
+        if (data->iter_at_last_epoch < 0) {
+            data->iter_at_last_epoch = opt->iter;
+        } else if (opt->iter > data->iter_at_last_epoch) {
+            *cancel = true;
+        }
+    }
 }
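For orientation, the hunk above is where the new --epochs limit actually takes effect: once the configured number of epochs has been processed, train_opt_callback raises the new bool * cancel out-parameter and the optimizer stops. The standalone sketch below imitates that contract; the epoch_cancel_state struct, example_opt_callback function, and the toy driver loop are illustrative stand-ins rather than code from this commit, and the real callback additionally lets the optimization iteration that reached the last epoch finish before canceling.

    #include <stdbool.h>
    #include <stdio.h>

    // Hypothetical, simplified stand-in for the fields that train_opt_callback
    // consults for epoch-based cancellation (not part of this commit).
    struct epoch_cancel_state {
        int n_epochs;      // value of --epochs; <= 0 means "no limit"
        int first_epoch;   // epoch counter observed when training started
        int current_epoch; // epoch counter maintained by the training loop
    };

    // Callback with the new signature: setting *cancel asks the optimizer to stop.
    static void example_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) {
        (void) accum_step;
        *sched = 1.0f; // keep the full learning-rate schedule in this sketch

        const struct epoch_cancel_state * st = (const struct epoch_cancel_state *) vdata;
        const bool last_epoch_reached =
            st->n_epochs > 0 && st->current_epoch - st->first_epoch >= st->n_epochs;
        if (last_epoch_reached) {
            *cancel = true;
        }
    }

    // Toy driver standing in for the optimizer loop: it checks the flag after
    // every callback invocation, as the ggml_opt_adam changes below do.
    int main(void) {
        struct epoch_cancel_state st = { /*n_epochs*/ 2, /*first_epoch*/ 0, /*current_epoch*/ 0 };
        bool cancel = false;
        for (int iter = 0; iter < 100 && !cancel; ++iter) {
            float sched = 0.0f;
            example_opt_callback(&st, 0, &sched, &cancel);
            st.current_epoch = iter / 10; // pretend every 10 iterations complete one epoch
        }
        printf("stopped after epoch %d\n", st.current_epoch);
        return 0;
    }

With --epochs 2, for example, training stops shortly after the second epoch completes instead of running the full --adam-iter budget.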

@@ -43,6 +43,7 @@ struct train_params_common {
     int n_threads;
     int n_batch;
     int n_gradient_accumulation;
+    int n_epochs;
     bool custom_n_ctx;
@@ -101,6 +102,8 @@ struct train_opt_callback_data {
     struct ggml_tensor * tokens_input;
     struct ggml_tensor * target_probs;
     int first_iter;
+    int first_epoch;
+    int iter_at_last_epoch;
     int64_t last_time;
     double millis_per_iter;
 };
@@ -224,4 +227,4 @@ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * trai
 std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
-void train_opt_callback(void * vdata, int accum_step, float * sched);
+void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);

@@ -1881,6 +1881,8 @@ int main(int argc, char ** argv) {
     opt_cb_data.tokens_input = tokens_input;
     opt_cb_data.target_probs = target_probs;
     opt_cb_data.first_iter = opt->iter;
+    opt_cb_data.first_epoch = train->train_epochs;
+    opt_cb_data.iter_at_last_epoch = -1;
     opt_cb_data.last_time = ggml_time_ms();
     opt_cb_data.millis_per_iter = 0.0;

@@ -1244,6 +1244,8 @@ int main(int argc, char ** argv) {
     opt_cb_data.tokens_input = tokens_input;
     opt_cb_data.target_probs = target_probs;
     opt_cb_data.first_iter = opt->iter;
+    opt_cb_data.first_epoch = train->train_epochs;
+    opt_cb_data.iter_at_last_epoch = -1;
     opt_cb_data.last_time = ggml_time_ms();
     opt_cb_data.millis_per_iter = 0.0;

ggml.c

@@ -19268,14 +19268,17 @@ static enum ggml_opt_result ggml_opt_adam(
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+    bool cancel = false;
     // compute the function value
     float fx = 0;
     ggml_set_zero(opt->adam.g);
     for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
         if (callback) {
-            callback(callback_data, accum_step, &sched);
+            callback(callback_data, accum_step, &sched, &cancel);
+            if (cancel) {
+                break;
+            }
         }
         // ggml_graph_reset (gf);
         ggml_set_f32 (f->grad, 1.0f);
@@ -19283,6 +19286,9 @@ static enum ggml_opt_result ggml_opt_adam(
         ggml_opt_acc_grad(np, ps, g, accum_norm);
         fx += ggml_get_f32_1d(f, 0);
     }
+    if (cancel) {
+        return GGML_OPT_DID_NOT_CONVERGE;
+    }
     fx *= accum_norm;
     opt->adam.fx_prev = fx;
@@ -19308,6 +19314,9 @@ static enum ggml_opt_result ggml_opt_adam(
     // run the optimizer
     for (int t = 0; t < params.adam.n_iter; ++t) {
+        if (cancel) {
+            break;
+        }
         opt->iter = iter0 + t + 1;
         GGML_PRINT_DEBUG ("=== iter %d ===\n", t);
@@ -19363,7 +19372,10 @@ static enum ggml_opt_result ggml_opt_adam(
         ggml_set_zero(opt->adam.g);
         for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
             if (callback) {
-                callback(callback_data, accum_step, &sched);
+                callback(callback_data, accum_step, &sched, &cancel);
+                if (cancel) {
+                    break;
+                }
             }
             // ggml_graph_reset (gf);
             ggml_set_f32 (f->grad, 1.0f);
@@ -19371,6 +19383,9 @@ static enum ggml_opt_result ggml_opt_adam(
             ggml_opt_acc_grad(np, ps, g, accum_norm);
             fx += ggml_get_f32_1d(f, 0);
         }
+        if (cancel) {
+            break;
+        }
         fx *= accum_norm;
         opt->loss_after = fx;
@@ -19456,6 +19471,7 @@ static enum ggml_opt_result linesearch_backtracking(
         struct ggml_cplan * cplan,
         const int np,
         struct ggml_tensor * ps[],
+        bool * cancel,
         ggml_opt_callback callback,
         void * callback_data) {
     int count = 0;
@@ -19488,7 +19504,7 @@ static enum ggml_opt_result linesearch_backtracking(
     finit = *fx;
     dgtest = params->lbfgs.ftol*dginit;
-    while (true) {
+    while (!*cancel) {
         ggml_vec_cpy_f32(nx, x, xp);
         ggml_vec_mad_f32(nx, x, d, *step);
@@ -19502,7 +19518,10 @@ static enum ggml_opt_result linesearch_backtracking(
             if (callback) {
                 // LBFG-S does not support learning rate -> ignore learning schedule
                 float sched = 0;
-                callback(callback_data, accum_step, &sched);
+                callback(callback_data, accum_step, &sched, cancel);
+                if (*cancel) {
+                    break;
+                }
             }
             // ggml_graph_reset (gf);
             ggml_set_f32 (f->grad, 1.0f);
@@ -19510,6 +19529,9 @@ static enum ggml_opt_result linesearch_backtracking(
             ggml_opt_acc_grad(np, ps, g, accum_norm);
             *fx += ggml_get_f32_1d(f, 0);
         }
+        if (*cancel) {
+            break;
+        }
         *fx *= accum_norm;
     }
@@ -19628,6 +19650,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
     float * lm_s = opt->lbfgs.lms->data;
     float * lm_y = opt->lbfgs.lmy->data;
+    bool cancel = false;
     // evaluate the function value and its gradient
     {
         ggml_opt_set_params(np, ps, x);
@@ -19638,7 +19662,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
             if (callback) {
                 // LBFG-S does not support learning rate -> ignore learning schedule
                 float sched = 0;
-                callback(callback_data, accum_step, &sched);
+                callback(callback_data, accum_step, &sched, &cancel);
+                if (cancel) {
+                    break;
+                }
             }
             // ggml_graph_reset (gf);
             ggml_set_f32 (f->grad, 1.0f);
@@ -19646,6 +19673,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
            ggml_opt_acc_grad(np, ps, g, accum_norm);
            fx += ggml_get_f32_1d(f, 0);
         }
+        if (cancel) {
+            return GGML_OPT_DID_NOT_CONVERGE;
+        }
         fx *= accum_norm;
         opt->loss_before = fx;
@@ -19704,7 +19734,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         ggml_vec_cpy_f32(nx, xp, x);
         ggml_vec_cpy_f32(nx, gp, g);
-        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, callback, callback_data);
+        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
+        if (!cancel) {
+            break;
+        }
         if (ls < 0) {
             // linesearch failed - go back to the previous point and return

ggml.h

@@ -1726,7 +1726,7 @@ extern "C" {
         GGML_LINESEARCH_INVALID_PARAMETERS,
     };
-    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched);
+    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
     // optimization parameters
     //
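For downstream users of the optimizer API, the visible change is this extra bool * cancel parameter on ggml_opt_callback. A minimal sketch of adapting an existing callback follows; the function name my_opt_callback is hypothetical. A callback that never wants to stop early can simply leave the flag untouched, while setting it to true makes ggml_opt_adam and ggml_opt_lbfgs bail out as shown in the ggml.c hunks above.

    #include <stdbool.h>

    // Previous signature, for comparison:
    //   typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched);

    // Updated signature: the optimizer passes a pointer to its local cancel flag,
    // which starts out false and is re-checked after every callback invocation.
    static void my_opt_callback(void * data, int accum_step, float * sched, bool * cancel) {
        (void) data;
        (void) accum_step;
        *sched = 1.0f;   // keep the learning-rate schedule unchanged
        (void) cancel;   // leave *cancel as-is to continue; set it to true to stop early
    }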