train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N'

xaedes 2023-09-17 17:08:36 +02:00
parent dd94ce4ec0
commit 9e10fa977e
2 changed files with 90 additions and 51 deletions
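
The change replaces the fixed, user-sized model context (--mem-model N) with a no_alloc ggml context for the tensor metadata plus a self-owned data buffer whose size is computed by ggml-alloc's measuring allocator. A minimal standalone sketch of that measure-then-allocate pattern follows (not code from this commit; the tensor shapes and the mem_size headroom are illustrative assumptions, while the ggml-alloc calls are the same ones used in the diffs below):

// sketch: two-pass ggml-alloc allocation (illustrative, not part of the commit)
#include "ggml.h"
#include "ggml-alloc.h"
#include <vector>

static const size_t tensor_alignment = 32;

int main() {
    // 1) no_alloc context: holds tensor metadata only, no tensor data
    struct ggml_init_params cparams;
    cparams.mem_size   = ggml_tensor_overhead()*4; // headroom for a few tensor structs (assumed)
    cparams.mem_buffer = NULL;
    cparams.no_alloc   = true;
    struct ggml_context * ctx = ggml_init(cparams);

    struct ggml_tensor * tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 32000); // example shape
    struct ggml_tensor * norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);        // example shape

    // 2) measuring pass: a measuring allocator records the required buffer size
    struct ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
    ggml_allocr_alloc(alloc, tok_embeddings);
    ggml_allocr_alloc(alloc, norm);
    size_t size = ggml_allocr_max_size(alloc) + tensor_alignment;
    ggml_allocr_free(alloc);

    // 3) real pass: allocate exactly that much memory and place the tensors in it
    std::vector<uint8_t> data(size);
    alloc = ggml_allocr_new(data.data(), data.size(), tensor_alignment);
    ggml_allocr_alloc(alloc, tok_embeddings);
    ggml_allocr_alloc(alloc, norm);
    ggml_allocr_free(alloc);

    ggml_free(ctx);
    return 0;
}

In the second file below, init_model() applies this same sequence to every model weight and its gradient through the new alloc_model() helper.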

File 1 of 2:

@@ -1658,8 +1658,8 @@ int main(int argc, char ** argv) {
     printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
     printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
     printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: max_lora_size = %zu bytes (%.1f MB)\n", __func__, lora.data.size(), (float) lora.data.size() / (1024.0f*1024.0f));
-    printf("%s: max_opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
+    printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f));
+    printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
     opt->iter = train->train_its;
     if (params.only_write_lora) {
@@ -1686,7 +1686,7 @@ int main(int argc, char ** argv) {
     printf("%s: opt iter %d\n", __func__, opt->iter);
-    printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx));
+    printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx) + lora.data.size());
     std::vector<uint8_t> mem_input_data;
     std::vector<uint8_t> mem_compute_data;
@@ -1709,7 +1709,7 @@ int main(int argc, char ** argv) {
     ggml_allocr_alloc(alloc, target_probs);
     size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
     ggml_allocr_free(alloc);
-    printf("%s: max_input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
+    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
     // allocate input tensors
     mem_input_data.resize(max_input_size);
@@ -1769,7 +1769,7 @@ int main(int argc, char ** argv) {
         ggml_free(ctx_compute);
     }
     size_t max_compute_size = best_compute_size;
-    printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
+    printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
     printf("%s: evaluation order = %s\n", __func__,
         (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
         (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
@@ -1887,7 +1887,7 @@ int main(int argc, char ** argv) {
     // measure required memory for work buffer
     size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
-    printf("%s: max_work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
+    printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
     // context for work buffer
     struct ggml_init_params ctx_work_params = {

File 2 of 2:

@@ -19,6 +19,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
+static const size_t tensor_alignment = 32;
 struct my_llama_hparams {
     uint32_t n_vocab = 32000;
     uint32_t n_ctx = 512;
@@ -56,6 +58,7 @@ struct my_llama_layer {
 struct my_llama_model {
     struct ggml_context * ctx = NULL;
+    std::vector<uint8_t> data;
     my_llama_hparams hparams;
@@ -118,6 +121,65 @@ static void print_params(struct my_llama_hparams * params) {
     printf("%s: n_rot: %d\n", __func__, params->n_rot);
 }
+static void set_param_model(struct my_llama_model * model) {
+    const auto& hparams = model->hparams;
+    const uint32_t n_layer = hparams.n_layer;
+    struct ggml_context* ctx = model->ctx;
+    ggml_set_param(ctx, model->tok_embeddings);
+    ggml_set_param(ctx, model->norm);
+    ggml_set_param(ctx, model->output);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+        ggml_set_param(ctx, layer.attention_norm);
+        ggml_set_param(ctx, layer.wq);
+        ggml_set_param(ctx, layer.wk);
+        ggml_set_param(ctx, layer.wv);
+        ggml_set_param(ctx, layer.wo);
+        ggml_set_param(ctx, layer.ffn_norm);
+        ggml_set_param(ctx, layer.w1);
+        ggml_set_param(ctx, layer.w2);
+        ggml_set_param(ctx, layer.w3);
+    }
+}
+static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * model) {
+    ggml_allocr_alloc(alloc, model->tok_embeddings);
+    ggml_allocr_alloc(alloc, model->norm);
+    ggml_allocr_alloc(alloc, model->output);
+    for (uint32_t i = 0; i < model->layers.size(); ++i) {
+        auto & layer = model->layers[i];
+        ggml_allocr_alloc(alloc, layer.attention_norm);
+        ggml_allocr_alloc(alloc, layer.wq);
+        ggml_allocr_alloc(alloc, layer.wk);
+        ggml_allocr_alloc(alloc, layer.wv);
+        ggml_allocr_alloc(alloc, layer.wo);
+        ggml_allocr_alloc(alloc, layer.ffn_norm);
+        ggml_allocr_alloc(alloc, layer.w1);
+        ggml_allocr_alloc(alloc, layer.w2);
+        ggml_allocr_alloc(alloc, layer.w3);
+    }
+    ggml_allocr_alloc(alloc, model->tok_embeddings->grad);
+    ggml_allocr_alloc(alloc, model->norm->grad);
+    ggml_allocr_alloc(alloc, model->output->grad);
+    for (uint32_t i = 0; i < model->layers.size(); ++i) {
+        auto & layer = model->layers[i];
+        ggml_allocr_alloc(alloc, layer.attention_norm->grad);
+        ggml_allocr_alloc(alloc, layer.wq->grad);
+        ggml_allocr_alloc(alloc, layer.wk->grad);
+        ggml_allocr_alloc(alloc, layer.wv->grad);
+        ggml_allocr_alloc(alloc, layer.wo->grad);
+        ggml_allocr_alloc(alloc, layer.ffn_norm->grad);
+        ggml_allocr_alloc(alloc, layer.w1->grad);
+        ggml_allocr_alloc(alloc, layer.w2->grad);
+        ggml_allocr_alloc(alloc, layer.w3->grad);
+    }
+}
 static void init_model(struct my_llama_model * model) {
     const auto & hparams = model->hparams;
@@ -126,7 +188,6 @@ static void init_model(struct my_llama_model * model) {
     const uint32_t n_vocab = hparams.n_vocab;
     const uint32_t n_ff = hparams.n_ff;
-    struct ggml_context * ctx = model->ctx;
     std::vector<char> tn_buf;
     tn_buf.resize(GGML_MAX_NAME);
@@ -141,6 +202,15 @@ static void init_model(struct my_llama_model * model) {
         return tn_buf.data();
     };
+    // context for model tensors without their data
+    struct ggml_init_params ctx_model_params;
+    ctx_model_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18);
+    ctx_model_params.mem_buffer = NULL;
+    ctx_model_params.no_alloc = true;
+    struct ggml_context * ctx = ggml_init(ctx_model_params);
+    model->ctx = ctx;
     model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
     model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
     model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
@@ -179,32 +249,20 @@ static void init_model(struct my_llama_model * model) {
         ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i));
         ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i));
     }
-}
-
-static void set_param_model(struct my_llama_model * model) {
-    const auto& hparams = model->hparams;
+    set_param_model(model);
-    const uint32_t n_layer = hparams.n_layer;
+    // measure data size
+    struct ggml_allocr * alloc = NULL;
+    alloc = ggml_allocr_new_measure(tensor_alignment);
+    alloc_model(alloc, model);
-    struct ggml_context* ctx = model->ctx;
-    ggml_set_param(ctx, model->tok_embeddings);
-    ggml_set_param(ctx, model->norm);
-    ggml_set_param(ctx, model->output);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-        ggml_set_param(ctx, layer.attention_norm);
-        ggml_set_param(ctx, layer.wq);
-        ggml_set_param(ctx, layer.wk);
-        ggml_set_param(ctx, layer.wv);
-        ggml_set_param(ctx, layer.wo);
-        ggml_set_param(ctx, layer.ffn_norm);
-        ggml_set_param(ctx, layer.w1);
-        ggml_set_param(ctx, layer.w2);
-        ggml_set_param(ctx, layer.w3);
-    }
+    // allocate data
+    model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
+    ggml_allocr_free(alloc);
+    alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
+    alloc_model(alloc, model);
+    ggml_allocr_free(alloc);
 }
 static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
@@ -720,7 +778,6 @@ struct train_params {
     bool use_alloc;
-    int mem_model_gb;
     int mem_compute_gb;
     int mem_compute0_gb;
 };
@@ -747,7 +804,6 @@ struct train_params get_default_train_params() {
     params.use_alloc = true;
-    params.mem_model_gb = 2;
     params.mem_compute_gb = 24;
     params.mem_compute0_gb = 8;
     return params;
@@ -772,7 +828,6 @@ static void train_print_usage(int argc, char ** argv, const struct train_params
     fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval);
     fprintf(stderr, " --no-alloc Don't use allocator\n");
     fprintf(stderr, " --use-alloc Use allocator (default)\n");
-    fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb);
     fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb);
     fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb);
@@ -868,12 +923,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * params) {
             params->use_alloc = false;
         } else if (arg == "--use-alloc") {
             params->use_alloc = true;
-        } else if (arg == "--mem-model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->mem_model_gb = std::stoi(argv[i]);
         } else if (arg == "--mem-compute") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -960,13 +1009,6 @@ int main(int argc, char ** argv) {
     print_params(&model.hparams);
-    struct ggml_init_params lcparams;
-    lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
-    lcparams.mem_buffer = NULL;
-    lcparams.no_alloc = false;
-    model.ctx = ggml_init(lcparams);
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;
     int n_batch = params.common.n_batch;
@@ -992,7 +1034,6 @@ int main(int argc, char ** argv) {
     opt_params_adam.adam.gclip = params.common.adam_gclip;
     opt_params_adam.adam.eps_f = params.common.adam_eps_f;
-    opt->ctx = model.ctx;
     opt->params = opt_params_adam;
     printf("%s: init model\n", __func__);
@@ -1000,7 +1041,6 @@ int main(int argc, char ** argv) {
     if (!existed) {
         init_model(&model);
     }
-    set_param_model(&model);
     opt->params = opt_params_adam;
@@ -1012,8 +1052,7 @@ int main(int argc, char ** argv) {
         randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
     }
-    printf("used_mem model: %zu bytes\n", ggml_used_mem(model.ctx));
-    // ggml_print_tensor_objects(model.ctx);
+    printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f));
     // TODO: use std::vector<uint8_t> intead of "new"
     size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
@@ -1024,7 +1063,6 @@ int main(int argc, char ** argv) {
     ggml_allocr * alloc = NULL;
     if (params.use_alloc) {
-        static const size_t tensor_alignment = 32;
         alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment);
     }
@@ -1206,6 +1244,7 @@ int main(int argc, char ** argv) {
     delete[] compute_addr;
     delete[] compute_buf_0;
+    ggml_free(opt->ctx);
     free_train_state(train);
     ggml_free(model.ctx);
     llama_free(lctx);