From 9e10fa977e2471dd390477dfb702b81f22b54582 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 17 Sep 2023 17:08:36 +0200
Subject: [PATCH] train-text-from-scratch: automatically allocate model tensors,
 remove option '--mem-model N'

---
 examples/finetune/finetune.cpp                |  12 +-
 .../train-text-from-scratch.cpp               | 129 ++++++++++++------
 2 files changed, 90 insertions(+), 51 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 3f0e2be7b..c43d00dfd 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1658,8 +1658,8 @@ int main(int argc, char ** argv) {
     printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
     printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
     printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: max_lora_size = %zu bytes (%.1f MB)\n", __func__, lora.data.size(), (float) lora.data.size() / (1024.0f*1024.0f));
-    printf("%s: max_opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
+    printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f));
+    printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
     opt->iter = train->train_its;

     if (params.only_write_lora) {
@@ -1686,7 +1686,7 @@ int main(int argc, char ** argv) {

     printf("%s: opt iter %d\n", __func__, opt->iter);

-    printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx));
+    printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx) + lora.data.size());

     std::vector<uint8_t> mem_input_data;
     std::vector<uint8_t> mem_compute_data;
@@ -1709,7 +1709,7 @@ int main(int argc, char ** argv) {
     ggml_allocr_alloc(alloc, target_probs);
     size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
     ggml_allocr_free(alloc);
-    printf("%s: max_input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
+    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));

     // allocate input tensors
     mem_input_data.resize(max_input_size);
@@ -1769,7 +1769,7 @@ int main(int argc, char ** argv) {
         ggml_free(ctx_compute);
     }
     size_t max_compute_size = best_compute_size;
-    printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
+    printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
     printf("%s: evaluation order = %s\n", __func__,
         (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
         (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
@@ -1887,7 +1887,7 @@ int main(int argc, char ** argv) {

     // measure required memory for work buffer
     size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
-    printf("%s: max_work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
+    printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));

     // context for work buffer
     struct ggml_init_params ctx_work_params = {
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 0da7ec11b..83e156363 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -19,6 +19,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+static const size_t tensor_alignment = 32;
+
 struct my_llama_hparams {
     uint32_t n_vocab = 32000;
     uint32_t n_ctx = 512;
@@ -56,6 +58,7 @@ struct my_llama_layer {

 struct my_llama_model {
     struct ggml_context * ctx = NULL;
+    std::vector<uint8_t> data;

     my_llama_hparams hparams;

@@ -118,6 +121,65 @@ static void print_params(struct my_llama_hparams * params) {
     printf("%s: n_rot: %d\n", __func__, params->n_rot);
 }

+static void set_param_model(struct my_llama_model * model) {
+    const auto& hparams = model->hparams;
+
+    const uint32_t n_layer = hparams.n_layer;
+
+    struct ggml_context* ctx = model->ctx;
+
+    ggml_set_param(ctx, model->tok_embeddings);
+    ggml_set_param(ctx, model->norm);
+    ggml_set_param(ctx, model->output);
+
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        ggml_set_param(ctx, layer.attention_norm);
+        ggml_set_param(ctx, layer.wq);
+        ggml_set_param(ctx, layer.wk);
+        ggml_set_param(ctx, layer.wv);
+        ggml_set_param(ctx, layer.wo);
+        ggml_set_param(ctx, layer.ffn_norm);
+        ggml_set_param(ctx, layer.w1);
+        ggml_set_param(ctx, layer.w2);
+        ggml_set_param(ctx, layer.w3);
+    }
+}
+
+static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * model) {
+    ggml_allocr_alloc(alloc, model->tok_embeddings);
+    ggml_allocr_alloc(alloc, model->norm);
+    ggml_allocr_alloc(alloc, model->output);
+    for (uint32_t i = 0; i < model->layers.size(); ++i) {
+        auto & layer = model->layers[i];
+        ggml_allocr_alloc(alloc, layer.attention_norm);
+        ggml_allocr_alloc(alloc, layer.wq);
+        ggml_allocr_alloc(alloc, layer.wk);
+        ggml_allocr_alloc(alloc, layer.wv);
+        ggml_allocr_alloc(alloc, layer.wo);
+        ggml_allocr_alloc(alloc, layer.ffn_norm);
+        ggml_allocr_alloc(alloc, layer.w1);
+        ggml_allocr_alloc(alloc, layer.w2);
+        ggml_allocr_alloc(alloc, layer.w3);
+    }
+    ggml_allocr_alloc(alloc, model->tok_embeddings->grad);
+    ggml_allocr_alloc(alloc, model->norm->grad);
+    ggml_allocr_alloc(alloc, model->output->grad);
+    for (uint32_t i = 0; i < model->layers.size(); ++i) {
+        auto & layer = model->layers[i];
+        ggml_allocr_alloc(alloc, layer.attention_norm->grad);
+        ggml_allocr_alloc(alloc, layer.wq->grad);
+        ggml_allocr_alloc(alloc, layer.wk->grad);
+        ggml_allocr_alloc(alloc, layer.wv->grad);
+        ggml_allocr_alloc(alloc, layer.wo->grad);
+        ggml_allocr_alloc(alloc, layer.ffn_norm->grad);
+        ggml_allocr_alloc(alloc, layer.w1->grad);
+        ggml_allocr_alloc(alloc, layer.w2->grad);
+        ggml_allocr_alloc(alloc, layer.w3->grad);
+    }
+}
+
 static void init_model(struct my_llama_model * model) {
     const auto & hparams = model->hparams;

@@ -126,7 +188,6 @@ static void init_model(struct my_llama_model * model) {
     const uint32_t n_vocab = hparams.n_vocab;
     const uint32_t n_ff = hparams.n_ff;

-    struct ggml_context * ctx = model->ctx;

     std::vector<char> tn_buf;
     tn_buf.resize(GGML_MAX_NAME);
@@ -141,6 +202,15 @@ static void init_model(struct my_llama_model * model) {
         return tn_buf.data();
     };

+    // context for model tensors without their data
+    struct ggml_init_params ctx_model_params;
+    ctx_model_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18);
+    ctx_model_params.mem_buffer = NULL;
+    ctx_model_params.no_alloc = true;
+
+    struct ggml_context * ctx = ggml_init(ctx_model_params);
+    model->ctx = ctx;
+
     model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
     model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
     model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
@@ -179,32 +249,20 @@ static void init_model(struct my_llama_model * model) {
         ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i));
         ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i));
     }
-}

-static void set_param_model(struct my_llama_model * model) {
-    const auto& hparams = model->hparams;
+    set_param_model(model);

-    const uint32_t n_layer = hparams.n_layer;
+    // measure data size
+    struct ggml_allocr * alloc = NULL;
+    alloc = ggml_allocr_new_measure(tensor_alignment);
+    alloc_model(alloc, model);

-    struct ggml_context* ctx = model->ctx;
-
-    ggml_set_param(ctx, model->tok_embeddings);
-    ggml_set_param(ctx, model->norm);
-    ggml_set_param(ctx, model->output);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        ggml_set_param(ctx, layer.attention_norm);
-        ggml_set_param(ctx, layer.wq);
-        ggml_set_param(ctx, layer.wk);
-        ggml_set_param(ctx, layer.wv);
-        ggml_set_param(ctx, layer.wo);
-        ggml_set_param(ctx, layer.ffn_norm);
-        ggml_set_param(ctx, layer.w1);
-        ggml_set_param(ctx, layer.w2);
-        ggml_set_param(ctx, layer.w3);
-    }
+    // allocate data
+    model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
+    ggml_allocr_free(alloc);
+    alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
+    alloc_model(alloc, model);
+    ggml_allocr_free(alloc);
 }

 static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
@@ -720,7 +778,6 @@ struct train_params {

     bool use_alloc;

-    int mem_model_gb;
     int mem_compute_gb;
     int mem_compute0_gb;
 };
@@ -747,7 +804,6 @@ struct train_params get_default_train_params() {

     params.use_alloc = true;

-    params.mem_model_gb = 2;
     params.mem_compute_gb = 24;
     params.mem_compute0_gb = 8;

     return params;
@@ -772,7 +828,6 @@ static void train_print_usage(int argc, char ** argv, const struct train_params
     fprintf(stderr, "  --print-info-interval N    Print infos during training each N examples (default %d)\n", params->print_info_interval);
     fprintf(stderr, "  --no-alloc                 Don't use allocator\n");
     fprintf(stderr, "  --use-alloc                Use allocator (default)\n");
-    fprintf(stderr, "  --mem-model N              Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb);
     fprintf(stderr, "  --mem-compute N            Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb);
     fprintf(stderr, "  --mem-compute0 N           Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb);
@@ -868,12 +923,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
             params->use_alloc = false;
         } else if (arg == "--use-alloc") {
             params->use_alloc = true;
-        } else if (arg == "--mem-model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->mem_model_gb = std::stoi(argv[i]);
         } else if (arg == "--mem-compute") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -960,13 +1009,6 @@ int main(int argc, char ** argv) {

     print_params(&model.hparams);

-    struct ggml_init_params lcparams;
-    lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
-    lcparams.mem_buffer = NULL;
-    lcparams.no_alloc = false;
-
-    model.ctx = ggml_init(lcparams);
-
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;
     int n_batch = params.common.n_batch;
@@ -992,7 +1034,6 @@ int main(int argc, char ** argv) {
     opt_params_adam.adam.gclip = params.common.adam_gclip;
     opt_params_adam.adam.eps_f = params.common.adam_eps_f;

-    opt->ctx = model.ctx;
     opt->params = opt_params_adam;

     printf("%s: init model\n", __func__);
@@ -1000,7 +1041,6 @@ int main(int argc, char ** argv) {
     if (!existed) {
         init_model(&model);
     }
-    set_param_model(&model);

     opt->params = opt_params_adam;

@@ -1012,8 +1052,7 @@ int main(int argc, char ** argv) {
         randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
     }

-    printf("used_mem model: %zu bytes\n", ggml_used_mem(model.ctx));
-    // ggml_print_tensor_objects(model.ctx);
+    printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f));

     // TODO: use std::vector intead of "new"
     size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
@@ -1024,7 +1063,6 @@ int main(int argc, char ** argv) {

     ggml_allocr * alloc = NULL;
     if (params.use_alloc) {
-        static const size_t tensor_alignment = 32;
         alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment);
     }

@@ -1206,6 +1244,7 @@ int main(int argc, char ** argv) {

     delete[] compute_addr;
     delete[] compute_buf_0;
+    ggml_free(opt->ctx);
     free_train_state(train);
     ggml_free(model.ctx);
     llama_free(lctx);