train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N'

xaedes 2023-09-17 17:08:36 +02:00
parent dd94ce4ec0
commit 9e10fa977e
2 changed files with 90 additions and 51 deletions
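
The change replaces the fixed, user-sized model context (--mem-model N) with a no_alloc ggml context for the tensor metadata plus a self-owned data buffer whose size is computed by ggml-alloc's measuring allocator. A minimal standalone sketch of that measure-then-allocate pattern follows (not code from this commit; the tensor shapes and the mem_size headroom are illustrative assumptions, while the ggml-alloc calls are the same ones used in the diffs below):

// sketch: two-pass ggml-alloc allocation (illustrative, not part of the commit)
#include "ggml.h"
#include "ggml-alloc.h"
#include <vector>

static const size_t tensor_alignment = 32;

int main() {
    // 1) no_alloc context: holds tensor metadata only, no tensor data
    struct ggml_init_params cparams;
    cparams.mem_size   = ggml_tensor_overhead()*4; // headroom for a few tensor structs (assumed)
    cparams.mem_buffer = NULL;
    cparams.no_alloc   = true;
    struct ggml_context * ctx = ggml_init(cparams);

    struct ggml_tensor * tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 32000); // example shape
    struct ggml_tensor * norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);        // example shape

    // 2) measuring pass: a measuring allocator records the required buffer size
    struct ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
    ggml_allocr_alloc(alloc, tok_embeddings);
    ggml_allocr_alloc(alloc, norm);
    size_t size = ggml_allocr_max_size(alloc) + tensor_alignment;
    ggml_allocr_free(alloc);

    // 3) real pass: allocate exactly that much memory and place the tensors in it
    std::vector<uint8_t> data(size);
    alloc = ggml_allocr_new(data.data(), data.size(), tensor_alignment);
    ggml_allocr_alloc(alloc, tok_embeddings);
    ggml_allocr_alloc(alloc, norm);
    ggml_allocr_free(alloc);

    ggml_free(ctx);
    return 0;
}

In the second file below, init_model() applies this same sequence to every model weight and its gradient through the new alloc_model() helper.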

File 1 of 2:

@@ -1658,8 +1658,8 @@ int main(int argc, char ** argv) {
     printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
     printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
     printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: max_lora_size = %zu bytes (%.1f MB)\n", __func__, lora.data.size(), (float) lora.data.size() / (1024.0f*1024.0f));
-    printf("%s: max_opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
+    printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f));
+    printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
     opt->iter = train->train_its;
     if (params.only_write_lora) {
@@ -1686,7 +1686,7 @@ int main(int argc, char ** argv) {
     printf("%s: opt iter %d\n", __func__, opt->iter);
-    printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx));
+    printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx) + lora.data.size());
     std::vector<uint8_t> mem_input_data;
     std::vector<uint8_t> mem_compute_data;
@@ -1709,7 +1709,7 @@ int main(int argc, char ** argv) {
     ggml_allocr_alloc(alloc, target_probs);
     size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
     ggml_allocr_free(alloc);
-    printf("%s: max_input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
+    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
     // allocate input tensors
     mem_input_data.resize(max_input_size);
@@ -1769,7 +1769,7 @@ int main(int argc, char ** argv) {
         ggml_free(ctx_compute);
     }
     size_t max_compute_size = best_compute_size;
-    printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
+    printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
     printf("%s: evaluation order = %s\n", __func__,
         (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
         (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
@@ -1887,7 +1887,7 @@ int main(int argc, char ** argv) {
     // measure required memory for work buffer
     size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
-    printf("%s: max_work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
+    printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
     // context for work buffer
     struct ggml_init_params ctx_work_params = {

File 2 of 2:

@@ -19,6 +19,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
+static const size_t tensor_alignment = 32;
 struct my_llama_hparams {
     uint32_t n_vocab = 32000;
     uint32_t n_ctx = 512;
@@ -56,6 +58,7 @@ struct my_llama_layer {
 struct my_llama_model {
     struct ggml_context * ctx = NULL;
+    std::vector<uint8_t> data;
     my_llama_hparams hparams;
@@ -118,6 +121,65 @@ static void print_params(struct my_llama_hparams * params) {
     printf("%s: n_rot: %d\n", __func__, params->n_rot);
 }
+static void set_param_model(struct my_llama_model * model) {
+    const auto& hparams = model->hparams;
+    const uint32_t n_layer = hparams.n_layer;
+    struct ggml_context* ctx = model->ctx;
+    ggml_set_param(ctx, model->tok_embeddings);
+    ggml_set_param(ctx, model->norm);
+    ggml_set_param(ctx, model->output);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+        ggml_set_param(ctx, layer.attention_norm);
+        ggml_set_param(ctx, layer.wq);
+        ggml_set_param(ctx, layer.wk);
+        ggml_set_param(ctx, layer.wv);
+        ggml_set_param(ctx, layer.wo);
+        ggml_set_param(ctx, layer.ffn_norm);
+        ggml_set_param(ctx, layer.w1);
+        ggml_set_param(ctx, layer.w2);
+        ggml_set_param(ctx, layer.w3);
+    }
+}
+static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * model) {
+    ggml_allocr_alloc(alloc, model->tok_embeddings);
+    ggml_allocr_alloc(alloc, model->norm);
+    ggml_allocr_alloc(alloc, model->output);
+    for (uint32_t i = 0; i < model->layers.size(); ++i) {
+        auto & layer = model->layers[i];
+        ggml_allocr_alloc(alloc, layer.attention_norm);
+        ggml_allocr_alloc(alloc, layer.wq);
+        ggml_allocr_alloc(alloc, layer.wk);
+        ggml_allocr_alloc(alloc, layer.wv);
+        ggml_allocr_alloc(alloc, layer.wo);
+        ggml_allocr_alloc(alloc, layer.ffn_norm);
+        ggml_allocr_alloc(alloc, layer.w1);
+        ggml_allocr_alloc(alloc, layer.w2);
+        ggml_allocr_alloc(alloc, layer.w3);
+    }
+    ggml_allocr_alloc(alloc, model->tok_embeddings->grad);
+    ggml_allocr_alloc(alloc, model->norm->grad);
+    ggml_allocr_alloc(alloc, model->output->grad);
+    for (uint32_t i = 0; i < model->layers.size(); ++i) {
+        auto & layer = model->layers[i];
+        ggml_allocr_alloc(alloc, layer.attention_norm->grad);
+        ggml_allocr_alloc(alloc, layer.wq->grad);
+        ggml_allocr_alloc(alloc, layer.wk->grad);
+        ggml_allocr_alloc(alloc, layer.wv->grad);
+        ggml_allocr_alloc(alloc, layer.wo->grad);
+        ggml_allocr_alloc(alloc, layer.ffn_norm->grad);
+        ggml_allocr_alloc(alloc, layer.w1->grad);
+        ggml_allocr_alloc(alloc, layer.w2->grad);
+        ggml_allocr_alloc(alloc, layer.w3->grad);
+    }
+}
 static void init_model(struct my_llama_model * model) {
     const auto & hparams = model->hparams;
@@ -126,7 +188,6 @@ static void init_model(struct my_llama_model * model) {
     const uint32_t n_vocab = hparams.n_vocab;
     const uint32_t n_ff = hparams.n_ff;
-    struct ggml_context * ctx = model->ctx;
     std::vector<char> tn_buf;
     tn_buf.resize(GGML_MAX_NAME);
@@ -141,6 +202,15 @@ static void init_model(struct my_llama_model * model) {
         return tn_buf.data();
     };
+    // context for model tensors without their data
+    struct ggml_init_params ctx_model_params;
+    ctx_model_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18);
+    ctx_model_params.mem_buffer = NULL;
+    ctx_model_params.no_alloc = true;
+    struct ggml_context * ctx = ggml_init(ctx_model_params);
+    model->ctx = ctx;
     model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
     model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
     model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
@@ -179,32 +249,20 @@ static void init_model(struct my_llama_model * model) {
         ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i));
         ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i));
     }
-}
-
-static void set_param_model(struct my_llama_model * model) {
-    const auto& hparams = model->hparams;
+    set_param_model(model);
-    const uint32_t n_layer = hparams.n_layer;
+    // measure data size
+    struct ggml_allocr * alloc = NULL;
+    alloc = ggml_allocr_new_measure(tensor_alignment);
+    alloc_model(alloc, model);
-    struct ggml_context* ctx = model->ctx;
-    ggml_set_param(ctx, model->tok_embeddings);
-    ggml_set_param(ctx, model->norm);
-    ggml_set_param(ctx, model->output);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-        ggml_set_param(ctx, layer.attention_norm);
-        ggml_set_param(ctx, layer.wq);
-        ggml_set_param(ctx, layer.wk);
-        ggml_set_param(ctx, layer.wv);
-        ggml_set_param(ctx, layer.wo);
-        ggml_set_param(ctx, layer.ffn_norm);
-        ggml_set_param(ctx, layer.w1);
-        ggml_set_param(ctx, layer.w2);
-        ggml_set_param(ctx, layer.w3);
-    }
+    // allocate data
+    model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
+    ggml_allocr_free(alloc);
+    alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
+    alloc_model(alloc, model);
+    ggml_allocr_free(alloc);
 }
 static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
@@ -720,7 +778,6 @@ struct train_params {
     bool use_alloc;
-    int mem_model_gb;
     int mem_compute_gb;
     int mem_compute0_gb;
 };
@@ -747,7 +804,6 @@ struct train_params get_default_train_params() {
     params.use_alloc = true;
-    params.mem_model_gb = 2;
     params.mem_compute_gb = 24;
     params.mem_compute0_gb = 8;
     return params;
@@ -772,7 +828,6 @@ static void train_print_usage(int argc, char ** argv, const struct train_params
     fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval);
     fprintf(stderr, " --no-alloc Don't use allocator\n");
     fprintf(stderr, " --use-alloc Use allocator (default)\n");
-    fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb);
     fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb);
     fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb);
@@ -868,12 +923,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * params) {
             params->use_alloc = false;
         } else if (arg == "--use-alloc") {
             params->use_alloc = true;
-        } else if (arg == "--mem-model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->mem_model_gb = std::stoi(argv[i]);
         } else if (arg == "--mem-compute") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -960,13 +1009,6 @@ int main(int argc, char ** argv) {
     print_params(&model.hparams);
-    struct ggml_init_params lcparams;
-    lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
-    lcparams.mem_buffer = NULL;
-    lcparams.no_alloc = false;
-    model.ctx = ggml_init(lcparams);
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;
     int n_batch = params.common.n_batch;
@@ -992,7 +1034,6 @@ int main(int argc, char ** argv) {
     opt_params_adam.adam.gclip = params.common.adam_gclip;
     opt_params_adam.adam.eps_f = params.common.adam_eps_f;
-    opt->ctx = model.ctx;
     opt->params = opt_params_adam;
     printf("%s: init model\n", __func__);
@@ -1000,7 +1041,6 @@ int main(int argc, char ** argv) {
     if (!existed) {
         init_model(&model);
     }
-    set_param_model(&model);
     opt->params = opt_params_adam;
@@ -1012,8 +1052,7 @@ int main(int argc, char ** argv) {
         randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
     }
-    printf("used_mem model: %zu bytes\n", ggml_used_mem(model.ctx));
-    // ggml_print_tensor_objects(model.ctx);
+    printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f));
     // TODO: use std::vector<uint8_t> intead of "new"
     size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
@@ -1024,7 +1063,6 @@ int main(int argc, char ** argv) {
     ggml_allocr * alloc = NULL;
     if (params.use_alloc) {
-        static const size_t tensor_alignment = 32;
         alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment);
     }
@@ -1206,6 +1244,7 @@ int main(int argc, char ** argv) {
     delete[] compute_addr;
     delete[] compute_buf_0;
+    ggml_free(opt->ctx);
     free_train_state(train);
     ggml_free(model.ctx);
     llama_free(lctx);