train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N'
This commit is contained in:
parent
dd94ce4ec0
commit
9e10fa977e
2 changed files with 90 additions and 51 deletions
|
@ -1658,8 +1658,8 @@ int main(int argc, char ** argv) {
|
||||||
printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
|
printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
|
||||||
printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
|
printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
|
||||||
printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
|
printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
|
||||||
printf("%s: max_lora_size = %zu bytes (%.1f MB)\n", __func__, lora.data.size(), (float) lora.data.size() / (1024.0f*1024.0f));
|
printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f));
|
||||||
printf("%s: max_opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
|
printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
|
||||||
opt->iter = train->train_its;
|
opt->iter = train->train_its;
|
||||||
|
|
||||||
if (params.only_write_lora) {
|
if (params.only_write_lora) {
|
||||||
|
@ -1686,7 +1686,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
printf("%s: opt iter %d\n", __func__, opt->iter);
|
printf("%s: opt iter %d\n", __func__, opt->iter);
|
||||||
|
|
||||||
printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx));
|
printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx) + lora.data.size());
|
||||||
|
|
||||||
std::vector<uint8_t> mem_input_data;
|
std::vector<uint8_t> mem_input_data;
|
||||||
std::vector<uint8_t> mem_compute_data;
|
std::vector<uint8_t> mem_compute_data;
|
||||||
|
@ -1709,7 +1709,7 @@ int main(int argc, char ** argv) {
|
||||||
ggml_allocr_alloc(alloc, target_probs);
|
ggml_allocr_alloc(alloc, target_probs);
|
||||||
size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
|
size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
|
||||||
ggml_allocr_free(alloc);
|
ggml_allocr_free(alloc);
|
||||||
printf("%s: max_input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
|
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
|
||||||
|
|
||||||
// allocate input tensors
|
// allocate input tensors
|
||||||
mem_input_data.resize(max_input_size);
|
mem_input_data.resize(max_input_size);
|
||||||
|
@ -1769,7 +1769,7 @@ int main(int argc, char ** argv) {
|
||||||
ggml_free(ctx_compute);
|
ggml_free(ctx_compute);
|
||||||
}
|
}
|
||||||
size_t max_compute_size = best_compute_size;
|
size_t max_compute_size = best_compute_size;
|
||||||
printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
|
printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
|
||||||
printf("%s: evaluation order = %s\n", __func__,
|
printf("%s: evaluation order = %s\n", __func__,
|
||||||
(best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
|
(best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
|
||||||
(best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
|
(best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
|
||||||
|
@ -1887,7 +1887,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// measure required memory for work buffer
|
// measure required memory for work buffer
|
||||||
size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
|
size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
|
||||||
printf("%s: max_work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
|
printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
|
||||||
|
|
||||||
// context for work buffer
|
// context for work buffer
|
||||||
struct ggml_init_params ctx_work_params = {
|
struct ggml_init_params ctx_work_params = {
|
||||||
|
|
|
@ -19,6 +19,8 @@
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static const size_t tensor_alignment = 32;
|
||||||
|
|
||||||
struct my_llama_hparams {
|
struct my_llama_hparams {
|
||||||
uint32_t n_vocab = 32000;
|
uint32_t n_vocab = 32000;
|
||||||
uint32_t n_ctx = 512;
|
uint32_t n_ctx = 512;
|
||||||
|
@ -56,6 +58,7 @@ struct my_llama_layer {
|
||||||
|
|
||||||
struct my_llama_model {
|
struct my_llama_model {
|
||||||
struct ggml_context * ctx = NULL;
|
struct ggml_context * ctx = NULL;
|
||||||
|
std::vector<uint8_t> data;
|
||||||
|
|
||||||
my_llama_hparams hparams;
|
my_llama_hparams hparams;
|
||||||
|
|
||||||
|
@ -118,6 +121,65 @@ static void print_params(struct my_llama_hparams * params) {
|
||||||
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void set_param_model(struct my_llama_model * model) {
|
||||||
|
const auto& hparams = model->hparams;
|
||||||
|
|
||||||
|
const uint32_t n_layer = hparams.n_layer;
|
||||||
|
|
||||||
|
struct ggml_context* ctx = model->ctx;
|
||||||
|
|
||||||
|
ggml_set_param(ctx, model->tok_embeddings);
|
||||||
|
ggml_set_param(ctx, model->norm);
|
||||||
|
ggml_set_param(ctx, model->output);
|
||||||
|
|
||||||
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||||
|
auto & layer = model->layers[i];
|
||||||
|
|
||||||
|
ggml_set_param(ctx, layer.attention_norm);
|
||||||
|
ggml_set_param(ctx, layer.wq);
|
||||||
|
ggml_set_param(ctx, layer.wk);
|
||||||
|
ggml_set_param(ctx, layer.wv);
|
||||||
|
ggml_set_param(ctx, layer.wo);
|
||||||
|
ggml_set_param(ctx, layer.ffn_norm);
|
||||||
|
ggml_set_param(ctx, layer.w1);
|
||||||
|
ggml_set_param(ctx, layer.w2);
|
||||||
|
ggml_set_param(ctx, layer.w3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hands every model tensor to the given allocator via ggml_allocr_alloc.
//
// Order of calls (kept exactly as-is, since init_model calls this once with a
// measuring allocator to size model->data and once more with the real buffer):
//   1. all weight tensors: tok_embeddings, norm, output, then each layer's nine tensors
//   2. all gradient tensors (->grad) in the same sequence
//
// NOTE(review): dereferences ->grad on every tensor, so it assumes the grads
// already exist — presumably created by set_param_model; confirm.
static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * model) {
    // ---- pass 1: weights ----
    struct ggml_tensor * const top_weights[] = {
        model->tok_embeddings,
        model->norm,
        model->output,
    };
    for (struct ggml_tensor * w : top_weights) {
        ggml_allocr_alloc(alloc, w);
    }
    for (auto & blk : model->layers) {
        struct ggml_tensor * const layer_weights[] = {
            blk.attention_norm,
            blk.wq,
            blk.wk,
            blk.wv,
            blk.wo,
            blk.ffn_norm,
            blk.w1,
            blk.w2,
            blk.w3,
        };
        for (struct ggml_tensor * w : layer_weights) {
            ggml_allocr_alloc(alloc, w);
        }
    }

    // ---- pass 2: gradients ----
    struct ggml_tensor * const top_grads[] = {
        model->tok_embeddings->grad,
        model->norm->grad,
        model->output->grad,
    };
    for (struct ggml_tensor * g : top_grads) {
        ggml_allocr_alloc(alloc, g);
    }
    for (auto & blk : model->layers) {
        struct ggml_tensor * const layer_grads[] = {
            blk.attention_norm->grad,
            blk.wq->grad,
            blk.wk->grad,
            blk.wv->grad,
            blk.wo->grad,
            blk.ffn_norm->grad,
            blk.w1->grad,
            blk.w2->grad,
            blk.w3->grad,
        };
        for (struct ggml_tensor * g : layer_grads) {
            ggml_allocr_alloc(alloc, g);
        }
    }
}
|
||||||
|
|
||||||
static void init_model(struct my_llama_model * model) {
|
static void init_model(struct my_llama_model * model) {
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
|
|
||||||
|
@ -126,7 +188,6 @@ static void init_model(struct my_llama_model * model) {
|
||||||
const uint32_t n_vocab = hparams.n_vocab;
|
const uint32_t n_vocab = hparams.n_vocab;
|
||||||
const uint32_t n_ff = hparams.n_ff;
|
const uint32_t n_ff = hparams.n_ff;
|
||||||
|
|
||||||
struct ggml_context * ctx = model->ctx;
|
|
||||||
|
|
||||||
std::vector<char> tn_buf;
|
std::vector<char> tn_buf;
|
||||||
tn_buf.resize(GGML_MAX_NAME);
|
tn_buf.resize(GGML_MAX_NAME);
|
||||||
|
@ -141,6 +202,15 @@ static void init_model(struct my_llama_model * model) {
|
||||||
return tn_buf.data();
|
return tn_buf.data();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// context for model tensors without their data
|
||||||
|
struct ggml_init_params ctx_model_params;
|
||||||
|
ctx_model_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18);
|
||||||
|
ctx_model_params.mem_buffer = NULL;
|
||||||
|
ctx_model_params.no_alloc = true;
|
||||||
|
|
||||||
|
struct ggml_context * ctx = ggml_init(ctx_model_params);
|
||||||
|
model->ctx = ctx;
|
||||||
|
|
||||||
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||||
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||||
|
@ -179,32 +249,20 @@ static void init_model(struct my_llama_model * model) {
|
||||||
ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i));
|
ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i));
|
||||||
ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i));
|
ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i));
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
static void set_param_model(struct my_llama_model * model) {
|
set_param_model(model);
|
||||||
const auto& hparams = model->hparams;
|
|
||||||
|
|
||||||
const uint32_t n_layer = hparams.n_layer;
|
// measure data size
|
||||||
|
struct ggml_allocr * alloc = NULL;
|
||||||
|
alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||||
|
alloc_model(alloc, model);
|
||||||
|
|
||||||
struct ggml_context* ctx = model->ctx;
|
// allocate data
|
||||||
|
model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
|
||||||
ggml_set_param(ctx, model->tok_embeddings);
|
ggml_allocr_free(alloc);
|
||||||
ggml_set_param(ctx, model->norm);
|
alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
|
||||||
ggml_set_param(ctx, model->output);
|
alloc_model(alloc, model);
|
||||||
|
ggml_allocr_free(alloc);
|
||||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
||||||
auto & layer = model->layers[i];
|
|
||||||
|
|
||||||
ggml_set_param(ctx, layer.attention_norm);
|
|
||||||
ggml_set_param(ctx, layer.wq);
|
|
||||||
ggml_set_param(ctx, layer.wk);
|
|
||||||
ggml_set_param(ctx, layer.wv);
|
|
||||||
ggml_set_param(ctx, layer.wo);
|
|
||||||
ggml_set_param(ctx, layer.ffn_norm);
|
|
||||||
ggml_set_param(ctx, layer.w1);
|
|
||||||
ggml_set_param(ctx, layer.w2);
|
|
||||||
ggml_set_param(ctx, layer.w3);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
|
static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
|
||||||
|
@ -720,7 +778,6 @@ struct train_params {
|
||||||
|
|
||||||
bool use_alloc;
|
bool use_alloc;
|
||||||
|
|
||||||
int mem_model_gb;
|
|
||||||
int mem_compute_gb;
|
int mem_compute_gb;
|
||||||
int mem_compute0_gb;
|
int mem_compute0_gb;
|
||||||
};
|
};
|
||||||
|
@ -747,7 +804,6 @@ struct train_params get_default_train_params() {
|
||||||
|
|
||||||
params.use_alloc = true;
|
params.use_alloc = true;
|
||||||
|
|
||||||
params.mem_model_gb = 2;
|
|
||||||
params.mem_compute_gb = 24;
|
params.mem_compute_gb = 24;
|
||||||
params.mem_compute0_gb = 8;
|
params.mem_compute0_gb = 8;
|
||||||
return params;
|
return params;
|
||||||
|
@ -772,7 +828,6 @@ static void train_print_usage(int argc, char ** argv, const struct train_params
|
||||||
fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval);
|
fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval);
|
||||||
fprintf(stderr, " --no-alloc Don't use allocator\n");
|
fprintf(stderr, " --no-alloc Don't use allocator\n");
|
||||||
fprintf(stderr, " --use-alloc Use allocator (default)\n");
|
fprintf(stderr, " --use-alloc Use allocator (default)\n");
|
||||||
fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb);
|
|
||||||
fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb);
|
fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb);
|
||||||
fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb);
|
fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb);
|
||||||
|
|
||||||
|
@ -868,12 +923,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
|
||||||
params->use_alloc = false;
|
params->use_alloc = false;
|
||||||
} else if (arg == "--use-alloc") {
|
} else if (arg == "--use-alloc") {
|
||||||
params->use_alloc = true;
|
params->use_alloc = true;
|
||||||
} else if (arg == "--mem-model") {
|
|
||||||
if (++i >= argc) {
|
|
||||||
invalid_param = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
params->mem_model_gb = std::stoi(argv[i]);
|
|
||||||
} else if (arg == "--mem-compute") {
|
} else if (arg == "--mem-compute") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -960,13 +1009,6 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
print_params(&model.hparams);
|
print_params(&model.hparams);
|
||||||
|
|
||||||
struct ggml_init_params lcparams;
|
|
||||||
lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
|
|
||||||
lcparams.mem_buffer = NULL;
|
|
||||||
lcparams.no_alloc = false;
|
|
||||||
|
|
||||||
model.ctx = ggml_init(lcparams);
|
|
||||||
|
|
||||||
int n_tokens = model.hparams.n_ctx;
|
int n_tokens = model.hparams.n_ctx;
|
||||||
int n_vocab = model.hparams.n_vocab;
|
int n_vocab = model.hparams.n_vocab;
|
||||||
int n_batch = params.common.n_batch;
|
int n_batch = params.common.n_batch;
|
||||||
|
@ -992,7 +1034,6 @@ int main(int argc, char ** argv) {
|
||||||
opt_params_adam.adam.gclip = params.common.adam_gclip;
|
opt_params_adam.adam.gclip = params.common.adam_gclip;
|
||||||
opt_params_adam.adam.eps_f = params.common.adam_eps_f;
|
opt_params_adam.adam.eps_f = params.common.adam_eps_f;
|
||||||
|
|
||||||
opt->ctx = model.ctx;
|
|
||||||
opt->params = opt_params_adam;
|
opt->params = opt_params_adam;
|
||||||
|
|
||||||
printf("%s: init model\n", __func__);
|
printf("%s: init model\n", __func__);
|
||||||
|
@ -1000,7 +1041,6 @@ int main(int argc, char ** argv) {
|
||||||
if (!existed) {
|
if (!existed) {
|
||||||
init_model(&model);
|
init_model(&model);
|
||||||
}
|
}
|
||||||
set_param_model(&model);
|
|
||||||
|
|
||||||
opt->params = opt_params_adam;
|
opt->params = opt_params_adam;
|
||||||
|
|
||||||
|
@ -1012,8 +1052,7 @@ int main(int argc, char ** argv) {
|
||||||
randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
|
randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("used_mem model: %zu bytes\n", ggml_used_mem(model.ctx));
|
printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f));
|
||||||
// ggml_print_tensor_objects(model.ctx);
|
|
||||||
|
|
||||||
// TODO: use std::vector<uint8_t> instead of "new"
|
// TODO: use std::vector<uint8_t> instead of "new"
|
||||||
size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
|
size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
|
||||||
|
@ -1024,7 +1063,6 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
ggml_allocr * alloc = NULL;
|
ggml_allocr * alloc = NULL;
|
||||||
if (params.use_alloc) {
|
if (params.use_alloc) {
|
||||||
static const size_t tensor_alignment = 32;
|
|
||||||
alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment);
|
alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1206,6 +1244,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
delete[] compute_addr;
|
delete[] compute_addr;
|
||||||
delete[] compute_buf_0;
|
delete[] compute_buf_0;
|
||||||
|
ggml_free(opt->ctx);
|
||||||
free_train_state(train);
|
free_train_state(train);
|
||||||
ggml_free(model.ctx);
|
ggml_free(model.ctx);
|
||||||
llama_free(lctx);
|
llama_free(lctx);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue