diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index b62c19540..aafdbff74 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -18,53 +18,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-uint32_t compute_data_checksum(struct ggml_tensor * tensor) {
-    const int n3 = (tensor->n_dims >= 3) ? tensor->ne[3] : 1;
-    const int n2 = (tensor->n_dims >= 2) ? tensor->ne[2] : 1;
-    const int n1 = (tensor->n_dims >= 1) ? tensor->ne[1] : 1;
-    const int n0 = (tensor->n_dims >= 0) ? tensor->ne[0] : 1;
-    const size_t nb0 = tensor->nb[0];
-    const size_t nb1 = tensor->nb[1];
-    const size_t nb2 = tensor->nb[2];
-    const size_t nb3 = tensor->nb[3];
-    const size_t nb = ggml_element_size(tensor);
-    uint32_t result = 0;
-    for (int i3 = 0; i3 < n3; ++i3) {
-        for (int i2 = 0; i2 < n2; ++i2) {
-            for (int i1 = 0; i1 < n1; ++i1) {
-                for (int i0 = 0; i0 < n0; ++i0) {
-                    char * ptr = ((char *) tensor->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
-                    uint32_t val;
-                    memcpy(&val, ptr, nb);
-                    result = result ^ val;
-                    result = (((result << 1u) | ((result >> 31u) & 0x1u)) + 1u) & 0xffffffffu;
-                }
-            }
-        }
-    }
-    return result;
-}
-
-void print_data_checksum(struct ggml_tensor * tensor) {
-    uint32_t chk = compute_data_checksum(tensor);
-    printf("%s: chk=[%08x] data=[%p] name=%s\n", __func__, chk, tensor->data, ggml_get_name(tensor));
-}
-
-void print_data_checksums(struct ggml_cgraph * g) {
-    for (int i = 0; i < g->n_nodes; ++i) {
-        struct ggml_tensor * node = g->nodes[i];
-        for (int j = 0; j < GGML_MAX_SRC; ++j) {
-            if (node->src[j]) {
-                struct ggml_tensor * src = node->src[j];
-                uint32_t chk = compute_data_checksum(src);
-                printf("%s: node[%3d]->src[%d] chk=[%08x] data=[%p] op=%s name=%s\n", __func__, i, j, chk, src->data, ggml_op_name(src->op), ggml_get_name(src));
-            }
-        }
-        uint32_t chk = compute_data_checksum(node);
-        printf("%s: node[%3d] chk=[%08x] data=[%p] op=%s name=%s\n", __func__, i, chk, node->data, ggml_op_name(node->op), ggml_get_name(node));
-    }
-}
-
 struct random_normal_distribution {
     std::mt19937 gen;
     std::normal_distribution<float> rd;
@@ -1614,12 +1567,6 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
         read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS);
         read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
         read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
-
-        print_data_checksum(opt->adam.m);
-        print_data_checksum(opt->adam.v);
-        if (opt->adam.pf) {
-            print_data_checksum(opt->adam.pf);
-        }
     } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
         opt->params.type = GGML_OPT_LBFGS;
 
@@ -1670,12 +1617,6 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
         ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
     }
 
-    print_data_checksum(opt->adam.m);
-    print_data_checksum(opt->adam.v);
-    if (opt->adam.pf) {
-        print_data_checksum(opt->adam.pf);
-    }
-
     gguf_add_tensor(fctx, opt->adam.m);
     gguf_add_tensor(fctx, opt->adam.v);
     if (opt->adam.pf) {
@@ -1778,10 +1719,6 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g
     read_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM));
     read_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT));
 
-    print_data_checksum(model->tok_embeddings);
-    print_data_checksum(model->norm);
-    print_data_checksum(model->output);
-
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
         auto & layer = model->layers[i];
 
@@ -1794,16 +1731,6 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g
         read_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
         read_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
         read_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
-
-        print_data_checksum(layer.attention_norm);
-        print_data_checksum(layer.wq);
-        print_data_checksum(layer.wk);
-        print_data_checksum(layer.wv);
-        print_data_checksum(layer.wo);
-        print_data_checksum(layer.ffn_norm);
-        print_data_checksum(layer.w1);
-        print_data_checksum(layer.w2);
-        print_data_checksum(layer.w3);
     }
 }
 
@@ -1930,10 +1857,6 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod
         gguf_free(vctx);
     }
 
-    print_data_checksum(model->tok_embeddings);
-    print_data_checksum(model->norm);
-    print_data_checksum(model->output);
-
     // add tensors
     gguf_add_tensor(fctx, model->tok_embeddings);
     gguf_add_tensor(fctx, model->norm);
@@ -1941,15 +1864,6 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
         auto & layer = model->layers[i];
 
-        print_data_checksum(layer.attention_norm);
-        print_data_checksum(layer.wq);
-        print_data_checksum(layer.wk);
-        print_data_checksum(layer.wv);
-        print_data_checksum(layer.wo);
-        print_data_checksum(layer.ffn_norm);
-        print_data_checksum(layer.w1);
-        print_data_checksum(layer.w2);
-        print_data_checksum(layer.w3);
 
         gguf_add_tensor(fctx, layer.attention_norm);
         gguf_add_tensor(fctx, layer.wq);
@@ -2025,321 +1939,6 @@ void save_checkpoint_file(const char * filename, const char * fn_vocab_model, st
     gguf_free(fctx);
 }
 
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            size = 0;
-        } else {
-            seek(0, SEEK_END);
-            size = tell();
-            seek(0, SEEK_SET);
-        }
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            throw std::runtime_error(format("read error: %s", strerror(errno)));
-        }
-        if (ret != 1) {
-            throw std::runtime_error(std::string("unexpectedly reached end of file"));
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            throw std::runtime_error(format("write error: %s", strerror(errno)));
-        }
-    }
-
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
-    if (tensor == NULL) {
-        file->write_u32(0);
-        file->write_u32(0);
-        file->write_u32(GGML_TYPE_F32);
-        file->seek((0-file->tell()) & 31, SEEK_CUR);
-        printf("%s: write tensor name='%s' data offset='%zu' nbytes='%zu'\n",
-            __func__, "(empty tensor)", file->tell(), (size_t) 0);
-        return;
-    }
-    const char * name = ggml_get_name(tensor);
-    uint32_t name_len = strlen(name);
-    uint32_t nd = tensor->n_dims;
-    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
-                       (uint32_t)tensor->ne[1],
-                       (uint32_t)tensor->ne[2],
-                       (uint32_t)tensor->ne[3] };
-    printf("%s: write tensor name='%s' begin offset='%zu'\n",
-        __func__, name, file->tell());
-    file->write_u32(nd);
-    file->write_u32(name_len);
-    file->write_u32(tensor->type);
-    file->write_raw(ne, sizeof(ne[0]) * nd);
-    file->write_raw(name, name_len);
-    file->seek((0-file->tell()) & 31, SEEK_CUR);
-    printf("%s: write tensor name='%s' data offset='%zu' nbytes='%zu'\n",
-        __func__, name, file->tell(), ggml_nbytes(tensor));
-    file->write_raw(tensor->data, ggml_nbytes(tensor));
-}
-
-struct ggml_opt_params_v0 {
-    enum ggml_opt_type type;
-    int n_threads;
-    int past;
-    float delta;
-    int max_no_improvement;
-    bool print_forward_graph;
-    bool print_backward_graph;
-    struct {
-        int n_iter;
-        float sched;
-        float decay;
-        float alpha;
-        float beta1;
-        float beta2;
-        float eps;
-        float eps_f;
-        float eps_g;
-    } adam;
-    struct {
-        int m;
-        int n_iter;
-        int max_linesearch;
-        float eps;
-        float ftol;
-        float wolfe;
-        float min_step;
-        float max_step;
-        enum ggml_linesearch linesearch;
-    } lbfgs;
-};
-
-void write_opt_context_v0(struct llama_file * file, struct ggml_opt_context * opt) {
-    const uint32_t version = 0;
-    GGML_ASSERT(opt->nx >= 0);
-    GGML_ASSERT(opt->iter >= 0);
-    file->write_u32(version);
-    ggml_opt_params_v0 params_v0;
-    params_v0.type = opt->params.type;
-    params_v0.n_threads = opt->params.n_threads;
-    params_v0.past = opt->params.past;
-    params_v0.delta = opt->params.delta;
-    params_v0.max_no_improvement = opt->params.max_no_improvement;
-    params_v0.print_forward_graph = opt->params.print_forward_graph;
-    params_v0.print_backward_graph = opt->params.print_backward_graph;
-    params_v0.adam.n_iter = opt->params.adam.n_iter;
-    params_v0.adam.sched = opt->params.adam.sched;
-    params_v0.adam.decay = opt->params.adam.decay;
-    params_v0.adam.alpha = opt->params.adam.alpha;
-    params_v0.adam.beta1 = opt->params.adam.beta1;
-    params_v0.adam.beta2 = opt->params.adam.beta2;
-    params_v0.adam.eps = opt->params.adam.eps;
-    params_v0.adam.eps_f = opt->params.adam.eps_f;
-    params_v0.adam.eps_g = opt->params.adam.eps_g;
-    params_v0.lbfgs.m = opt->params.lbfgs.m;
-    params_v0.lbfgs.n_iter = opt->params.lbfgs.n_iter;
-    params_v0.lbfgs.max_linesearch = opt->params.lbfgs.max_linesearch;
-    params_v0.lbfgs.eps = opt->params.lbfgs.eps;
-    params_v0.lbfgs.ftol = opt->params.lbfgs.ftol;
-    params_v0.lbfgs.wolfe = opt->params.lbfgs.wolfe;
-    params_v0.lbfgs.min_step = opt->params.lbfgs.min_step;
-    params_v0.lbfgs.max_step = opt->params.lbfgs.max_step;
-    file->write_raw(&params_v0, sizeof(params_v0));
-    file->write_raw(&opt->nx, sizeof(opt->nx));
-    file->write_raw(&opt->iter, sizeof(opt->iter));
-    file->write_u32((uint32_t) opt->just_initialized);
-    switch (opt->params.type) {
-        case GGML_OPT_ADAM:
-            {
-                struct ggml_tensor * adam_x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx);
-                struct ggml_tensor * adam_g1 = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx);
-                struct ggml_tensor * adam_g2 = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx);
-                struct ggml_tensor * adam_mh = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx);
-                struct ggml_tensor * adam_vh = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx);
-                write_tensor(file, adam_x);
-                write_tensor(file, adam_g1);
-                write_tensor(file, adam_g2);
-                write_tensor(file, opt->adam.m);
-                write_tensor(file, opt->adam.v);
-                write_tensor(file, adam_mh);
-                write_tensor(file, adam_vh);
-                write_tensor(file, opt->adam.pf);
-                file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best));
-                file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev));
-                file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement));
-            } break;
-        case GGML_OPT_LBFGS:
-            {
-                write_tensor(file, opt->lbfgs.x);
-                write_tensor(file, opt->lbfgs.xp);
-                write_tensor(file, opt->lbfgs.g);
-                write_tensor(file, opt->lbfgs.gp);
-                write_tensor(file, opt->lbfgs.d);
-                write_tensor(file, opt->lbfgs.pf);
-                write_tensor(file, opt->lbfgs.lmal);
-                write_tensor(file, opt->lbfgs.lmys);
-                write_tensor(file, opt->lbfgs.lms);
-                write_tensor(file, opt->lbfgs.lmy);
-                file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best));
-                file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step));
-                file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j));
-                file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k));
-                file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end));
-                file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement));
-            } break;
-    }
-}
-
-void write_opt_context_v1(struct llama_file * file, struct ggml_opt_context * opt) {
-    const uint32_t version = 1;
-    GGML_ASSERT(opt->nx >= 0);
-    GGML_ASSERT(opt->iter >= 0);
-    file->write_u32(version);
-    file->write_u32(opt->params.past);
-    file->write_u32(opt->params.lbfgs.m);
-    file->write_raw(&opt->nx, sizeof(opt->nx));
-    file->write_raw(&opt->iter, sizeof(opt->iter));
-    file->write_u32((uint32_t) opt->just_initialized);
-    switch (opt->params.type) {
-        case GGML_OPT_ADAM:
-            {
-                GGML_ASSERT(opt->adam.m != NULL);
-                GGML_ASSERT(opt->adam.v != NULL);
-                write_tensor(file, opt->adam.m);
-                write_tensor(file, opt->adam.v);
-                write_tensor(file, opt->adam.pf);
-                file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best));
-                file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev));
-                file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement));
-            } break;
-        case GGML_OPT_LBFGS:
-            {
-                GGML_ASSERT(opt->lbfgs.x != NULL);
-                write_tensor(file, opt->lbfgs.x);
-                write_tensor(file, opt->lbfgs.xp);
-                write_tensor(file, opt->lbfgs.g);
-                write_tensor(file, opt->lbfgs.gp);
-                write_tensor(file, opt->lbfgs.d);
-                write_tensor(file, opt->lbfgs.pf);
-                write_tensor(file, opt->lbfgs.lmal);
-                write_tensor(file, opt->lbfgs.lmys);
-                write_tensor(file, opt->lbfgs.lms);
-                write_tensor(file, opt->lbfgs.lmy);
-                file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best));
-                file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step));
-                file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j));
-                file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k));
-                file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end));
-                file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement));
-            } break;
-    }
-}
-
-void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, int opt_version) {
-    struct llama_file file(filename, "wb");
-    if (file.fp == NULL) {
-        return;
-    }
-
-    const uint32_t magic = 'ggcp';
-    const uint32_t version = 0;
-
-    file.write_u32(magic);
-    file.write_u32(version);
-    file.write_u32(model->train_its);
-    file.write_u32(model->train_samples);
-    file.write_u32(model->train_tokens);
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(/*model->hparams.n_mult*/ 256);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output);
-
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
-
-    if (opt_version == 0) {
-        write_opt_context_v0(&file, opt);
-    } else {
-        write_opt_context_v1(&file, opt);
-    }
-
-    printf("%s: all written offset='%zu'\n",
-        __func__, file.tell());
-
-}
 float cosine_decay(const int decay_steps, const float minimum, int step) {
     if (step > decay_steps) {
         step = decay_steps;
@@ -3190,15 +2789,6 @@ int main(int argc, char ** argv) {
     printf("%s: total training time=%f seconds\n", __func__, dd);
 
     if (params.n_examples > 0) {
-        for (int opt_version = 0; opt_version < 2; ++opt_version) {
-            std::string fn_checkpoint_out_old = (
-                std::string(params.fn_checkpoint_out)
-                + std::string(".")
-                + std::to_string(opt_version)
-                + std::string(".old.bin"));
-            save_checkpoint(&model, opt, fn_checkpoint_out_old.c_str(), opt_version);
-        }
-
         save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt);
     }
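For reference, the deleted compute_data_checksum() folds every element's raw bits into one 32-bit value: XOR in the element, then rotate the accumulator left by one bit and add one, so element order affects the result. Below is a minimal standalone sketch of the same rolling checksum, specialized to a contiguous float buffer so it compiles without ggml; the name buffer_checksum and the sample data are illustrative only, not part of this change.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Same rolling checksum as the removed compute_data_checksum(), for the
// contiguous case: XOR in each element's bits, then rotate-left-by-one
// and add one. (The original's `& 0xffffffffu` mask is a no-op on uint32_t.)
static uint32_t buffer_checksum(const float * data, size_t n) { // hypothetical helper
    uint32_t result = 0;
    for (size_t i = 0; i < n; ++i) {
        uint32_t val;
        memcpy(&val, &data[i], sizeof(val)); // reinterpret the float's bits
        result = result ^ val;
        result = ((result << 1u) | (result >> 31u)) + 1u;
    }
    return result;
}

int main() {
    std::vector<float> data = {1.0f, 2.0f, 3.0f};
    printf("chk=[%08x]\n", buffer_checksum(data.data(), data.size()));
    return 0;
}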