diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 77d0cacda..38fba3b8d 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -17,8 +17,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
-
 struct random_normal_distribution {
     std::mt19937 gen;
     std::normal_distribution<float> rd;
@@ -195,11 +193,13 @@ struct my_llama_hparams {
     uint32_t n_vocab = 32000;
     uint32_t n_ctx   = 512;   // this is provided as user input?
     uint32_t n_embd  = 4096;
-    uint32_t n_mult  = 4;
+    uint32_t n_ff    = 11008;
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;
 
+    float f_rms_norm_eps = 1e-5f;
+
     bool operator!=(const my_llama_hparams& other) const {
         return memcmp(this, &other, sizeof(other));
     }
@@ -304,18 +304,12 @@ struct my_llama_lora {
     uint32_t train_tokens = 0;
 };
 
-uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
-    const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
-    return n_ff;
-}
-
 void print_params(struct my_llama_hparams * params) {
     printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
     printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
     printf("%s: n_embd:  %u\n", __func__, params->n_embd);
-    printf("%s: n_mult:  %u\n", __func__, params->n_mult);
+    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
     printf("%s: n_head:  %u\n", __func__, params->n_head);
-    printf("%s: n_ff:    %u\n", __func__, get_n_ff(params));
     printf("%s: n_layer: %u\n", __func__, params->n_layer);
     printf("%s: n_rot:   %u\n", __func__, params->n_rot);
 }
@@ -338,19 +332,18 @@ void print_lora_params(struct my_llama_lora_hparams * params) {
 void init_model(struct llama_model * input, struct my_llama_model * model, uint32_t n_ctx) {
     auto & hparams = model->hparams;
 
-    hparams.n_vocab = llama_n_vocab_from_model(input);
+    hparams.n_vocab = llama_model_n_vocab(input);
     hparams.n_ctx   = n_ctx;
-    hparams.n_embd  = llama_n_embd_from_model(input);
-    hparams.n_mult  = llama_n_mult_from_model(input);
-    hparams.n_head  = llama_n_head_from_model(input);
-    hparams.n_layer = llama_n_layer_from_model(input);
-    hparams.n_rot   = llama_n_rot_from_model(input);
+    hparams.n_embd  = llama_model_n_embd(input);
+    hparams.n_ff    = llama_model_n_ff(input);
+    hparams.n_head  = llama_model_n_head(input);
+    hparams.n_layer = llama_model_n_layer(input);
+    hparams.n_rot   = llama_model_n_rot(input);
 
     const uint32_t n_embd  = hparams.n_embd;
     const uint32_t n_layer = hparams.n_layer;
     const uint32_t n_vocab = hparams.n_vocab;
-
-    const uint32_t n_ff = get_n_ff(&hparams);
+    const uint32_t n_ff = hparams.n_ff;
 
     model->tok_embeddings = llama_get_model_tensor(input, "tok_embeddings.weight");
     model->norm           = llama_get_model_tensor(input, "norm.weight");
@@ -398,7 +391,7 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora)
     const uint32_t n_embd  = model->hparams.n_embd;
     const uint32_t n_layer = model->hparams.n_layer;
     const uint32_t n_vocab = model->hparams.n_vocab;
-    const uint32_t n_ff    = get_n_ff(&model->hparams);
+    const uint32_t n_ff    = model->hparams.n_ff;
 
     struct ggml_context * ctx = lora->ctx;
 
@@ -603,6 +596,8 @@ struct ggml_tensor * forward(
     const int n_head  = hparams.n_head;
     const int n_rot   = hparams.n_rot;
 
+    const float rms_norm_eps = hparams.f_rms_norm_eps;
+
     GGML_ASSERT(n_layer == lora->layers.size());
 
     struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -1082,7 +1077,8 @@ struct ggml_tensor * llama_build_lora_finetune_graphs(
     const int n_layer = hparams.n_layer;
     const int n_head  = hparams.n_head;
     const int n_rot   = hparams.n_rot;
-    const int n_ff    = get_n_ff(&hparams);
+    const int n_ff    = hparams.n_ff;
+    const float rms_norm_eps = hparams.f_rms_norm_eps;
     const int rope_mode = 0;
 
     GGML_ASSERT(n_layer == lora->layers.size());
@@ -1317,7 +1313,7 @@ void print_matrix(struct ggml_tensor * probs) {
 }
 
 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
+    printf("%s", llama_token_get_text(ctx, token));
 }
 
 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -1351,7 +1347,7 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
     }
 }
 
-void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab  = target_logits->ne[0];
@@ -1360,7 +1356,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
     ggml_set_f32(target_logits, -1.0f/n_vocab);
     ggml_set_f32(target_probs, 0.0f);
-    ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
+    ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx));
     for (int i=1; i<n_tokens+1; ++i) {
@@ ... @@
-void get_example_targets_batch(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
     GGML_ASSERT(tokens_input->n_dims == 2);
     GGML_ASSERT(target_logits->n_dims == 3);
     GGML_ASSERT(target_probs->n_dims == 3);
@@ -1394,7 +1390,7 @@ void get_example_targets_batch(const int * train_samples, size_t n_train_samples
         // printf("%s: sample_idx=%zu sample=%zu\n", __func__, sample_idx, sample);
         GGML_ASSERT(sample+n_tokens-1 < n_train_data);
-        set_i32_2d(tokens_input, 0, k, llama_token_bos());
+        set_i32_2d(tokens_input, 0, k, llama_token_bos(lctx));
         for (int i=1; i<n_tokens+1; ++i) {
@@ ... @@
         if (sample_begin >= end) {
             printf("%s: unexpected end of original text.\n", __func__);
@@ -1617,7 +1613,7 @@ void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx)
     sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau;
 }
 
-llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) {
+llama_token sample(struct llama_context * lctx, struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) {
     GGML_ASSERT(sampler->ctx != NULL);
 
     struct llama_context * ctx = sampler->ctx;
@@ -1638,7 +1634,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
     const auto params = sampler->params;
 
     // Apply penalties
-    const float nl_logit = logits[llama_token_nl()];
+    const float nl_logit = logits[llama_token_nl(lctx)];
 
     const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);
@@ -1657,7 +1653,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
         params.presence_penalty);
 
     if (!params.penalize_nl) {
-        logits[llama_token_nl()] = nl_logit;
+        logits[llama_token_nl(lctx)] = nl_logit;
     }
 
     llama_token token = 0;
@@ -1884,7 +1880,7 @@ void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora,
     file.write_u32(lora->train_tokens);
     file.write_u32(model->hparams.n_vocab);
     file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
+    //file.write_u32(model->hparams.n_mult);
     file.write_u32(model->hparams.n_head);
     file.write_u32(model->hparams.n_layer);
     file.write_u32(model->hparams.n_rot);
@@ -1961,7 +1957,7 @@ bool load_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora,
     uint32_t n_rot   = file.read_u32();
     GGML_ASSERT(n_vocab == model->hparams.n_vocab);
     GGML_ASSERT(n_embd  == model->hparams.n_embd);
-    GGML_ASSERT(n_mult  == model->hparams.n_mult);
+    //GGML_ASSERT(n_mult == model->hparams.n_mult);
     GGML_ASSERT(n_head  == model->hparams.n_head);
     GGML_ASSERT(n_layer == model->hparams.n_layer);
     GGML_ASSERT(n_rot   == model->hparams.n_rot);
@@ -2042,8 +2038,9 @@ void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, cons
         return;
     }
 
+    uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
     // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic
+    file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic
     file.write_u32(1); // version
     // write_hparams
     file.write_u32(lora->hparams.lora_r);
@@ -2667,6 +2664,7 @@ struct opt_callback_data {
     struct ggml_opt_context * opt;
     struct my_llama_model * model;
     struct my_llama_lora * lora;
+    struct llama_context * lctx;
     int last_save_iter;
     llama_token * tokens_data;
     size_t tokens_size;
@@ -2728,6 +2726,7 @@ void opt_callback(void * vdata, float * sched) {
     }
 
     get_example_targets_batch(
+        data->lctx,
         data->samples_data,
         data->samples_size,
         data->tokens_data,
@@ -2760,24 +2759,24 @@ int main(int argc, char ** argv) {
     struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_params);
     struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
 
-    struct llama_vocab vocab;
-    {
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
-        vocab.id_to_token.resize(n_vocab);
-        for (int i=0; i<n_vocab; ++i) {
-            std::string tok = std::string(strings[i]);
-            vocab.token_to_id[tok] = i;
-            vocab.id_to_token[i].tok = tok;
-            vocab.id_to_token[i].score = scores[i];
-        }
-    }
+    // struct llama_vocab vocab;
+    // {
+    //     std::vector<const char *> strings;
+    //     std::vector<float> scores;
+    //     int n_vocab = llama_n_vocab(lctx);
+    //     strings.resize(n_vocab, NULL);
+    //     scores.resize(n_vocab, 0);
+    //     n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
+    //     GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+    //     vocab.id_to_token.resize(n_vocab);
+    //     for (int i=0; i<n_vocab; ++i) {
+    //         std::string tok = std::string(strings[i]);
+    //         vocab.token_to_id[tok] = i;
+    //         vocab.id_to_token[i].tok = tok;
+    //         vocab.id_to_token[i].score = scores[i];
+    //     }
+    // }
 
     std::vector<llama_token> train_tokens;
@@ -2911,7 +2910,7 @@ int main(int argc, char ** argv) {
     std::vector<int> train_samples;
     train_samples.push_back(0);
     for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) {
-        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl())) {
+        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) {
             train_samples.push_back(i);
         }
     }
@@ -2929,6 +2928,7 @@ int main(int argc, char ** argv) {
     opt_cb_data.opt   = opt;
     opt_cb_data.model = &model;
     opt_cb_data.lora  = &lora;
+    opt_cb_data.lctx  = lctx;
     opt_cb_data.last_save_iter = opt->iter;
     opt_cb_data.tokens_data = train_tokens.data();
     opt_cb_data.tokens_size = train_tokens.size();
@@ -3031,7 +3031,7 @@ int main(int argc, char ** argv) {
         for (int i=0; i<n_batch; ++i) {
-            llama_token token = sample(&sampler,
+            llama_token token = sample(lctx, &sampler,
                 (float *)   ((char *) logits->data + i*logits->nb[2] + k*logits->nb[1]),
                 (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]),
                 k);
@@ -3101,7 +3101,7 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * target_logits = ggml_new_tensor_2d(lora.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
     struct ggml_tensor * target_probs  = ggml_new_tensor_2d(lora.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
-    get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
+    get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
 
     for (int i=sample_ctx; i<n_tokens; ++i) {
@@ ... @@
-        llama_token token = sample(&sampler,
+        llama_token token = sample(lctx, &sampler,
             (float *) ((char *) logits->data + (sample_ctx-1)*logits->nb[1]),
             (llama_token *) tokens_input->data,
             sample_ctx-1);
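
A note on the dropped get_n_ff() helper: the old code derived the feed-forward width from n_embd and the rounding factor n_mult, while the patched code reads n_ff directly from the base model via llama_model_n_ff(). The removed formula is easy to sanity-check against the new `uint32_t n_ff = 11008;` default in my_llama_hparams. Below is a minimal standalone sketch; the inputs 4096 and 256 are the classic LLaMA-7B n_embd and n_mult, not values taken from this patch.

#include <cstdint>
#include <cstdio>

// The formula removed by the patch: 2/3 of 4*n_embd, rounded up to a multiple of n_mult.
static uint32_t get_n_ff(uint32_t n_embd, uint32_t n_mult) {
    return ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
}

int main() {
    // 2*(4*4096)/3 = 10922, rounded up to a multiple of 256 -> 11008,
    // matching the new my_llama_hparams default.
    printf("n_ff = %u\n", get_n_ff(4096, 256));
    return 0;
}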
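The other recurring change is that implicit global state becomes explicit arguments: special token ids now come from the context (llama_token_bos(lctx), llama_token_nl(lctx)) and base-model hyperparameters from the model (llama_model_n_vocab() and friends), which is why lctx is threaded through get_example_targets, get_example_targets_batch, sample, and opt_callback_data. A rough sketch of how a caller sees the renamed API, assuming the llama.h of the same vintage as this patch (dump_base_hparams is a hypothetical helper, and these signatures have changed again in later llama.cpp versions):

#include "llama.h"
#include <cstdio>

// Hypothetical helper, not part of the patch: dump the values the patched
// init_model() now pulls from the loaded base model and context.
static void dump_base_hparams(struct llama_model * model, struct llama_context * lctx) {
    printf("n_vocab = %d\n", llama_model_n_vocab(model));
    printf("n_embd  = %d\n", llama_model_n_embd(model));
    printf("n_ff    = %d\n", llama_model_n_ff(model));
    printf("bos id  = %d\n", llama_token_bos(lctx));
    printf("nl id   = %d\n", llama_token_nl(lctx));
}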
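Finally, the LoRA file magic: save_as_llama_lora() now writes a locally defined LLAMA_FILE_MAGIC_LORA = 0x67676C61 where it previously used the LLAMA_FILE_MAGIC_GGLA constant, presumably because that constant is no longer exported by the headers this branch builds against. The value is just the ASCII bytes 'g' 'g' 'l' 'a'; a quick standalone check:

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // same value the patch writes
    // Emit the magic one byte at a time, most significant byte first.
    for (int shift = 24; shift >= 0; shift -= 8) {
        putchar((int)((LLAMA_FILE_MAGIC_LORA >> shift) & 0xff));
    }
    putchar('\n'); // prints: ggla
    return 0;
}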