move common opt_callback into common/train
commit bef1e97875 (parent e9758ae1d2)
4 changed files with 146 additions and 287 deletions
common/train.cpp (115 changed lines)

@@ -1329,3 +1329,118 @@ void finish_processing_train_args(struct train_params_common * params) {
         process_escapes(params->sample_start);
     }
 }
+
+void train_opt_callback(void * vdata, int accum_step, float * sched) {
+    struct train_opt_callback_data * data = (struct train_opt_callback_data *) vdata;
+    struct train_params_common * params = data->params;
+    struct train_state * train = data->train;
+    struct ggml_opt_context * opt = train->opt;
+    int n_batch = params->n_batch;
+    int n_ctx = params->n_ctx;
+
+    if (accum_step == 0) {
+        // time measurement
+        int64_t now = ggml_time_ms();
+        if (now > data->last_time && opt->iter > data->first_iter) {
+            double dt = (double) (now - data->last_time);
+            if (data->millis_per_iter == 0.0) {
+                data->millis_per_iter = dt;
+            } else {
+                const double gain = 0.7;
+                data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain;
+            }
+        }
+
+        double remaining_millis = 0.0;
+        if (data->millis_per_iter > 0.0) {
+            const int n_iter = params->adam_n_iter;
+            const int done_iter = opt->iter - data->first_iter;
+            const int remaining_iter = n_iter - done_iter;
+            remaining_millis = remaining_iter * data->millis_per_iter;
+        }
+
+        // file saving
+        const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
+        if (save_now) {
+            int new_iters = opt->iter - data->last_save_iter;
+            train->train_its += new_iters;
+            train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
+            train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx;
+
+            if (data->save_cb) {
+                data->save_cb(data->save_data, train);
+            }
+
+            data->last_save_iter = opt->iter;
+        }
+
+        // exclude file saving from time measurement, by measuring last_time after saving
+        data->last_time = ggml_time_ms();
+
+        *sched = learning_schedule(
+            opt->iter,
+            params->warmup,
+            params->cos_decay_steps,
+            params->adam_alpha,
+            params->adam_min_alpha,
+            params->cos_decay_min,
+            params->cos_decay_restart,
+            params->enable_restart);
+
+        int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
+        if (impr_plot > 0) impr_plot = 0;
+        if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
+        printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
+            __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
+            *sched, opt->loss_after);
+
+        if (data->millis_per_iter > 0) {
+            printf(" dt=");
+            print_duration(data->millis_per_iter);
+            printf(" eta=");
+            print_duration(remaining_millis);
+        }
+
+        float improvement = opt->loss_before - opt->loss_after;
+        const float plot_scale = 10.0f;
+        int bar_len = (int)(1 + improvement*plot_scale + 0.5);
+        printf(" |");
+        for (int i=0; i<bar_len; ++i) {
+            printf("-");
+        }
+        printf(">");
+        printf("\n");
+    }
+
+    int64_t used_samples = get_example_targets_batch(
+        data->lctx,
+        data->tokens_input,
+        data->target_probs,
+        train->shuffle_next_sample,
+        data->shuffled_samples_begin,
+        data->shuffled_samples_size,
+        data->samples_count,
+        data->tokens_data,
+        data->tokens_size,
+        params->separate_with_eos,
+        params->separate_with_bos,
+        params->fill_with_next_samples);
+
+    train->shuffle_next_sample += used_samples;
+
+    if (train->shuffle_next_sample >= train->shuffle_sample_count) {
+        ++train->train_epochs;
+        printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs);
+        // note: we may have used some samples from the current shuffling more than once
+        train->shuffle_rng_state_current = train->shuffle_rng_state_next;
+        train->shuffle_rng_state_next = shuffle_samples(
+            train->shuffle_rng_state_current,
+            data->shuffled_samples_begin,
+            data->shuffled_samples_size,
+            data->samples_begin,
+            data->samples_size,
+            data->samples_count);
+        train->shuffle_next_sample = 0;
+    }
+}
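The dt/eta readout in the callback above comes from an exponential moving average of the wall-clock time per iteration (gain 0.7) multiplied by the remaining Adam iterations. A minimal standalone sketch of that smoothing and ETA arithmetic follows; the names ema_ms and dt_samples and the numeric values are illustrative only and are not part of the commit.

#include <cstdio>

// Standalone illustration of the timing logic used in train_opt_callback:
// smooth the measured ms/iteration with an EMA, then estimate the remaining time.
int main() {
    const double gain = 0.7;                                  // same smoothing factor as in the callback
    const double dt_samples[] = {120.0, 95.0, 110.0, 105.0};  // hypothetical ms per iteration
    double ema_ms = 0.0;

    for (double dt : dt_samples) {
        // first sample initializes the average, later samples are blended in
        ema_ms = (ema_ms == 0.0) ? dt : ema_ms*(1.0-gain) + dt*gain;
    }

    const int n_iter    = 256;  // stands in for params->adam_n_iter
    const int done_iter = 4;    // iterations measured so far
    const double eta_ms = (n_iter - done_iter) * ema_ms;
    printf("smoothed ms/iter = %.1f, eta = %.1f ms\n", ema_ms, eta_ms);
    return 0;
}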
common/train.h

@@ -80,6 +80,29 @@ struct train_params_common {
     float adam_eps_f;
 };
 
+typedef void (*save_train_files_callback)(void * data, struct train_state * train);
+
+struct train_opt_callback_data {
+    struct train_params_common * params;
+    struct train_state * train;
+    save_train_files_callback save_cb;
+    void * save_data;
+    struct llama_context * lctx;
+    int last_save_iter;
+    llama_token * tokens_data;
+    size_t tokens_size;
+    size_t * samples_begin;
+    size_t * samples_size;
+    size_t * shuffled_samples_begin;
+    size_t * shuffled_samples_size;
+    size_t samples_count;
+    struct ggml_tensor * tokens_input;
+    struct ggml_tensor * target_probs;
+    int first_iter;
+    int64_t last_time;
+    double millis_per_iter;
+};
+
 struct train_state * init_train_state(int seed);
 void free_train_state(struct train_state * state);
@@ -195,4 +218,4 @@ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * trai
 
 std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
 
-typedef void (*save_train_files_callback)(void * data, struct train_state * train);
+void train_opt_callback(void * vdata, int accum_step, float * sched);
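With the struct and declaration above in common/train.h, callers fill a train_opt_callback_data and hand the shared callback to the ggml optimizer, as the two example programs do in the hunks below. A condensed, non-compilable sketch of that wiring, assembled only from assignments and calls visible in this commit; fields not shown in the hunks (tokens/sample buffers, save_data, lctx) are filled the same way by the caller and are omitted here:

// excerpt-style sketch, assumes the caller has already prepared params, train,
// opt, the token/sample buffers, and the compute graphs (gf, gb, loss, ctx_work)
struct train_opt_callback_data opt_cb_data;
opt_cb_data.params        = &params.common;
opt_cb_data.train         = train;
opt_cb_data.save_cb       = &save_train_files;   // any save_train_files_callback
opt_cb_data.samples_count = train_samples_size.size();
opt_cb_data.tokens_input  = tokens_input;
opt_cb_data.target_probs  = target_probs;
opt_cb_data.first_iter    = opt->iter;
opt_cb_data.last_time     = ggml_time_ms();
// ... remaining fields as in the example programs ...

// the optimizer calls train_opt_callback on every gradient-accumulation step
ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data);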
examples/finetune/finetune.cpp

@@ -1512,142 +1512,6 @@ static void save_train_files(void * vdata, struct train_state * train) {
     }
 }
 
-struct opt_callback_data {
-    struct train_params_common * params;
-    struct train_state * train;
-    save_train_files_callback save_cb;
-    void * save_data;
-    struct llama_context * lctx;
-    int last_save_iter;
-    llama_token * tokens_data;
-    size_t tokens_size;
-    size_t * samples_begin;
-    size_t * samples_size;
-    size_t * shuffled_samples_begin;
-    size_t * shuffled_samples_size;
-    size_t samples_count;
-    struct ggml_tensor * tokens_input;
-    struct ggml_tensor * target_probs;
-    int first_iter;
-    int64_t last_time;
-    double millis_per_iter;
-};
-
-static void opt_callback(void * vdata, int accum_step, float * sched) {
-    struct opt_callback_data * data = (struct opt_callback_data *) vdata;
-    struct train_params_common * params = data->params;
-    struct train_state * train = data->train;
-    struct ggml_opt_context * opt = train->opt;
-    int n_batch = params->n_batch;
-    int n_ctx = params->n_ctx;
-
-    if (accum_step == 0) {
-        // time measurement
-        int64_t now = ggml_time_ms();
-        if (now > data->last_time && opt->iter > data->first_iter) {
-            double dt = now - data->last_time;
-            if (data->millis_per_iter == 0.0) {
-                data->millis_per_iter = dt;
-            } else {
-                const double gain = 0.7;
-                data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain;
-            }
-        }
-
-        double remaining_millis = 0.0;
-        if (data->millis_per_iter > 0.0) {
-            const int n_iter = params->adam_n_iter;
-            const int done_iter = opt->iter - data->first_iter;
-            const int remaining_iter = n_iter - done_iter;
-            remaining_millis = remaining_iter * data->millis_per_iter;
-        }
-
-        // file saving
-        const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
-        if (save_now) {
-            int new_iters = opt->iter - data->last_save_iter;
-            train->train_its += new_iters;
-            train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
-            train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx;
-
-            if (data->save_cb) {
-                data->save_cb(data->save_data, train);
-            }
-
-            data->last_save_iter = opt->iter;
-        }
-
-        // exclude file saving from time measurement, by measuring last_time after saving
-        data->last_time = ggml_time_ms();
-
-        *sched = learning_schedule(
-            opt->iter,
-            params->warmup,
-            params->cos_decay_steps,
-            params->adam_alpha,
-            params->adam_min_alpha,
-            params->cos_decay_min,
-            params->cos_decay_restart,
-            params->enable_restart);
-
-        int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
-        if (impr_plot > 0) impr_plot = 0;
-        if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
-        printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
-            __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
-            *sched, opt->loss_after);
-
-        if (data->millis_per_iter > 0) {
-            printf(" dt=");
-            print_duration(data->millis_per_iter);
-            printf(" eta=");
-            print_duration(remaining_millis);
-        }
-
-        float improvement = opt->loss_before - opt->loss_after;
-        const float plot_scale = 10.0f;
-        int bar_len = (int)(1 + improvement*plot_scale + 0.5);
-        printf(" |");
-        for (int i=0; i<bar_len; ++i) {
-            printf("-");
-        }
-        printf(">");
-        printf("\n");
-    }
-
-    int64_t used_samples = get_example_targets_batch(
-        data->lctx,
-        data->tokens_input,
-        data->target_probs,
-        train->shuffle_next_sample,
-        data->shuffled_samples_begin,
-        data->shuffled_samples_size,
-        data->samples_count,
-        data->tokens_data,
-        data->tokens_size,
-        params->separate_with_eos,
-        params->separate_with_bos,
-        params->fill_with_next_samples);
-
-    train->shuffle_next_sample += used_samples;
-
-    if (train->shuffle_next_sample >= train->shuffle_sample_count) {
-        ++train->train_epochs;
-        printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs);
-        // note: we may have used some samples from the current shuffling more than once
-        train->shuffle_rng_state_current = train->shuffle_rng_state_next;
-        train->shuffle_rng_state_next = shuffle_samples(
-            train->shuffle_rng_state_current,
-            data->shuffled_samples_begin,
-            data->shuffled_samples_size,
-            data->samples_begin,
-            data->samples_size,
-            data->samples_count);
-        train->shuffle_next_sample = 0;
-    }
-}
-
 static int64_t get_parameter_count(struct my_llama_lora* lora) {
     int64_t nx = 0;
     nx += ggml_nelements(lora->tok_embeddings_a);
@@ -2023,7 +1887,7 @@ int main(int argc, char ** argv) {
     save_data.model = &model;
     save_data.lora = &lora;
 
-    struct opt_callback_data opt_cb_data;
+    struct train_opt_callback_data opt_cb_data;
     opt_cb_data.params = &params.common;
     opt_cb_data.train = train;
     opt_cb_data.save_cb = &save_train_files;
@@ -2057,7 +1921,7 @@ int main(int argc, char ** argv) {
 
     int64_t t0 = ggml_time_ms();
 
-    ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data);
+    ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data);
 
     ggml_free(ctx_work);
     ggml_free(ctx_compute);
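finetune points save_cb at its own save_train_files, which writes the checkpoint and LoRA GGUF files. Any function matching the save_train_files_callback signature can be plugged in instead; below is a minimal, hypothetical logging callback (the my_log_save_cb name, the run-name payload, and the include path are illustrative assumptions, not part of the commit):

#include <cstdio>
#include "train.h"  // common/train.h: declares train_state and save_train_files_callback

// Hypothetical save callback: instead of writing GGUF files like save_train_files,
// it only reports the counters that train_opt_callback updates before invoking save_cb.
static void my_log_save_cb(void * data, struct train_state * train) {
    const char * run_name = (const char *) data;  // whatever the caller stored in save_data
    printf("[%s] save requested: iters=%llu samples=%llu tokens=%llu\n",
        run_name,
        (unsigned long long) train->train_its,
        (unsigned long long) train->train_samples,
        (unsigned long long) train->train_tokens);
}

// usage sketch:
//   opt_cb_data.save_cb   = &my_log_save_cb;
//   opt_cb_data.save_data = (void *) "my-run";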
examples/train-text-from-scratch/train-text-from-scratch.cpp

@@ -919,144 +919,6 @@ static void save_train_files(void * vdata, struct train_state * train) {
     }
 }
 
-struct opt_callback_data {
-    struct train_params_common * params;
-    struct train_state * train;
-    save_train_files_callback save_cb;
-    void * save_data;
-    struct llama_context * lctx;
-    int last_save_iter;
-    llama_token * tokens_data;
-    size_t tokens_size;
-    size_t * samples_begin;
-    size_t * samples_size;
-    size_t * shuffled_samples_begin;
-    size_t * shuffled_samples_size;
-    size_t samples_count;
-    struct ggml_tensor * tokens_input;
-    struct ggml_tensor * target_logits;
-    struct ggml_tensor * target_probs;
-    int first_iter;
-    int64_t last_time;
-    double millis_per_iter;
-};
-
-static void opt_callback(void * vdata, int accum_step, float * sched) {
-    struct opt_callback_data * data = (struct opt_callback_data *) vdata;
-    struct train_params_common * params = data->params;
-    struct train_state * train = data->train;
-    struct ggml_opt_context * opt = train->opt;
-    int n_batch = params->n_batch;
-    int n_ctx = params->n_ctx;
-
-    if (accum_step == 0) {
-        // time measurement
-        int64_t now = ggml_time_ms();
-        if (now > data->last_time && opt->iter > data->first_iter) {
-            double dt = now - data->last_time;
-            if (data->millis_per_iter == 0.0) {
-                data->millis_per_iter = dt;
-            } else {
-                const double gain = 0.7;
-                data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain;
-            }
-        }
-
-        double remaining_millis = 0.0;
-        if (data->millis_per_iter > 0.0) {
-            const int n_iter = params->adam_n_iter;
-            const int done_iter = opt->iter - data->first_iter;
-            const int remaining_iter = n_iter - done_iter;
-            remaining_millis = remaining_iter * data->millis_per_iter;
-        }
-
-        // file saving
-        const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
-        if (save_now) {
-            int new_iters = opt->iter - data->last_save_iter;
-            train->train_its += new_iters;
-            train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
-            train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx;
-
-            if (data->save_cb) {
-                data->save_cb(data->save_data, train);
-            }
-
-            data->last_save_iter = opt->iter;
-        }
-
-        // exclude file saving from time measurement, by measuring last_time after saving
-        data->last_time = ggml_time_ms();
-
-        *sched = learning_schedule(
-            opt->iter,
-            params->warmup,
-            params->cos_decay_steps,
-            params->adam_alpha,
-            params->adam_min_alpha,
-            params->cos_decay_min,
-            params->cos_decay_restart,
-            params->enable_restart);
-
-        int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
-        if (impr_plot > 0) impr_plot = 0;
-        if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
-        printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
-            __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
-            *sched, opt->loss_after);
-
-        if (data->millis_per_iter > 0) {
-            printf(" dt=");
-            print_duration(data->millis_per_iter);
-            printf(" eta=");
-            print_duration(remaining_millis);
-        }
-
-        float improvement = opt->loss_before - opt->loss_after;
-        const float plot_scale = 10.0f;
-        int bar_len = (int)(1 + improvement*plot_scale + 0.5);
-        printf(" |");
-        for (int i=0; i<bar_len; ++i) {
-            printf("-");
-        }
-        printf(">");
-        printf("\n");
-    }
-
-    int64_t used_samples = get_example_targets_batch(
-        data->lctx,
-        data->tokens_input,
-        data->target_probs,
-        train->shuffle_next_sample,
-        data->shuffled_samples_begin,
-        data->shuffled_samples_size,
-        data->samples_count,
-        data->tokens_data,
-        data->tokens_size,
-        params->separate_with_eos,
-        params->separate_with_bos,
-        params->fill_with_next_samples);
-
-    train->shuffle_next_sample += used_samples;
-
-    if (train->shuffle_next_sample >= train->shuffle_sample_count) {
-        ++train->train_epochs;
-        printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs);
-        // note: we may have used some samples from the current shuffling more than once
-        train->shuffle_rng_state_current = train->shuffle_rng_state_next;
-        train->shuffle_rng_state_next = shuffle_samples(
-            train->shuffle_rng_state_current,
-            data->shuffled_samples_begin,
-            data->shuffled_samples_size,
-            data->samples_begin,
-            data->samples_size,
-            data->samples_count);
-        train->shuffle_next_sample = 0;
-    }
-
-}
-
 int main(int argc, char ** argv) {
     struct train_params params = get_default_train_params();
@@ -1211,7 +1073,7 @@ int main(int argc, char ** argv) {
     save_data.fn_latest = params.common.fn_latest;
     save_data.model = &model;
 
-    struct opt_callback_data opt_cb_data;
+    struct train_opt_callback_data opt_cb_data;
     opt_cb_data.params = &params.common;
     opt_cb_data.train = train;
     opt_cb_data.save_cb = &save_train_files;
@@ -1226,7 +1088,6 @@ int main(int argc, char ** argv) {
     opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data();
     opt_cb_data.samples_count = train_samples_size.size();
     opt_cb_data.tokens_input = NULL;
-    opt_cb_data.target_logits = NULL;
     opt_cb_data.target_probs = NULL;
     opt_cb_data.first_iter = opt->iter;
     opt_cb_data.last_time = ggml_time_ms();
@@ -1246,10 +1107,7 @@ int main(int argc, char ** argv) {
     ggml_set_no_alloc(ctx0, false);
 
     // don't use alloc for input tensors, so we can safely fill them with data
-    //struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
-    //struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
     struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
-    struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
     struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
 
     ggml_set_no_alloc(ctx0, (alloc != NULL));
@@ -1259,7 +1117,6 @@ int main(int argc, char ** argv) {
     }
 
     opt_cb_data.tokens_input = tokens_input;
-    opt_cb_data.target_logits = target_logits;
     opt_cb_data.target_probs = target_probs;
 
     int n_past = 0;
@@ -1298,7 +1155,7 @@ int main(int argc, char ** argv) {
 
     printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched);
 
-    ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data);
+    ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data);
 
     size_t used_mem_after_opt = ggml_used_mem(ctx0);