From 9f4b1bf88d3e077441d8e249f2bc04f88db917b3 Mon Sep 17 00:00:00 2001
From: xaedes <xaedes@gmail.com>
Date: Sat, 16 Sep 2023 14:58:34 +0200
Subject: [PATCH] move common train functions into common/train.[h|cpp]

---
 Makefile | 9 +-
 common/CMakeLists.txt | 2 +
 common/train.cpp | 914 ++++++++++++++
 common/train.h | 113 ++
 examples/baby-llama/baby-llama.cpp | 170 +--
 examples/finetune/finetune.cpp | 1095 +++--------------
 .../train-text-from-scratch.cpp | 969 +--------------
 7 files changed, 1279 insertions(+), 1993 deletions(-)
 create mode 100644 common/train.cpp
 create mode 100644 common/train.h

diff --git a/Makefile b/Makefile
index 63557d93e..f41ebdc69 100644
--- a/Makefile
+++ b/Makefile
@@ -485,6 +485,9 @@ console.o: common/console.cpp common/console.h
 grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+train.o: common/train.cpp common/train.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -532,7 +535,7 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
@@ -541,13 +544,13 @@ convert-llama2c-to-ggm
 llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
+baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index dead56118..951aa8340 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -9,6 +9,8 @@ add_library(${TARGET} OBJECT
 console.cpp
 grammar-parser.h
 grammar-parser.cpp
+ train.h
+ train.cpp
 )

 if (BUILD_SHARED_LIBS)
diff --git a/common/train.cpp b/common/train.cpp
new file mode 100644
index 000000000..a1e35e5a3
--- /dev/null
+++ b/common/train.cpp
@@ -0,0 +1,914 @@
+#include "train.h"
+#include "common.h"
+
+#include <random>
+#include <sstream>
+#include <functional>
+
+struct random_normal_distribution {
+ std::mt19937 gen;
+ std::normal_distribution<float> rd;
+ float min;
+ float max;
+};
+
+struct random_uniform_distribution {
+ std::mt19937 gen;
+ std::uniform_real_distribution<float> rd;
+};
+
+struct random_normal_distribution * init_random_normal_distribution(int seed, float mean, float std, float min, float max) {
+ struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution));
+ rnd->gen = std::mt19937(seed);
+ rnd->rd = std::normal_distribution<float>{mean, std};
+ rnd->min = min;
+ rnd->max = max;
+ return rnd;
+}
+
+struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max) {
+ struct random_uniform_distribution * rnd = (struct random_uniform_distribution *) malloc(sizeof(struct random_uniform_distribution));
+ rnd->gen = std::mt19937(seed);
+ rnd->rd = std::uniform_real_distribution<float>{min, max};
+ return rnd;
+}
+
+void free_random_normal_distribution (struct random_normal_distribution * rnd) {
+ free(rnd);
+}
+
+void free_random_uniform_distribution(struct random_uniform_distribution * rnd) {
+ free(rnd);
+}
+
+struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
+ float scale = 1.0f; // xavier
+ switch (tensor->n_dims) {
+ case 1:
+ scale /= sqrtf((float) tensor->ne[0]);
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
+ *dst = scale * frand_normal(rnd);
+ }
+ break;
+ case 2:
+ scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+ *dst = scale * frand_normal(rnd);
+ }
+ }
+ break;
+ case 3:
+ scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
+ *dst = scale * frand_normal(rnd);
+ }
+ }
+ }
+ break;
+ case 4:
+ scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
+ for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
+ *dst = scale * frand_normal(rnd);
+ }
+ }
+ }
+ }
+ break;
+ default:
+ GGML_ASSERT(!"Unsupported tensor->n_dims");
+ };
+ return tensor;
+}
+
+struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
+ switch (tensor->n_dims) {
+ case 1:
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
+ *dst = frand_uniform(rnd);
+ }
+ break;
+ case 2:
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+ *dst = frand_uniform(rnd);
+ }
+ }
+ break;
+ case 3:
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
+ *dst = frand_uniform(rnd);
+ }
+ }
+ }
+ break;
+ case 4:
+ for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
+ *dst = frand_uniform(rnd);
+ }
+ }
+ }
+ }
+ break;
+ default:
+ GGML_ASSERT(!"Unsupported tensor->n_dims");
+ };
+ return tensor;
+}
+
+float frand() {
+ return (float)rand()/(float)RAND_MAX;
+}
+
+float frand_normal(struct random_normal_distribution * rnd) {
+ return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max);
+}
+
+float frand_uniform(struct random_uniform_distribution * rnd) {
+ return rnd->rd(rnd->gen);
+}
+
+int clamp(const int v, const int min, const int max) {
+ return ((v < min) ? (min) : (v > max) ? (max) : v);
+}
+
+float fclamp(const float v, const float min, const float max) {
+ return ((v < min) ? (min) : (v > max) ? (max) : v);
+}
+
+void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
+ GGML_ASSERT(tensor->n_dims == 1);
+ GGML_ASSERT(tensor->ne[0] == ne0);
+}
+
+void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
+ GGML_ASSERT(tensor->n_dims == 2);
+ GGML_ASSERT(tensor->ne[0] == ne0);
+ GGML_ASSERT(tensor->ne[1] == ne1);
+}
+
+void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
+ GGML_ASSERT(tensor->n_dims == 3);
+ GGML_ASSERT(tensor->ne[0] == ne0);
+ GGML_ASSERT(tensor->ne[1] == ne1);
+ GGML_ASSERT(tensor->ne[2] == ne2);
+}
+
+void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
+ GGML_ASSERT(tensor->n_dims == 4);
+ GGML_ASSERT(tensor->ne[0] == ne0);
+ GGML_ASSERT(tensor->ne[1] == ne1);
+ GGML_ASSERT(tensor->ne[2] == ne2);
+ GGML_ASSERT(tensor->ne[3] == ne3);
+}
+
+int64_t get_example_targets_batch(
+ struct llama_context * lctx,
+ struct ggml_tensor * tokens_input,
+ struct ggml_tensor * target_probs,
+ int64_t example_id,
+ const size_t * samples_begin,
+ const size_t * samples_size,
+ size_t samples_count,
+ const llama_token * train_data,
+ size_t n_train_data,
+ bool separate_with_eos,
+ bool separate_with_bos,
+ bool fill_with_next_samples) {
+
+ GGML_ASSERT(tokens_input->n_dims == 2);
+ GGML_ASSERT(target_probs->n_dims == 3);
+ int64_t n_vocab = target_probs->ne[0];
+ int64_t n_tokens = tokens_input->ne[0];
+ int64_t n_batch = tokens_input->ne[1];
+ GGML_ASSERT(n_vocab == target_probs->ne[0]);
+ GGML_ASSERT(n_tokens == target_probs->ne[1]);
+ GGML_ASSERT(n_batch == target_probs->ne[2]);
+
+ int64_t used_samples = 0;
+
+ ggml_set_f32(target_probs, 0.0f);
+ llama_token bos = llama_token_bos(lctx);
+ llama_token eos = llama_token_eos(lctx);
+ // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
+ for (int k=0; k<n_batch; ++k) {
+ // printf("%s: batch %d\n", __func__, k);
+ size_t sample_idx = (example_id + used_samples) % samples_count;
+ size_t sample_offs = 0;
+ size_t sample_begin = samples_begin[sample_idx];
+ size_t sample_size = samples_size[sample_idx];
+ ++used_samples;
+
+ GGML_ASSERT(sample_begin+sample_size-1 < n_train_data);
+
+ ggml_set_i32_nd(tokens_input, 0, k, 0, 0, bos);
+ bool sample_separation_eos = !separate_with_eos;
+ bool sample_separation_bos = !separate_with_bos;
+ for (int64_t i=0; i<n_tokens; ++i) {
+ llama_token token = eos;
+ if (sample_offs >= sample_size && fill_with_next_samples) {
+ if (!sample_separation_eos) {
+ // insert eos token to separate samples
+ sample_separation_eos = true;
+ } else if (!sample_separation_bos) {
+ // insert bos token to separate samples
+ sample_separation_bos = true;
+ token = bos;
+ } else {
+ // sample separation is done, continue with next sample
+ sample_separation_eos = !separate_with_eos;
+ sample_separation_bos = !separate_with_bos;
+ sample_offs = 0;
+ sample_idx = (example_id + used_samples) % samples_count;
+ sample_begin = samples_begin[sample_idx];
+ sample_size = samples_size[sample_idx];
+ ++used_samples;
+ }
+ }
+ // note: no else-if here
+ if (sample_offs < sample_size) {
+ token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1));
+ ++sample_offs;
+ }
+ ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f);
+ if (i+1<n_tokens) {
+ ggml_set_i32_nd(tokens_input, (int) (i + 1), (int) k, 0, 0, token);
+ }
+ }
+ }
+
+ return used_samples;
+}
+
+void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) {
+ std::stringstream s_rng_state;
+ s_rng_state.imbue(std::locale::classic());
+ s_rng_state.exceptions(std::stringstream::failbit);
+ s_rng_state.str(rng_state);
+ s_rng_state >> rng;
+}
+
+std::string mt19937_get_state(const std::mt19937& rng) {
+ std::stringstream s_rng_state;
+ s_rng_state.imbue(std::locale::classic());
+ s_rng_state << rng;
+ return s_rng_state.str();
+}
+
+std::string mt19937_seed_to_state(unsigned seed) {
+ std::mt19937 rng(seed);
+ return mt19937_get_state(rng);
+}
+
+std::string shuffle_samples(
+ const std::string & rng_state,
+ size_t * shuffled_begins,
+ size_t * shuffled_sizes,
+ const size_t * begins,
+ const size_t * sizes,
+ size_t count) {
+ if (count == 0) return rng_state;
+
+ std::mt19937 rng;
+ mt19937_set_state(rng, rng_state);
+
+ // sort indices by random value for each index
+ std::vector<size_t> idcs;
+ {
+ std::vector<unsigned> rnd;
+ idcs.resize(count);
+ rnd.resize(count);
+ for (unsigned i=0; i<count; ++i) {
+ idcs[i] = i;
+ rnd[i] = rng();
+ }
+
+ std::sort(idcs.begin(), idcs.end(), [&rnd](size_t a, size_t b){
+ // stable sort for reproducibility
+ return (rnd[a] == rnd[b]) ? (a < b) : (rnd[a] < rnd[b]);
+ });
+ }
+
+ for (unsigned i=0; i<count; ++i) {
+ shuffled_begins[i] = begins[idcs[i]];
+ shuffled_sizes[i] = sizes[idcs[i]];
+ }
+
+ return mt19937_get_state(rng);
+}
+
+size_t hash_combine(size_t h1, size_t h2) {
+ return h1 ^ (h2 << 1);
+}
+
+size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) {
+ std::hash<std::string> h_string;
+ std::hash<unsigned long long> h_ull;
+ size_t h = h_string(std::string(fn));
+ h = hash_combine(h, h_ull((unsigned long long) sample_count));
+ for (size_t i=0; i< sample_count; ++i) {
+ h = hash_combine(h, h_ull((unsigned long long) samples_begin[i]));
+ h = hash_combine(h, h_ull((unsigned long long) samples_size[i]));
+ }
+ return h;
+}
+
+std::string replace_str(const char * s, const char * needle, const char * replacement) {
+ std::string str = s;
+ size_t pos = str.find(needle);
+ if (pos != std::string::npos) {
+ str.replace(pos, strlen(needle), replacement);
+ }
+ return str;
+}
+
+void print_duration(double fmillis) {
+ if (fmillis < 1000.0f) {
+ printf("%.1fms", (float) fmillis);
+ return;
+ }
+ const int64_t one_sec = 1000;
+ const int64_t one_min = one_sec * 60;
+ const int64_t one_hour = one_min * 60;
+ const int64_t one_day = one_hour * 24;
+
+ int64_t millis = (int64_t) fmillis;
+ int64_t days = millis/one_day;
+ int64_t hours = (millis - days*one_day)/one_hour;
+ int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min;
+ int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec;
+
+ // to print int64_t either cast to (long long int) or use macro PRId64 from <inttypes.h>
+ if (days > 0) {
+ printf("%lldd ", (long long int) days);
+ }
+ printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds);
+}
+
+float cosine_decay(int64_t step, int64_t decay_steps, float minimum) {
+ if (step > decay_steps) {
+ step = decay_steps;
+ }
+ const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps));
+ const float decay = (1 - minimum)*cosine_decay + minimum;
+ return decay;
+}
+
+float cosine_decay_restart(int64_t step, int64_t decay_steps, float minimum, float restart_step_mult) {
+ while (step > decay_steps) {
+ step -= decay_steps;
+ decay_steps = (int64_t) (restart_step_mult * decay_steps);
+ }
+ return cosine_decay(step, decay_steps, minimum);
+}
+
+float learning_schedule(
+ int64_t step,
+ int64_t warmup_steps,
+ int64_t cos_decay_steps,
+ float learning_rate,
+ float overall_minimum,
+ float cos_decay_minimum,
+ float cos_decay_restart_step_mult,
+ bool enable_restart) {
+
+ float result =
+ (step < warmup_steps)
+ ? (float) step / (float) warmup_steps
+ : enable_restart
+ ? 
cosine_decay_restart( + step - warmup_steps, + cos_decay_steps, + cos_decay_minimum, + cos_decay_restart_step_mult) + : cosine_decay( + step, + cos_decay_steps, + cos_decay_minimum); + + float min = overall_minimum / learning_rate; + result = min + result * (1.0f - min); + return result; +} + +static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { + GGML_ASSERT(a != NULL); + GGML_ASSERT(b != NULL); + GGML_ASSERT(a->type == b->type); + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); + + return true; +} + +void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { + if (dst == NULL) { + return; + } + struct ggml_tensor * t = ggml_get_tensor(ctx, name); + GGML_ASSERT(are_same_layout(dst, t)); + memcpy(dst->data, t->data, ggml_nbytes(t)); + + if (strlen(ggml_get_name(dst)) == 0) { + ggml_set_name(dst, name); + } +} + +// gguf constants +static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; +static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; +static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; +static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; +static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; +static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; +static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; +static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; +static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; +static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; +static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; +static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; +static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; +static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; + +static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; +static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; +static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; + +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = 
"optimizer.lbfgs.memory_alpha"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; + +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + die_fmt("key not found in model: %s", skey.c_str()); \ + } \ +} + +void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { + // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read + + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); + GGML_ASSERT(file_version == 0); + + GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); + GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); + GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); + + uint64_t nx; + GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); + opt->nx = (size_t) nx; + + // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know + + std::string opt_type; + GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); + if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { + opt->params.type = GGML_OPT_ADAM; + + GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); + GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); + + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + copy_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { + opt->params.type = GGML_OPT_LBFGS; + + GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); + GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); + GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); + GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); + GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); + GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, 
GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); + + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + copy_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + copy_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + copy_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + copy_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + copy_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + copy_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + copy_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); + copy_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); + copy_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); + copy_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); + } else { + throw std::runtime_error("unknown optimizer type\n"); + } +} + +void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); + gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); + gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); + + ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + if (opt->adam.pf) { + ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } + + gguf_add_tensor(fctx, opt->adam.m); + gguf_add_tensor(fctx, opt->adam.v); + if (opt->adam.pf) { + gguf_add_tensor(fctx, opt->adam.pf); + } + } break; + case GGML_OPT_LBFGS: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement); + + ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + if (opt->lbfgs.pf) { + ggml_set_name(opt->lbfgs.pf, 
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
+ }
+ ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
+ ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
+ ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
+ ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
+
+ gguf_add_tensor(fctx, opt->lbfgs.x);
+ gguf_add_tensor(fctx, opt->lbfgs.xp);
+ gguf_add_tensor(fctx, opt->lbfgs.g);
+ gguf_add_tensor(fctx, opt->lbfgs.gp);
+ gguf_add_tensor(fctx, opt->lbfgs.d);
+ if (opt->lbfgs.pf) {
+ gguf_add_tensor(fctx, opt->lbfgs.pf);
+ }
+ gguf_add_tensor(fctx, opt->lbfgs.lmal);
+ gguf_add_tensor(fctx, opt->lbfgs.lmys);
+ gguf_add_tensor(fctx, opt->lbfgs.lms);
+ gguf_add_tensor(fctx, opt->lbfgs.lmy);
+ } break;
+ }
+}
+
+struct llama_file {
+ // use FILE * so we don't have to re-open the file to mmap
+ FILE * fp;
+ size_t size;
+
+ llama_file(const char * fname, const char * mode) {
+ fp = std::fopen(fname, mode);
+ if (fp == NULL) {
+ size = 0;
+ } else {
+ seek(0, SEEK_END);
+ size = tell();
+ seek(0, SEEK_SET);
+ }
+ }
+
+ size_t tell() const {
+#ifdef _WIN32
+ __int64 ret = _ftelli64(fp);
+#else
+ long ret = std::ftell(fp);
+#endif
+ GGML_ASSERT(ret != -1); // this really shouldn't fail
+ return (size_t) ret;
+ }
+
+ void seek(size_t offset, int whence) {
+#ifdef _WIN32
+ int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+ int ret = std::fseek(fp, (long) offset, whence);
+#endif
+ GGML_ASSERT(ret == 0); // same
+ }
+
+ void read_raw(void * ptr, size_t size) {
+ if (size == 0) {
+ return;
+ }
+ errno = 0;
+ std::size_t ret = std::fread(ptr, size, 1, fp);
+ if (ferror(fp)) {
+ die_fmt("read error: %s", strerror(errno));
+ }
+ if (ret != 1) {
+ die_fmt("unexpectedly reached end of file");
+ }
+ }
+
+ std::uint32_t read_u32() {
+ std::uint32_t ret;
+ read_raw(&ret, sizeof(ret));
+ return ret;
+ }
+
+ std::string read_string(std::uint32_t len) {
+ std::vector<char> chars(len);
+ read_raw(chars.data(), len);
+ return std::string(chars.data(), len);
+ }
+
+ void write_raw(const void * ptr, size_t size) {
+ if (size == 0) {
+ return;
+ }
+ errno = 0;
+ size_t ret = std::fwrite(ptr, size, 1, fp);
+ if (ret != 1) {
+ die_fmt("write error: %s", strerror(errno));
+ }
+ }
+
+ void write_u32(std::uint32_t val) {
+ write_raw(&val, sizeof(val));
+ }
+
+ ~llama_file() {
+ if (fp) {
+ std::fclose(fp);
+ }
+ }
+};
+
+static size_t utf8_len(char src) {
+ const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+ uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+ return lookup[highbits];
+}
+
+// mark each byte with its utf8 unit number.
+// returns the number of utf8 characters.
+// e.g. when bytes == '\x61\xD0\xB0\x62',
+// then utf8_units will become [0,0,1,0]
+// utf8_nunits will become [1,2,2,1] and 3 is returned.
+// bytes where utf8_units is zero, are the begin of an utf8 character.
+static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) {
+ size_t offs = 0;
+ size_t count_utf8 = 0;
+ while(offs < count) {
+ int len = (int) utf8_len(bytes[offs]);
+ for (int i=0; i<len; ++i) {
+ utf8_units[offs+i] = i;
+ utf8_nunits[offs+i] = len;
+ }
+ offs += len;
+ ++count_utf8;
+ }
+ return count_utf8;
+}
+
+size_t tokenize_file(
+ struct llama_context * lctx,
+ const char * filename,
+ const std::string & sample_start,
+ bool include_sample_start,
+ bool overlapping_samples,
+ unsigned context_length,
+ std::vector<llama_token> & out_tokens,
+ std::vector<size_t> & out_samples_begin,
+ std::vector<size_t> & out_samples_size) {
+ struct llama_file f(filename, "rb");
+
+ if (f.size == 0) {
+ out_tokens.clear();
+ out_samples_begin.clear();
+ out_samples_size.clear();
+ printf("%s: warning: empty or not existing training data file '%s'\n",
+ __func__, filename);
+ return out_tokens.size();
+ }
+
+ // account for possible leading whitespace that will be added by tokenizer
+ // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
+ const int n_max_tokens_overhead = 1;
+
+ std::vector<char> buf;
+ buf.resize(f.size+1);
+
+ f.read_raw(buf.data(), f.size);
+ buf[f.size] = '\0';
+
+ std::vector<int> utf8_units;
+ std::vector<int> utf8_nunits;
+ utf8_units.resize(buf.size());
+ utf8_nunits.resize(buf.size());
+ size_t n_utf8_chars = mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size());
+
+ if (sample_start.size() == 0) {
+ // tokenize all data at once
+ out_tokens.resize(buf.size() + n_max_tokens_overhead);
+
+ int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), (int) out_tokens.size(), false);
+ if (n_tokens < 0) {
+ out_tokens.resize(-n_tokens);
+ n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), (int) out_tokens.size(), false);
+ }
+ if (n_tokens >= 0) {
+ out_tokens.resize(n_tokens);
+ }
+
+ // generate sample starts at all token positions
+ out_samples_begin.clear();
+ out_samples_begin.push_back(0);
+ out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
+ size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0;
+ for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) {
+ out_samples_begin.push_back(sample_begin);
+ out_samples_size.push_back(context_length);
+ }
+ } else {
+ // split data into samples and tokenize each sample
+ std::string data_str(buf.data(), buf.size()-1);
+ out_samples_begin.clear();
+ out_samples_size.clear();
+ out_tokens.clear();
+
+ // find all positions of pattern sample_start
+ size_t sample_begin = data_str.find(sample_start, 0);
+ while (sample_begin != std::string::npos) {
+ out_samples_begin.push_back(sample_begin);
+ const size_t search_start = sample_begin + sample_start.size();
+ sample_begin = data_str.find(sample_start, search_start);
+ }
+ if (out_samples_begin.size() == 0) {
+ printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n",
+ __func__, sample_start.c_str());
+ out_samples_begin.push_back(0);
+ }
+
+ out_samples_size.resize(out_samples_begin.size(), 0);
+
+ std::vector<char> buf_sample;
+ std::vector<llama_token> tok_sample;
+
+ const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size());
+ size_t found_too_big_sample = 0;
+ size_t found_too_small_sample = 0;
+ size_t found_empty_sample = 0;
+ size_t found_min_sample_size = SIZE_MAX;
+ size_t found_max_sample_size = 0;
+
+ size_t max_token_text_size = 0;
+ int n_vocab = llama_n_vocab(lctx);
+ for (llama_token token=0; token < n_vocab; ++token) {
+ max_token_text_size = std::max(
+ max_token_text_size,
+ strlen(llama_token_get_text(lctx, token)));
+ }
+
+ // upper bound of context byte length.
+ // strings with this byte length should always tokenize to at least context_length tokens.
+ size_t context_byte_len = max_token_text_size*context_length;
+
+ for (unsigned i=0; i<out_samples_begin.size(); ++i) {
+ // determine sample begin and end from pattern positions
+ size_t sample_begin = out_samples_begin[i] + sample_begin_offset;
+ size_t sample_end = overlapping_samples
+ ? std::min(data_str.size(), sample_begin + context_byte_len)
+ : (i+1 < out_samples_begin.size()
+ ? out_samples_begin[i+1]
+ : data_str.size());
+ if (utf8_units[sample_end] > 0) {
+ // sample end is in the middle of an utf8 character.
+ // advance sample_end to the begin of the next utf8 character.
+ sample_end += utf8_nunits[sample_end] - utf8_units[sample_end];
+ }
+ size_t sample_size = sample_end - sample_begin;
+ if (sample_size == 0) {
+ ++found_empty_sample;
+ }
+
+ if (sample_size > 0) {
+ // llama_tokenize expects zero terminated string,
+ // copy sample into buffer and zero terminate it.
+ buf_sample.resize(sample_size+1);
+ memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
+ buf_sample[sample_size] = '\0';
+
+ // printf("sample: '%s'\n", buf_sample.data());
+
+ // tokenize the sample
+ tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
+ int n_tokens = llama_tokenize(lctx,
+ buf_sample.data(),
+ tok_sample.data(),
+ (int) tok_sample.size(), false);
+ if (n_tokens < 0) {
+ tok_sample.resize(-n_tokens);
+ n_tokens = llama_tokenize(lctx,
+ buf_sample.data(),
+ tok_sample.data(),
+ (int) tok_sample.size(), false);
+ GGML_ASSERT(n_tokens >= 0);
+ }
+ GGML_ASSERT(n_tokens <= (int) tok_sample.size());
+
+ if ((size_t) n_tokens > context_length) {
+ ++found_too_big_sample;
+ } else if ((size_t) n_tokens < context_length) {
+ ++found_too_small_sample;
+ }
+ found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens);
+ found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens);
+
+ // write out tokens, start and size of sample
+ // overwrite the string start position with the token start position
+ out_samples_begin[i] = out_tokens.size();
+ out_samples_size[i] = (size_t) n_tokens;
+ out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens);
+ } else {
+ out_samples_begin[i] = out_tokens.size();
+ out_samples_size[i] = 0;
+ }
+
+ }
+ if (found_too_big_sample > 0) {
+ printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. samples will be cut off.\n",
+ __func__, found_too_big_sample, found_max_sample_size, context_length);
+ }
+
+ if (found_too_small_sample > 0) {
+ printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n",
+ __func__, found_too_small_sample, found_min_sample_size, context_length);
+ }
+
+ if (found_empty_sample) {
+ printf("%s: warning: found %zu empty samples.\n",
+ __func__, found_empty_sample);
+ }
+ }
+ printf("%s: total number of samples: %zu\n",
+ __func__, out_samples_begin.size());
+
+ GGML_ASSERT(out_samples_begin.size() == out_samples_size.size());
+
+ return out_tokens.size();
+}
diff --git a/common/train.h b/common/train.h
new file mode 100644
index 000000000..9d629beb7
--- /dev/null
+++ b/common/train.h
@@ -0,0 +1,113 @@
+// Various helper functions and utilities for training
+
+#pragma once
+
+#include <random>
+#include <string>
+#include <vector>
+
+#include "ggml.h"
+#include "llama.h"
+
+struct random_normal_distribution;
+struct random_uniform_distribution;
+
+struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
+struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
+
+void free_random_normal_distribution (struct random_normal_distribution * rnd);
+void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
+
+struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
+struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
+
+float frand();
+float frand_normal (struct random_normal_distribution * rnd);
+float frand_uniform(struct random_uniform_distribution * rnd);
+
+int clamp (const int v, const int min, const int max);
+float fclamp(const float v, const float min, const float max);
+
+void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
+void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
+void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
+void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
+
+size_t tokenize_file(
+ struct llama_context * lctx,
+ const char * filename,
+ const std::string & sample_start,
+ bool include_sample_start,
+ bool overlapping_samples,
+ unsigned context_length,
+ std::vector<llama_token> & out_tokens,
+ std::vector<size_t> & out_samples_begin,
+ std::vector<size_t> & out_samples_size);
+
+int64_t get_example_targets_batch(
+ struct llama_context * lctx,
+ struct ggml_tensor * tokens_input,
+ struct ggml_tensor * target_probs,
+ int64_t example_id,
+ const size_t * samples_begin,
+ const size_t * samples_size,
+ size_t samples_count,
+ const llama_token * train_data,
+ size_t n_train_data,
+ bool separate_with_eos,
+ bool separate_with_bos,
+ bool fill_with_next_samples);
+
+typedef std::string mt19937_state;
+
+void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
+mt19937_state mt19937_get_state(const std::mt19937& rng);
+mt19937_state mt19937_seed_to_state(unsigned seed);
+
+mt19937_state shuffle_samples(
+ const mt19937_state & rng_state,
+ size_t * shuffled_begins,
+ size_t * shuffled_sizes,
+ const size_t * begins,
+ const size_t * sizes,
+ size_t count);
+
+size_t hash_combine(size_t h1, size_t h2);
+
+size_t compute_samples_hash(
+ const char* fn,
+ const size_t* samples_begin,
+ const size_t* samples_size,
+ size_t sample_count);
+
+
+std::string replace_str(const char * s, const char * needle, const char * replacement);
+
+void print_duration(double milliseconds);
+
+float cosine_decay(
+ int64_t step,
+ int64_t decay_steps,
+ float minimum);
+
+float cosine_decay_restart(
+ int64_t step,
+ int64_t decay_steps,
+ float minimum,
+ float restart_step_mult);
+
+float learning_schedule(
+ int64_t step,
+ int64_t warmup_steps,
+ int64_t decay_steps,
+ float learning_rate,
+ float overall_minimum,
+ float cos_decay_minimum,
+ float cos_decay_restart_step_mult,
+ bool enable_restart);
+
+void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
+
+void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
+void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);

diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index a99ece9a6..e7a8e4577 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "train.h"
 #include
 #include
 #include
@@ -14,29 +15,6 @@ static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 static const float rms_norm_eps = 5e-6f;
 #endif

-float frand() {
- return (float)rand()/(float)RAND_MAX;
-}
-
-struct random_normal_distribution {
- std::mt19937 gen;
- std::normal_distribution<float> nd;
- float min;
- float max;
-};
-
-void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
- rnd->gen = std::mt19937(seed);
- rnd->nd = std::normal_distribution<float>{mean, std};
- rnd->min = min;
- rnd->max = max;
-}
-
-float frand_normal(struct random_normal_distribution * rnd) {
- const float r = rnd->nd(rnd->gen);
- return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
-}
-
 void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
 struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
@@ -95,56 +73,6 @@ struct ggml_tensor * randomize_tensor(
 return tensor;
 }

-struct ggml_tensor * randomize_tensor_normal(
- struct ggml_tensor * tensor,
- int ndims,
- const int64_t ne[],
- struct random_normal_distribution * rnd) {
- float scale = 1.0; // xavier
- switch (ndims) {
- case 1:
- scale /= sqrtf(ne[0]);
- for (int i0 = 0; i0 < ne[0]; i0++) {
- ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
- }
- break;
- case 2:
- scale /= sqrtf(ne[0]+ne[1]);
- for (int i1 = 0; i1 < ne[1]; i1++) {
- for (int i0 = 0; i0 < ne[0]; i0++) {
- ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
- }
- }
- break;
- case 3:
- scale /= sqrtf(ne[0]+ne[1]);
- for (int i2 = 0; i2 < ne[2]; i2++) {
- for (int i1 = 0; i1 < ne[1]; i1++) {
- for (int i0 = 0; i0 < ne[0]; i0++) {
- ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
- }
- }
- }
- break;
- case 4:
- scale /= sqrtf(ne[0]+ne[1]);
- for (int i3 = 0; i3 < ne[3]; i3++) {
- for (int i2 = 0; i2 < ne[2]; i2++) {
- for (int i1 = 0; i1 < ne[1]; i1++) {
- for (int i0 = 0; i0 < ne[0]; i0++) {
- ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
- }
- }
- }
- }
- break;
- default:
- assert(false);
- };
-
- return tensor;
-}
-
 struct llama_hparams {
 uint32_t n_vocab = 32000;
 uint32_t n_ctx = 512; // this is provided as user input? 
@@ -402,27 +330,29 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std const uint32_t n_layer = hparams.n_layer; - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); - randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd); - randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd); - randomize_tensor_normal(model->output, model->output->n_dims, model->output->ne, &rnd); + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings , rnd); + randomize_tensor_normal(model->norm , rnd); + randomize_tensor_normal(model->output , rnd); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd); + randomize_tensor_normal(layer.attention_norm, rnd); - randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd); - randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd); - randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd); - randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd); + randomize_tensor_normal(layer.wq, rnd); + randomize_tensor_normal(layer.wk, rnd); + randomize_tensor_normal(layer.wv, rnd); + randomize_tensor_normal(layer.wo, rnd); - randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd); + randomize_tensor_normal(layer.ffn_norm, rnd); - randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd); - randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd); - randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd); + randomize_tensor_normal(layer.w1, rnd); + randomize_tensor_normal(layer.w2, rnd); + randomize_tensor_normal(layer.w3, rnd); } + + free_random_normal_distribution(rnd); } @@ -431,32 +361,34 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, const uint32_t n_layer = hparams.n_layer; - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); - randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd); - randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd); - randomize_tensor_normal(model->outputa, model->outputa->n_dims, model->outputa->ne, &rnd); - randomize_tensor_normal(model->outputb, model->outputb->n_dims, model->outputb->ne, &rnd); + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings, rnd); + randomize_tensor_normal(model->norm , rnd); + randomize_tensor_normal(model->outputa , rnd); + randomize_tensor_normal(model->outputb , rnd); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd); + randomize_tensor_normal(layer.attention_norm, rnd); - randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd); - randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd); - randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd); - randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd); - 
randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd);
- randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
- randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
- randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);
+ randomize_tensor_normal(layer.wqa, rnd);
+ randomize_tensor_normal(layer.wqb, rnd);
+ randomize_tensor_normal(layer.wka, rnd);
+ randomize_tensor_normal(layer.wkb, rnd);
+ randomize_tensor_normal(layer.wva, rnd);
+ randomize_tensor_normal(layer.wvb, rnd);
+ randomize_tensor_normal(layer.woa, rnd);
+ randomize_tensor_normal(layer.wob, rnd);

- randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
+ randomize_tensor_normal(layer.ffn_norm, rnd);

- randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
- randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
- randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+ randomize_tensor_normal(layer.w1, rnd);
+ randomize_tensor_normal(layer.w2, rnd);
+ randomize_tensor_normal(layer.w3, rnd);
 }
+
+ free_random_normal_distribution(rnd);
 }

 bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
@@ -756,32 +688,6 @@ struct ggml_tensor * forward(
 return inpL;
 }

-void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
- GGML_ASSERT(tensor->n_dims == 1);
- GGML_ASSERT(tensor->ne[0] == ne0);
-}
-
-void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
- GGML_ASSERT(tensor->n_dims == 2);
- GGML_ASSERT(tensor->ne[0] == ne0);
- GGML_ASSERT(tensor->ne[1] == ne1);
-}
-
-void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
- GGML_ASSERT(tensor->n_dims == 3);
- GGML_ASSERT(tensor->ne[0] == ne0);
- GGML_ASSERT(tensor->ne[1] == ne1);
- GGML_ASSERT(tensor->ne[2] == ne2);
-}
-
-void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
- GGML_ASSERT(tensor->n_dims == 4);
- GGML_ASSERT(tensor->ne[0] == ne0);
- GGML_ASSERT(tensor->ne[1] == ne1);
- GGML_ASSERT(tensor->ne[2] == ne2);
- GGML_ASSERT(tensor->ne[3] == ne3);
-}
-
 struct ggml_tensor * forward_batch(
 struct llama_model * model,
 struct llama_kv_cache * cache,
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 623e1bcad..ce6f28bad 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -2,6 +2,7 @@
 #include "ggml-alloc.h"
 #include "llama.h"
 #include "common.h"
+#include "train.h"
 #include
 #include
 #include
@@ -13,7 +14,6 @@
 #include
 #include
 #include
-#include

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -21,143 +21,6 @@

 static const size_t tensor_alignment = 32;

-struct random_normal_distribution {
- std::mt19937 gen;
- std::normal_distribution<float> rd;
- float min;
- float max;
-};
-
-struct random_uniform_distribution {
- std::mt19937 gen;
- std::uniform_real_distribution<float> rd;
-};
-
-static void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
- rnd->gen = std::mt19937(seed);
- rnd->rd = std::normal_distribution<float>{mean, std};
- rnd->min = min;
- rnd->max = max;
-}
-
-static void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) {
- rnd->gen = std::mt19937(seed);
- rnd->rd = std::uniform_real_distribution<float>{min, max};
-}
-
-static int 
clamp(const int v, const int min, const int max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -static float fclamp(const float v, const float min, const float max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -static float frand() { - return (float)rand()/(float)RAND_MAX; -} - -static float frand_normal(struct random_normal_distribution * rnd) { - return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); -} - -static float frand_uniform(struct random_uniform_distribution * rnd) { - return rnd->rd(rnd->gen); -} - -static struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { - float scale = 1.0f; // xavier - switch (tensor->n_dims) { - case 1: - scale /= sqrtf(tensor->ne[0]); - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = scale * frand_normal(rnd); - } - break; - case 2: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = scale * frand_normal(rnd); - } - } - break; - case 3: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = scale * frand_normal(rnd); - } - } - } - break; - case 4: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = scale * frand_normal(rnd); - } - } - } - } - break; - default: - assert(false); - }; - return tensor; -} - -static struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { - switch (tensor->n_dims) { - case 1: - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = frand_uniform(rnd); - } - break; - case 2: - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = frand_uniform(rnd); - } - } - break; - case 3: - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = frand_uniform(rnd); - } - } - } - break; - case 4: - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = frand_uniform(rnd); - } - } - } - } - break; - default: - assert(false); - }; - return tensor; -} - struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; @@ -299,40 +162,6 @@ struct my_llama_lora { }; // gguf constants -static const char * 
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; -static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; -static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; -static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; -static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; -static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; -static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; -static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; -static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; -static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; -static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; -static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; -static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; -static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; - -static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; -static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; -static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; - -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; - static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; static const char * LLM_KV_TRAINING_TYPE = "training.type"; @@ -719,66 +548,41 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { const uint32_t n_layer = lora->layers.size(); - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); - 
randomize_tensor_normal(lora->tok_embeddings_a, &rnd); - randomize_tensor_normal(lora->tok_embeddings_b, &rnd); - randomize_tensor_normal(lora->norm_a, &rnd); - randomize_tensor_normal(lora->norm_b, &rnd); - randomize_tensor_normal(lora->output_a, &rnd); - randomize_tensor_normal(lora->output_b, &rnd); + randomize_tensor_normal(lora->tok_embeddings_a, rnd); + randomize_tensor_normal(lora->tok_embeddings_b, rnd); + randomize_tensor_normal(lora->norm_a, rnd); + randomize_tensor_normal(lora->norm_b, rnd); + randomize_tensor_normal(lora->output_a, rnd); + randomize_tensor_normal(lora->output_b, rnd); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = lora->layers[i]; - randomize_tensor_normal(layer.attention_norm_a, &rnd); - randomize_tensor_normal(layer.attention_norm_b, &rnd); + randomize_tensor_normal(layer.attention_norm_a, rnd); + randomize_tensor_normal(layer.attention_norm_b, rnd); - randomize_tensor_normal(layer.wq_a, &rnd); - randomize_tensor_normal(layer.wq_b, &rnd); - randomize_tensor_normal(layer.wk_a, &rnd); - randomize_tensor_normal(layer.wk_b, &rnd); - randomize_tensor_normal(layer.wv_a, &rnd); - randomize_tensor_normal(layer.wv_b, &rnd); - randomize_tensor_normal(layer.wo_a, &rnd); - randomize_tensor_normal(layer.wo_b, &rnd); + randomize_tensor_normal(layer.wq_a, rnd); + randomize_tensor_normal(layer.wq_b, rnd); + randomize_tensor_normal(layer.wk_a, rnd); + randomize_tensor_normal(layer.wk_b, rnd); + randomize_tensor_normal(layer.wv_a, rnd); + randomize_tensor_normal(layer.wv_b, rnd); + randomize_tensor_normal(layer.wo_a, rnd); + randomize_tensor_normal(layer.wo_b, rnd); - randomize_tensor_normal(layer.ffn_norm_a, &rnd); - randomize_tensor_normal(layer.ffn_norm_b, &rnd); + randomize_tensor_normal(layer.ffn_norm_a, rnd); + randomize_tensor_normal(layer.ffn_norm_b, rnd); - randomize_tensor_normal(layer.w1_a, &rnd); - randomize_tensor_normal(layer.w1_b, &rnd); - randomize_tensor_normal(layer.w2_a, &rnd); - randomize_tensor_normal(layer.w2_b, &rnd); - randomize_tensor_normal(layer.w3_a, &rnd); - randomize_tensor_normal(layer.w3_b, &rnd); + randomize_tensor_normal(layer.w1_a, rnd); + randomize_tensor_normal(layer.w1_b, rnd); + randomize_tensor_normal(layer.w2_a, rnd); + randomize_tensor_normal(layer.w2_b, rnd); + randomize_tensor_normal(layer.w3_a, rnd); + randomize_tensor_normal(layer.w3_b, rnd); } -} -static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { - GGML_ASSERT(tensor->n_dims == 1); - GGML_ASSERT(tensor->ne[0] == ne0); -} - -static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { - GGML_ASSERT(tensor->n_dims == 2); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); -} - -static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(tensor->n_dims == 3); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); -} - -static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(tensor->n_dims == 4); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); - GGML_ASSERT(tensor->ne[3] == ne3); + free_random_normal_distribution(rnd); } static struct ggml_tensor * llama_build_lora_finetune_graphs( @@ -1019,476 +823,6 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( return t36; } -static int get_example_targets_batch( - struct llama_context * lctx, - const size_t * 
samples_begin, - const size_t * samples_size, - size_t samples_count, - const llama_token * train_data, - size_t n_train_data, - int example_id, - struct ggml_tensor * tokens_input, - struct ggml_tensor * target_probs, - bool separate_with_eos, - bool separate_with_bos, - bool fill_with_next_samples) { - - GGML_ASSERT(tokens_input->n_dims == 2); - GGML_ASSERT(target_probs->n_dims == 3); - int n_vocab = target_probs->ne[0]; - int n_tokens = tokens_input->ne[0]; - int n_batch = tokens_input->ne[1]; - GGML_ASSERT(n_vocab == target_probs->ne[0]); - GGML_ASSERT(n_tokens == target_probs->ne[1]); - GGML_ASSERT(n_batch == target_probs->ne[2]); - - int used_samples = 0; - - ggml_set_f32(target_probs, 0.0f); - int bos = llama_token_bos(lctx); - int eos = llama_token_eos(lctx); - // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); - for (int k=0; k= sample_size && fill_with_next_samples) { - if (!sample_separation_eos) { - // insert eos token to separate samples - sample_separation_eos = true; - } else if (!sample_separation_bos) { - // insert bos token to separate samples - sample_separation_bos = true; - token = bos; - } else { - // sample separation is done, continue with next sample - sample_separation_eos = !separate_with_eos; - sample_separation_bos = !separate_with_bos; - sample_offs = 0; - sample_idx = (example_id + used_samples) % samples_count; - sample_begin = samples_begin[sample_idx]; - sample_size = samples_size[sample_idx]; - ++used_samples; - } - } - // note: no else-if here - if (sample_offs < sample_size) { - token = clamp(train_data[sample_begin+sample_offs], 0, n_vocab-1); - ++sample_offs; - } - ggml_set_f32_nd(target_probs, token, i, k, 0, +1.0f); - if (i+1= 0 && size < INT_MAX); - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; - } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); - } - } - - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } - } - - void 
write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -static size_t utf8_len(char src) { - const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - uint8_t highbits = static_cast(src) >> 4; - return lookup[highbits]; -} - -// mark each byte with its utf8 unit number. -// returns the number of utf8 characters. -// e.g. when bytes == '\x61\xD0\xB0\x62', -// then utf8_units will become [0,0,1,0] -// utf8_nunits will become [1,2,2,1] and 3 is returned. -// bytes where utf8_units is zero, are the begin of an utf8 character. -static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) { - size_t offs = 0; - size_t count_utf8 = 0; - while(offs < count) { - size_t len = utf8_len(bytes[offs]); - for (size_t i=0; i & out_tokens, - std::vector & out_samples_begin, - std::vector & out_samples_size) { - struct llama_file f(filename, "rb"); - - if (f.size == 0) { - out_tokens.clear(); - out_samples_begin.clear(); - out_samples_size.clear(); - printf("%s: warning: empty or not existing training data file '%s'\n", - __func__, filename); - return out_tokens.size(); - } - - // account for possible leading whitespace that will be added by tokenizer - // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12] - const int n_max_tokens_overhead = 1; - - std::vector buf; - buf.resize(f.size+1); - - f.read_raw(buf.data(), f.size); - buf[f.size] = '\0'; - - std::vector utf8_units; - std::vector utf8_nunits; - utf8_units.resize(buf.size()); - utf8_nunits.resize(buf.size()); - size_t n_utf8_chars = mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size()); - - if (sample_start.size() == 0) { - // tokenize all data at once - out_tokens.resize(buf.size() + n_max_tokens_overhead); - - int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false); - if (n_tokens < 0) { - out_tokens.resize(-n_tokens); - n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false); - } - if (n_tokens >= 0) { - out_tokens.resize(n_tokens); - } - - // generate sample starts at all token positions - out_samples_begin.clear(); - out_samples_begin.push_back(0); - out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size())); - size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0; - for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) { - out_samples_begin.push_back(sample_begin); - out_samples_size.push_back(context_length); - } - } else { - // split data into samples and tokenize each sample - std::string data_str(buf.data(), buf.size()-1); - out_samples_begin.clear(); - out_samples_size.clear(); - out_tokens.clear(); - - // find all positions of pattern sample_start - size_t sample_begin = data_str.find(sample_start, 0); - while (sample_begin != std::string::npos) { - out_samples_begin.push_back(sample_begin); - const size_t search_start = sample_begin + sample_start.size(); - sample_begin = data_str.find(sample_start, search_start); - } - if (out_samples_begin.size() == 0) { - printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n", - __func__, sample_start.c_str()); - out_samples_begin.push_back(0); - } - - out_samples_size.resize(out_samples_begin.size(), 0); - - std::vector buf_sample; - std::vector tok_sample; - - const size_t sample_begin_offset = (include_sample_start ? 
0 : sample_start.size()); - size_t found_too_big_sample = 0; - size_t found_too_small_sample = 0; - size_t found_empty_sample = 0; - size_t found_min_sample_size = SIZE_MAX; - size_t found_max_sample_size = 0; - - size_t max_token_text_size = 0; - int n_vocab = llama_n_vocab(lctx); - for (llama_token token=0; token < n_vocab; ++token) { - max_token_text_size = std::max( - max_token_text_size, - strlen(llama_token_get_text(lctx, token))); - } - - // upper bound of context byte length. - // strings with this byte length should always tokenize to at least context_length tokens. - size_t context_byte_len = max_token_text_size*context_length; - - for (unsigned i=0; i 0) { - // sample end is in the middle of an utf8 character. - // advance sample_end to the begin of the next utf8 character. - sample_end += utf8_nunits[sample_end] - utf8_units[sample_end]; - } - size_t sample_size = sample_end - sample_begin; - if (sample_size == 0) { - ++found_empty_sample; - } - - if (sample_size > 0) { - // llama_tokenize expects zero terminated string, - // copy sample into buffer and zero terminate it. - buf_sample.resize(sample_size+1); - memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size); - buf_sample[sample_size] = '\0'; - - // printf("sample: '%s'\n", buf_sample.data()); - - // tokenize the sample - tok_sample.resize(buf_sample.size() + n_max_tokens_overhead); - int n_tokens = llama_tokenize(lctx, - buf_sample.data(), - tok_sample.data(), - tok_sample.size(), false); - if (n_tokens < 0) { - tok_sample.resize(-n_tokens); - n_tokens = llama_tokenize(lctx, - buf_sample.data(), - tok_sample.data(), - tok_sample.size(), false); - GGML_ASSERT(n_tokens >= 0); - } - GGML_ASSERT(n_tokens <= tok_sample.size()); - - if ((size_t) n_tokens > context_length) { - ++found_too_big_sample; - } else if ((size_t) n_tokens < context_length) { - ++found_too_small_sample; - } - found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens); - found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens); - - // write out tokens, start and size of sample - // overwrite the string start position with the token start position - out_samples_begin[i] = out_tokens.size(); - out_samples_size[i] = (size_t) n_tokens; - out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens); - } else { - out_samples_begin[i] = out_tokens.size(); - out_samples_size[i] = 0; - } - - } - if (found_too_big_sample > 0) { - printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. 
samples will be cut off.\n", - __func__, found_too_big_sample, found_max_sample_size, context_length); - } - - if (found_too_small_sample > 0) { - printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n", - __func__, found_too_small_sample, found_min_sample_size, context_length); - } - - if (found_empty_sample) { - printf("%s: warning: found %zu empty samples.\n", - __func__, found_empty_sample); - } - } - printf("%s: total number of samples: %zu\n", - __func__, out_samples_begin.size()); - - GGML_ASSERT(out_samples_begin.size() == out_samples_size.size()); - - return out_tokens.size(); -} - -static void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) { - std::stringstream s_rng_state; - s_rng_state.imbue(std::locale::classic()); - s_rng_state.exceptions(std::stringstream::failbit); - s_rng_state.str(rng_state); - s_rng_state >> rng; -} - -static std::string mt19937_get_state(const std::mt19937& rng) { - std::stringstream s_rng_state; - s_rng_state.imbue(std::locale::classic()); - s_rng_state << rng; - return s_rng_state.str(); -} - -static std::string mt19937_seed_to_state(unsigned seed) { - std::mt19937 rng(seed); - return mt19937_get_state(rng); -} - -static std::string shuffle_samples( - const std::string & rng_state, - const size_t * begins, - const size_t * sizes, - size_t * shuffled_begins, - size_t * shuffled_sizes, - size_t count) { - if (count == 0) return rng_state; - - std::mt19937 rng; - mt19937_set_state(rng, rng_state); - - // sort indices by random value for each index - std::vector idcs; - { - std::vector rnd; - idcs.resize(count); - rnd.resize(count); - for (unsigned i=0; i= 0) { \ enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ if (ktype != (type)) { \ - throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \ + die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ } \ (dst) = func(ctx, kid); \ } else if (req) { \ - throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ + die_fmt("key not found in model: %s", skey.c_str()); \ } \ } -static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { - GGML_ASSERT(a != NULL); - GGML_ASSERT(b != NULL); - GGML_ASSERT(a->type == b->type); - GGML_ASSERT(ggml_are_same_shape(a, b)); - GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); - - return true; -} - -static void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { - if (dst == NULL) { - return; - } - struct ggml_tensor * t = ggml_get_tensor(ctx, name); - GGML_ASSERT(are_same_layout(dst, t)); - memcpy(dst->data, t->data, ggml_nbytes(t)); - - if (strlen(ggml_get_name(dst)) == 0) { - ggml_set_name(dst, name); - } -} - static void load_default_lora_params_from_base_model(const char * fn_base_model, struct my_llama_lora_hparams * lora_params) { if (strlen(fn_base_model) == 0) { return; @@ -1558,131 +869,7 @@ static void load_default_lora_params_from_base_model(const char * fn_base_model, gguf_free(fctx); } -static void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { - // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read - uint32_t file_version; - GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); - GGML_ASSERT(file_version == 0); - - GGUF_GET_KEY(fctx, opt->params.past, 
gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); - GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); - GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); - - uint64_t nx; - GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); - opt->nx = (size_t) nx; - - // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know - - std::string opt_type; - GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); - if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { - opt->params.type = GGML_OPT_ADAM; - - GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); - GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); - GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); - - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { - opt->params.type = GGML_OPT_LBFGS; - - GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); - GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); - GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); - GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); - GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); - GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); - GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); - - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - read_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - read_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - read_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - read_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - read_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - read_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - read_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - read_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - } else { - throw std::runtime_error("unknown optimizer type\n"); - } -} - -static void save_opt_context_gguf(struct gguf_context * fctx, 
struct ggml_opt_context * opt) { - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); - gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); - gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); - - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); - - ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - if (opt->adam.pf) { - ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } - - gguf_add_tensor(fctx, opt->adam.m); - gguf_add_tensor(fctx, opt->adam.v); - if (opt->adam.pf) { - gguf_add_tensor(fctx, opt->adam.pf); - } - } break; - case GGML_OPT_LBFGS: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement); - - ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - if (opt->lbfgs.pf) { - ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - } - ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - - gguf_add_tensor(fctx, opt->lbfgs.x); - gguf_add_tensor(fctx, opt->lbfgs.xp); - gguf_add_tensor(fctx, opt->lbfgs.g); - gguf_add_tensor(fctx, opt->lbfgs.gp); - gguf_add_tensor(fctx, opt->lbfgs.d); - if (opt->lbfgs.pf) { - gguf_add_tensor(fctx, opt->lbfgs.pf); - } - gguf_add_tensor(fctx, opt->lbfgs.lmal); - gguf_add_tensor(fctx, opt->lbfgs.lmys); - gguf_add_tensor(fctx, opt->lbfgs.lms); - gguf_add_tensor(fctx, opt->lbfgs.lmy); - } break; - } -} static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read @@ -1737,33 +924,33 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context init_lora(model, lora); - 
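// The copy_tensor_by_name calls below are the common/train.h replacement for
// the file-local read_tensor_by_name removed above. Assuming the moved helper
// keeps the removed body shown earlier in this patch, it behaves as:
//
//     void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) {
//         if (dst == NULL) {
//             return;
//         }
//         struct ggml_tensor * t = ggml_get_tensor(ctx, name); // look up the source tensor by name
//         GGML_ASSERT(are_same_layout(dst, t));                // same type, shape, contiguity
//         memcpy(dst->data, t->data, ggml_nbytes(t));          // raw copy of the tensor data
//         if (strlen(ggml_get_name(dst)) == 0) {
//             ggml_set_name(dst, name);                        // adopt the file's tensor name
//         }
//     }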
read_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a)); - read_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b)); - read_tensor_by_name(lora->norm_a, f_ggml_ctx, ggml_get_name(lora->norm_a)); - read_tensor_by_name(lora->norm_b, f_ggml_ctx, ggml_get_name(lora->norm_b)); - read_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a)); - read_tensor_by_name(lora->output_b, f_ggml_ctx, ggml_get_name(lora->output_b)); + copy_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a)); + copy_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b)); + copy_tensor_by_name(lora->norm_a, f_ggml_ctx, ggml_get_name(lora->norm_a)); + copy_tensor_by_name(lora->norm_b, f_ggml_ctx, ggml_get_name(lora->norm_b)); + copy_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a)); + copy_tensor_by_name(lora->output_b, f_ggml_ctx, ggml_get_name(lora->output_b)); for (uint32_t i = 0; i < lora->layers.size(); ++i) { auto & layer = lora->layers[i]; - read_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a)); - read_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b)); - read_tensor_by_name(layer.wq_a, f_ggml_ctx, ggml_get_name(layer.wq_a)); - read_tensor_by_name(layer.wq_b, f_ggml_ctx, ggml_get_name(layer.wq_b)); - read_tensor_by_name(layer.wk_a, f_ggml_ctx, ggml_get_name(layer.wk_a)); - read_tensor_by_name(layer.wk_b, f_ggml_ctx, ggml_get_name(layer.wk_b)); - read_tensor_by_name(layer.wv_a, f_ggml_ctx, ggml_get_name(layer.wv_a)); - read_tensor_by_name(layer.wv_b, f_ggml_ctx, ggml_get_name(layer.wv_b)); - read_tensor_by_name(layer.wo_a, f_ggml_ctx, ggml_get_name(layer.wo_a)); - read_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); - read_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); - read_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); - read_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a)); - read_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b)); - read_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a)); - read_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b)); - read_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a)); - read_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b)); + copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a)); + copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b)); + copy_tensor_by_name(layer.wq_a, f_ggml_ctx, ggml_get_name(layer.wq_a)); + copy_tensor_by_name(layer.wq_b, f_ggml_ctx, ggml_get_name(layer.wq_b)); + copy_tensor_by_name(layer.wk_a, f_ggml_ctx, ggml_get_name(layer.wk_a)); + copy_tensor_by_name(layer.wk_b, f_ggml_ctx, ggml_get_name(layer.wk_b)); + copy_tensor_by_name(layer.wv_a, f_ggml_ctx, ggml_get_name(layer.wv_a)); + copy_tensor_by_name(layer.wv_b, f_ggml_ctx, ggml_get_name(layer.wv_b)); + copy_tensor_by_name(layer.wo_a, f_ggml_ctx, ggml_get_name(layer.wo_a)); + copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); + copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); + copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); + copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a)); + 
copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b)); + copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a)); + copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b)); + copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a)); + copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b)); } } @@ -1915,6 +1102,89 @@ static void save_checkpoint_lora_file(const char * filename, struct my_llama_mod gguf_free(fctx); } +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + die_fmt("read error: %s", strerror(errno)); + } + if (ret != 1) { + die_fmt("unexpectedly reached end of file"); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + die_fmt("write error: %s", strerror(errno)); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) { if (tensor == NULL) { file->write_u32(0); @@ -2002,25 +1272,6 @@ static void save_as_llama_lora(struct my_llama_lora * lora, const char * filenam } } -static float cosine_decay(const int decay_steps, const float minimum, int step) { - if (step > decay_steps) { - step = decay_steps; - } - const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); - const float decay = (1 - minimum)*cosine_decay + minimum; - return decay; -} - -static float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) { - if (enable_restart) { - while (step > decay_steps) { - step -= decay_steps; - decay_steps = (int) restart_step_mult * decay_steps; - } - } - return cosine_decay(decay_steps, minimum, step); -} - struct train_params { const char * fn_model_base; const char * fn_train_data; @@ -2665,29 +1916,6 @@ struct opt_callback_data { double millis_per_iter; }; -static void print_duration(double fmillis) { - if (fmillis < 1000.0f) { - printf("%.1fms", (float) fmillis); - return; - } - const int64_t one_sec = 1000; - const int64_t one_min = one_sec * 60; - const int64_t one_hour = one_min * 60; - const int64_t one_day = one_hour * 24; - - int64_t millis = (int64_t) fmillis; - int64_t days = millis/one_day; - int64_t hours = (millis - days*one_day)/one_hour; - 
int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min; - int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec; - - // to print int64_t either cast to (long long int) or use macro PRId64 from - if (days > 0) { - printf("%lldd ", (long long int) days); - } - printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds); -} - static void opt_callback(void * vdata, int accum_step, float * sched) { struct opt_callback_data * data = (struct opt_callback_data *) vdata; struct train_params * params = data->params; @@ -2738,16 +1966,15 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { // exclude file saving from time measurement, by measuring last_time after saving data->last_time = ggml_time_ms(); - *sched = (opt->iter < params->warmup) - ? (float) opt->iter / (float) params->warmup - : cosine_decay_restart( - params->cos_decay_steps, - params->cos_decay_min, - opt->iter - params->warmup, - params->cos_decay_restart, - params->enable_restart); - float min_sched = params->adam_min_alpha / params->adam_alpha; - *sched = min_sched + *sched * (1.0f - min_sched); + *sched = learning_schedule( + opt->iter, + params->warmup, + params->cos_decay_steps, + params->adam_alpha, + params->adam_min_alpha, + params->cos_decay_min, + params->cos_decay_restart, + params->enable_restart); int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); if (impr_plot > 0) impr_plot = 0; @@ -2775,16 +2002,16 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { printf("\n"); } - int used_samples = get_example_targets_batch( + int64_t used_samples = get_example_targets_batch( data->lctx, + data->tokens_input, + data->target_probs, + data->lora->shuffle_next_sample, data->shuffled_samples_begin, data->shuffled_samples_size, data->samples_count, data->tokens_data, data->tokens_size, - data->lora->shuffle_next_sample, - data->tokens_input, - data->target_probs, params->separate_with_eos, params->separate_with_bos, params->fill_with_next_samples); @@ -2798,10 +2025,10 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { data->lora->shuffle_rng_state_current = data->lora->shuffle_rng_state_next; data->lora->shuffle_rng_state_next = shuffle_samples( data->lora->shuffle_rng_state_current, - data->samples_begin, - data->samples_size, data->shuffled_samples_begin, data->shuffled_samples_size, + data->samples_begin, + data->samples_size, data->samples_count); data->lora->shuffle_next_sample = 0; } @@ -2840,22 +2067,6 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) { return nx; } -static size_t hash_combine(size_t h1, size_t h2) { - return h1 ^ (h2 << 1); -} - -static size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) { - std::hash h_string; - std::hash h_ull; - size_t h = h_string(std::string(fn)); - h = hash_combine(h, h_ull((unsigned long long) sample_count)); - for (size_t i=0; i< sample_count; ++i) { - h = hash_combine(h, h_ull((unsigned long long) samples_begin[i])); - h = hash_combine(h, h_ull((unsigned long long) samples_size[i])); - } - return h; -} - int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); @@ -3176,10 +2387,10 @@ int main(int argc, char ** argv) { train_shuffled_samples_size.resize(train_samples_size.size()); lora.shuffle_rng_state_next = shuffle_samples( lora.shuffle_rng_state_current, - 
train_samples_begin.data(), - train_samples_size.data(), train_shuffled_samples_begin.data(), train_shuffled_samples_size.data(), + train_samples_begin.data(), + train_samples_size.data(), train_samples_size.size()); printf("%s: begin training\n", __func__); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 39a85967a..63edcf9ef 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1,6 +1,7 @@ #include "ggml.h" #include "ggml-alloc.h" #include "common.h" +#include "train.h" #include "llama.h" #include #include @@ -18,143 +19,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -struct random_normal_distribution { - std::mt19937 gen; - std::normal_distribution rd; - float min; - float max; -}; - -struct random_uniform_distribution { - std::mt19937 gen; - std::uniform_real_distribution rd; -}; - -static void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { - rnd->gen = std::mt19937(seed); - rnd->rd = std::normal_distribution{mean, std}; - rnd->min = min; - rnd->max = max; -} - -static void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { - rnd->gen = std::mt19937(seed); - rnd->rd = std::uniform_real_distribution{min, max}; -} - -static int clamp(const int v, const int min, const int max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -static float fclamp(const float v, const float min, const float max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -static float frand() { - return (float)rand()/(float)RAND_MAX; -} - -static float frand_normal(struct random_normal_distribution * rnd) { - return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); -} - -static float frand_uniform(struct random_uniform_distribution * rnd) { - return rnd->rd(rnd->gen); -} - -static struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { - float scale = 1.0f; // xavier - switch (tensor->n_dims) { - case 1: - scale /= sqrtf(tensor->ne[0]); - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = scale * frand_normal(rnd); - } - break; - case 2: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = scale * frand_normal(rnd); - } - } - break; - case 3: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = scale * frand_normal(rnd); - } - } - } - break; - case 4: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = scale * frand_normal(rnd); - } - } - } - } - break; - default: - 
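// (On the scale used in the cases above: this is Xavier/Glorot-style
// initialization, as the "// xavier" comment notes; e.g. for a 2d weight the
// draw is effectively
//     *dst = frand_normal(rnd) / sqrtf((float) (tensor->ne[0] + tensor->ne[1]));
// so the variance shrinks with fan-in plus fan-out.)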
assert(false); - }; - return tensor; -} - -static struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { - switch (tensor->n_dims) { - case 1: - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = frand_uniform(rnd); - } - break; - case 2: - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = frand_uniform(rnd); - } - } - break; - case 3: - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = frand_uniform(rnd); - } - } - } - break; - case 4: - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = frand_uniform(rnd); - } - } - } - } - break; - default: - assert(false); - }; - return tensor; -} - struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; @@ -215,40 +79,6 @@ struct my_llama_model { }; // gguf constants -static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; -static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; -static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; -static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; -static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; -static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; -static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; -static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; -static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; -static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; -static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; -static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; -static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; -static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; - -static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; -static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; -static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; - -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = 
"optimizer.lbfgs.previous_parameters"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; - static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; static const char * LLM_KV_TRAINING_TYPE = "training.type"; @@ -411,54 +241,29 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean, const uint32_t n_layer = hparams.n_layer; - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); - randomize_tensor_normal(model->tok_embeddings, &rnd); - randomize_tensor_normal(model->norm, &rnd); - randomize_tensor_normal(model->output, &rnd); + randomize_tensor_normal(model->tok_embeddings, rnd); + randomize_tensor_normal(model->norm, rnd); + randomize_tensor_normal(model->output, rnd); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, &rnd); + randomize_tensor_normal(layer.attention_norm, rnd); - randomize_tensor_normal(layer.wq, &rnd); - randomize_tensor_normal(layer.wk, &rnd); - randomize_tensor_normal(layer.wv, &rnd); - randomize_tensor_normal(layer.wo, &rnd); + randomize_tensor_normal(layer.wq, rnd); + randomize_tensor_normal(layer.wk, rnd); + randomize_tensor_normal(layer.wv, rnd); + randomize_tensor_normal(layer.wo, rnd); - randomize_tensor_normal(layer.ffn_norm, &rnd); + randomize_tensor_normal(layer.ffn_norm, rnd); - randomize_tensor_normal(layer.w1, &rnd); - randomize_tensor_normal(layer.w2, &rnd); - randomize_tensor_normal(layer.w3, &rnd); + randomize_tensor_normal(layer.w1, rnd); + randomize_tensor_normal(layer.w2, rnd); + randomize_tensor_normal(layer.w3, rnd); } -} -static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { - GGML_ASSERT(tensor->n_dims == 1); - GGML_ASSERT(tensor->ne[0] == ne0); -} - -static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { - GGML_ASSERT(tensor->n_dims == 2); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); -} - -static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(tensor->n_dims == 3); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); -} - -static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(tensor->n_dims == 4); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); - GGML_ASSERT(tensor->ne[3] == ne3); + free_random_normal_distribution(rnd); } static struct ggml_tensor * llama_build_train_graphs( @@ 
-637,461 +442,6 @@ static struct ggml_tensor * llama_build_train_graphs( return t36; } -static int get_example_targets_batch( - struct llama_context * lctx, - const size_t * samples_begin, - const size_t * samples_size, - size_t samples_count, - const llama_token * train_data, - size_t n_train_data, - int example_id, - struct ggml_tensor * tokens_input, - struct ggml_tensor * target_probs, - bool separate_with_eos, - bool separate_with_bos, - bool fill_with_next_samples) { - - GGML_ASSERT(tokens_input->n_dims == 2); - GGML_ASSERT(target_probs->n_dims == 3); - int n_vocab = target_probs->ne[0]; - int n_tokens = tokens_input->ne[0]; - int n_batch = tokens_input->ne[1]; - GGML_ASSERT(n_vocab == target_probs->ne[0]); - GGML_ASSERT(n_tokens == target_probs->ne[1]); - GGML_ASSERT(n_batch == target_probs->ne[2]); - - int used_samples = 0; - - ggml_set_f32(target_probs, 0.0f); - int bos = llama_token_bos(lctx); - int eos = llama_token_eos(lctx); - // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); - for (int k=0; k= sample_size && fill_with_next_samples) { - if (!sample_separation_eos) { - // insert eos token to separate samples - sample_separation_eos = true; - } else if (!sample_separation_bos) { - // insert bos token to separate samples - sample_separation_bos = true; - token = bos; - } else { - // sample separation is done, continue with next sample - sample_separation_eos = !separate_with_eos; - sample_separation_bos = !separate_with_bos; - sample_offs = 0; - sample_idx = (example_id + used_samples) % samples_count; - sample_begin = samples_begin[sample_idx]; - sample_size = samples_size[sample_idx]; - ++used_samples; - } - } - // note: no else-if here - if (sample_offs < sample_size) { - token = clamp(train_data[sample_begin+sample_offs], 0, n_vocab-1); - ++sample_offs; - } - ggml_set_f32_nd(target_probs, token, i, k, 0, +1.0f); - if (i+1= 0 && size < INT_MAX); - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; - } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); - } - } - - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -static size_t utf8_len(char src) 
{ - const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - uint8_t highbits = static_cast(src) >> 4; - return lookup[highbits]; -} - -// mark each byte with its utf8 unit number. -// returns the number of utf8 characters. -// e.g. when bytes == '\x61\xD0\xB0\x62', -// then utf8_units will become [0,0,1,0] -// utf8_nunits will become [1,2,2,1] and 3 is returned. -// bytes where utf8_units is zero, are the begin of an utf8 character. -static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) { - size_t offs = 0; - size_t count_utf8 = 0; - while(offs < count) { - size_t len = utf8_len(bytes[offs]); - for (size_t i=0; i & out_tokens, - std::vector & out_samples_begin, - std::vector & out_samples_size) { - struct llama_file f(filename, "rb"); - - if (f.size == 0) { - out_tokens.clear(); - out_samples_begin.clear(); - out_samples_size.clear(); - printf("%s: warning: empty or not existing training data file '%s'\n", - __func__, filename); - return out_tokens.size(); - } - - // account for possible leading whitespace that will be added by tokenizer - // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12] - const int n_max_tokens_overhead = 1; - - std::vector buf; - buf.resize(f.size+1); - - f.read_raw(buf.data(), f.size); - buf[f.size] = '\0'; - - std::vector utf8_units; - std::vector utf8_nunits; - utf8_units.resize(buf.size()); - utf8_nunits.resize(buf.size()); - size_t n_utf8_chars = mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size()); - - if (sample_start.size() == 0) { - // tokenize all data at once - out_tokens.resize(buf.size() + n_max_tokens_overhead); - - int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false); - if (n_tokens < 0) { - out_tokens.resize(-n_tokens); - n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false); - } - if (n_tokens >= 0) { - out_tokens.resize(n_tokens); - } - - // generate sample starts at all token positions - out_samples_begin.clear(); - out_samples_begin.push_back(0); - out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size())); - size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0; - for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) { - out_samples_begin.push_back(sample_begin); - out_samples_size.push_back(context_length); - } - } else { - // split data into samples and tokenize each sample - std::string data_str(buf.data(), buf.size()-1); - out_samples_begin.clear(); - out_samples_size.clear(); - out_tokens.clear(); - - // find all positions of pattern sample_start - size_t sample_begin = data_str.find(sample_start, 0); - while (sample_begin != std::string::npos) { - out_samples_begin.push_back(sample_begin); - const size_t search_start = sample_begin + sample_start.size(); - sample_begin = data_str.find(sample_start, search_start); - } - if (out_samples_begin.size() == 0) { - printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n", - __func__, sample_start.c_str()); - out_samples_begin.push_back(0); - } - - out_samples_size.resize(out_samples_begin.size(), 0); - - std::vector buf_sample; - std::vector tok_sample; - - const size_t sample_begin_offset = (include_sample_start ? 
0 : sample_start.size()); - size_t found_too_big_sample = 0; - size_t found_too_small_sample = 0; - size_t found_empty_sample = 0; - size_t found_min_sample_size = SIZE_MAX; - size_t found_max_sample_size = 0; - - size_t max_token_text_size = 0; - int n_vocab = llama_n_vocab(lctx); - for (llama_token token=0; token < n_vocab; ++token) { - max_token_text_size = std::max( - max_token_text_size, - strlen(llama_token_get_text(lctx, token))); - } - - // upper bound of context byte length. - // strings with this byte length should always tokenize to at least context_length tokens. - size_t context_byte_len = max_token_text_size*context_length; - - for (unsigned i=0; i 0) { - // sample end is in the middle of an utf8 character. - // advance sample_end to the begin of the next utf8 character. - sample_end += utf8_nunits[sample_end] - utf8_units[sample_end]; - } - size_t sample_size = sample_end - sample_begin; - if (sample_size == 0) { - ++found_empty_sample; - } - - if (sample_size > 0) { - // llama_tokenize expects zero terminated string, - // copy sample into buffer and zero terminate it. - buf_sample.resize(sample_size+1); - memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size); - buf_sample[sample_size] = '\0'; - - // printf("sample: '%s'\n", buf_sample.data()); - - // tokenize the sample - tok_sample.resize(buf_sample.size() + n_max_tokens_overhead); - int n_tokens = llama_tokenize(lctx, - buf_sample.data(), - tok_sample.data(), - tok_sample.size(), false); - if (n_tokens < 0) { - tok_sample.resize(-n_tokens); - n_tokens = llama_tokenize(lctx, - buf_sample.data(), - tok_sample.data(), - tok_sample.size(), false); - GGML_ASSERT(n_tokens >= 0); - } - GGML_ASSERT(n_tokens <= tok_sample.size()); - - if ((size_t) n_tokens > context_length) { - ++found_too_big_sample; - } else if ((size_t) n_tokens < context_length) { - ++found_too_small_sample; - } - found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens); - found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens); - - // write out tokens, start and size of sample - // overwrite the string start position with the token start position - out_samples_begin[i] = out_tokens.size(); - out_samples_size[i] = (size_t) n_tokens; - out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens); - } else { - out_samples_begin[i] = out_tokens.size(); - out_samples_size[i] = 0; - } - - } - if (found_too_big_sample > 0) { - printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. 
samples will be cut off.\n", - __func__, found_too_big_sample, found_max_sample_size, context_length); - } - - if (found_too_small_sample > 0) { - printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n", - __func__, found_too_small_sample, found_min_sample_size, context_length); - } - - if (found_empty_sample) { - printf("%s: warning: found %zu empty samples.\n", - __func__, found_empty_sample); - } - } - printf("%s: total number of samples: %zu\n", - __func__, out_samples_begin.size()); - - GGML_ASSERT(out_samples_begin.size() == out_samples_size.size()); - - return out_tokens.size(); -} - -static void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) { - std::stringstream s_rng_state; - s_rng_state.imbue(std::locale::classic()); - s_rng_state.exceptions(std::stringstream::failbit); - s_rng_state.str(rng_state); - s_rng_state >> rng; -} - -static std::string mt19937_get_state(const std::mt19937& rng) { - std::stringstream s_rng_state; - s_rng_state.imbue(std::locale::classic()); - s_rng_state << rng; - return s_rng_state.str(); -} - -static std::string mt19937_seed_to_state(unsigned seed) { - std::mt19937 rng(seed); - return mt19937_get_state(rng); -} - -static std::string shuffle_samples( - const std::string & rng_state, - const size_t * begins, - const size_t * sizes, - size_t * shuffled_begins, - size_t * shuffled_sizes, - size_t count) { - if (count == 0) return rng_state; - - std::mt19937 rng; - mt19937_set_state(rng, rng_state); - - // sort indices by random value for each index - std::vector idcs; - { - std::vector rnd; - idcs.resize(count); - rnd.resize(count); - for (unsigned i=0; itype == b->type); - GGML_ASSERT(ggml_are_same_shape(a, b)); - GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); - - return true; -} - -static void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { - if (dst == NULL) { - return; - } - struct ggml_tensor * t = ggml_get_tensor(ctx, name); - GGML_ASSERT(are_same_layout(dst, t)); - memcpy(dst->data, t->data, ggml_nbytes(t)); - - if (strlen(ggml_get_name(dst)) == 0) { - ggml_set_name(dst, name); - } -} - -static void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { - // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read - - uint32_t file_version; - GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); - GGML_ASSERT(file_version == 0); - - GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); - GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); - GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); - - uint64_t nx; - GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); - opt->nx = (size_t) nx; - - // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know - - std::string opt_type; - GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); - if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { - opt->params.type = GGML_OPT_ADAM; - - GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); - GGUF_GET_KEY(fctx, 
opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); - GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); - - GGML_ASSERT(opt->ctx != NULL); - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { - opt->params.type = GGML_OPT_LBFGS; - - GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); - GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); - GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); - GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); - GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); - GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); - GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); - - GGML_ASSERT(opt->ctx != NULL); - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - read_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - read_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - read_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - read_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - read_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - read_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - read_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - read_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - } else { - die("unknown optimizer type"); - } -} - -static void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); - gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); - gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); - - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); - - ggml_set_name(opt->adam.m, 
-                ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
-                if (opt->adam.pf) {
-                    ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
-                }
-
-                gguf_add_tensor(fctx, opt->adam.m);
-                gguf_add_tensor(fctx, opt->adam.v);
-                if (opt->adam.pf) {
-                    gguf_add_tensor(fctx, opt->adam.pf);
-                }
-            } break;
-        case GGML_OPT_LBFGS:
-            {
-                gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
-                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);
-                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best);
-                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step);
-                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j);
-                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k);
-                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end);
-                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement);
-
-                ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS);
-                ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS);
-                ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS);
-                ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS);
-                ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION);
-                if (opt->lbfgs.pf) {
-                    ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
-                }
-                ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
-                ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
-                ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
-                ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
-
-                gguf_add_tensor(fctx, opt->lbfgs.x);
-                gguf_add_tensor(fctx, opt->lbfgs.xp);
-                gguf_add_tensor(fctx, opt->lbfgs.g);
-                gguf_add_tensor(fctx, opt->lbfgs.gp);
-                gguf_add_tensor(fctx, opt->lbfgs.d);
-                if (opt->lbfgs.pf) {
-                    gguf_add_tensor(fctx, opt->lbfgs.pf);
-                }
-                gguf_add_tensor(fctx, opt->lbfgs.lmal);
-                gguf_add_tensor(fctx, opt->lbfgs.lmys);
-                gguf_add_tensor(fctx, opt->lbfgs.lms);
-                gguf_add_tensor(fctx, opt->lbfgs.lmy);
-            } break;
-    }
-}
-
 static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) {
     // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
     std::string arch;
@@ -1311,22 +509,22 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
     init_model(model);
 
-    read_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD));
-    read_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM));
-    read_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT));
+    copy_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD));
+    copy_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM));
+    copy_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT));
 
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
         auto & layer = model->layers[i];
 
-        read_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i));
-        read_tensor_by_name(layer.wq, f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i));
-        read_tensor_by_name(layer.wk, f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i));
-        read_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
-        read_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
-        read_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
-        read_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
-        read_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
-        read_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
+        copy_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i));
+        copy_tensor_by_name(layer.wq, f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i));
+        copy_tensor_by_name(layer.wk, f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i));
+        copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
+        copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
+        copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
+        copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
+        copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
+        copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
     }
 }
 
@@ -1571,25 +769,6 @@ static void save_checkpoint_file(const char * filename, const char * fn_vocab_mo
     gguf_free(fctx);
 }
 
-static float cosine_decay(const int decay_steps, const float minimum, int step) {
-    if (step > decay_steps) {
-        step = decay_steps;
-    }
-    const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps));
-    const float decay = (1 - minimum)*cosine_decay + minimum;
-    return decay;
-}
-
-static float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) {
-    if (enable_restart) {
-        while (step > decay_steps) {
-            step -= decay_steps;
-            decay_steps = (int) restart_step_mult * decay_steps;
-        }
-    }
-    return cosine_decay(decay_steps, minimum, step);
-}
-
 struct train_params {
     const char * fn_vocab_model;
     const char * fn_train_data;
@@ -2136,29 +1315,6 @@ struct opt_callback_data {
     double millis_per_iter;
 };
 
-static void print_duration(double fmillis) {
-    if (fmillis < 1000.0f) {
-        printf("%.1fms", (float) fmillis);
-        return;
-    }
-    const int64_t one_sec = 1000;
-    const int64_t one_min = one_sec * 60;
-    const int64_t one_hour = one_min * 60;
-    const int64_t one_day = one_hour * 24;
-
-    int64_t millis = (int64_t) fmillis;
-    int64_t days = millis/one_day;
-    int64_t hours = (millis - days*one_day)/one_hour;
-    int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min;
-    int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec;
-
-    // to print int64_t either cast to (long long int) or use macro PRId64 from <inttypes.h>
-    if (days > 0) {
-        printf("%lldd ", (long long int) days);
-    }
-    printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds);
-}
-
 static void opt_callback(void * vdata, int accum_step, float * sched) {
     struct opt_callback_data * data = (struct opt_callback_data *) vdata;
     struct train_params * params = data->params;
@@ -2210,16 +1366,15 @@ static void opt_callback(void * vdata, int accum_step, float * sched) {
     // exclude file saving from time measurement, by measuring last_time after saving
     data->last_time = ggml_time_ms();
 
-    *sched = (opt->iter < params->warmup)
-        ? (float) opt->iter / (float) params->warmup
-        : cosine_decay_restart(
-            params->cos_decay_steps,
-            params->cos_decay_min,
-            opt->iter - params->warmup,
-            params->cos_decay_restart,
-            params->enable_restart);
-    float min_sched = params->adam_min_alpha / params->adam_alpha;
-    *sched = min_sched + *sched * (1.0f - min_sched);
+    *sched = learning_schedule(
+        opt->iter,
+        params->warmup,
+        params->cos_decay_steps,
+        params->adam_alpha,
+        params->adam_min_alpha,
+        params->cos_decay_min,
+        params->cos_decay_restart,
+        params->enable_restart);
 
     int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
     if (impr_plot > 0) impr_plot = 0;
@@ -2247,16 +1402,16 @@ static void opt_callback(void * vdata, int accum_step, float * sched) {
         printf("\n");
     }
 
-    int used_samples = get_example_targets_batch(
+    int64_t used_samples = get_example_targets_batch(
         data->lctx,
+        data->tokens_input,
+        data->target_probs,
+        data->model->shuffle_next_sample,
         data->shuffled_samples_begin,
         data->shuffled_samples_size,
        data->samples_count,
         data->tokens_data,
         data->tokens_size,
-        data->model->shuffle_next_sample,
-        data->tokens_input,
-        data->target_probs,
         params->separate_with_eos,
         params->separate_with_bos,
         params->fill_with_next_samples);
@@ -2270,32 +1425,16 @@
         data->model->shuffle_rng_state_current = data->model->shuffle_rng_state_next;
         data->model->shuffle_rng_state_next = shuffle_samples(
             data->model->shuffle_rng_state_current,
-            data->samples_begin,
-            data->samples_size,
             data->shuffled_samples_begin,
             data->shuffled_samples_size,
+            data->samples_begin,
+            data->samples_size,
             data->samples_count);
         data->model->shuffle_next_sample = 0;
     }
 }
 
-static size_t hash_combine(size_t h1, size_t h2) {
-    return h1 ^ (h2 << 1);
-}
-
-static size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) {
-    std::hash<std::string> h_string;
-    std::hash<unsigned long long> h_ull;
-    size_t h = h_string(std::string(fn));
-    h = hash_combine(h, h_ull((unsigned long long) sample_count));
-    for (size_t i=0; i< sample_count; ++i) {
-        h = hash_combine(h, h_ull((unsigned long long) samples_begin[i]));
-        h = hash_combine(h, h_ull((unsigned long long) samples_size[i]));
-    }
-    return h;
-}
-
 int main(int argc, char ** argv) {
     struct train_params params = get_default_train_params();
@@ -2435,10 +1574,10 @@ int main(int argc, char ** argv) {
     train_shuffled_samples_size.resize(train_samples_size.size());
     model.shuffle_rng_state_next = shuffle_samples(
         model.shuffle_rng_state_current,
-        train_samples_begin.data(),
-        train_samples_size.data(),
         train_shuffled_samples_begin.data(),
         train_shuffled_samples_size.data(),
+        train_samples_begin.data(),
+        train_samples_size.data(),
         train_samples_size.size());
 
     printf("%s: begin training\n", __func__);
@@ -2516,17 +1655,15 @@ int main(int argc, char ** argv) {
 
         size_t used_mem_before_opt = ggml_used_mem(ctx0);
 
-        opt->params.adam.sched = (opt->iter < params.warmup)
-            ? (float) opt->iter / (float) params.warmup
-            : cosine_decay_restart(
-                params.cos_decay_steps,
-                params.cos_decay_min,
-                opt->iter - params.warmup,
-                params.cos_decay_restart,
-                params.enable_restart);
-
-        float min_sched = params.adam_min_alpha / params.adam_alpha;
-        opt->params.adam.sched = min_sched + opt->params.adam.sched * (1.0f - min_sched);
+        opt->params.adam.sched = learning_schedule(
+            opt->iter,
+            params.warmup,
+            params.cos_decay_steps,
+            params.adam_alpha,
+            params.adam_min_alpha,
+            params.cos_decay_min,
+            params.cos_decay_restart,
+            params.enable_restart);
 
         printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched);