From 91eb33585b3c2ff33b33cc129a0365befca1fcab Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Sun, 12 Nov 2023 18:23:06 -0800 Subject: [PATCH] finetune : zero the loraB initial vectors Without this, the first iteration is starting out far from the base model, instead of exactly on it. Zeroing loraB is what the paper recommends. loralib also zeroes at least one of the init vector pairs (though it departs from the paper in using a different distribution for the other vector, in some cases). --- common/train.cpp | 45 ++++++++++++++++++++++++++++++++++ common/train.h | 1 + examples/finetune/finetune.cpp | 24 +++++++++--------- 3 files changed, 58 insertions(+), 12 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index bc15b7a03..ceb9b57b0 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -68,6 +68,51 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd) free(rnd); } +struct ggml_tensor * zero_tensor(struct ggml_tensor * tensor) { + float scale = 1.0f; // xavier + switch (tensor->n_dims) { + case 1: + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = 0.0f; + } + break; + case 2: + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = 0.0f; + } + } + break; + case 3: + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = 0.0f; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = 0.0f; + } + } + } + } + break; + default: + die("Unsupported tensor->n_dims"); + }; + return tensor; +} + struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier switch (tensor->n_dims) { diff --git a/common/train.h b/common/train.h index d86c93cc4..e1758ddf2 100644 --- a/common/train.h +++ b/common/train.h @@ -127,6 +127,7 @@ struct random_uniform_distribution * init_random_uniform_distribution(int seed, void free_random_normal_distribution (struct random_normal_distribution * rnd); void free_random_uniform_distribution(struct random_uniform_distribution * rnd); +struct ggml_tensor * zero_tensor (struct ggml_tensor * tensor); struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd); struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd); diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index fa7dbe496..04a7a986b 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -548,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); randomize_tensor_normal(lora->tok_embeddings_a, rnd); - randomize_tensor_normal(lora->tok_embeddings_b, rnd); + zero_tensor(lora->tok_embeddings_b); randomize_tensor_normal(lora->norm_a, rnd); - randomize_tensor_normal(lora->norm_b, rnd); + zero_tensor(lora->norm_b); randomize_tensor_normal(lora->output_a, rnd); - randomize_tensor_normal(lora->output_b, rnd); + zero_tensor(lora->output_b); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = lora->layers[i]; randomize_tensor_normal(layer.attention_norm_a, rnd); - randomize_tensor_normal(layer.attention_norm_b, rnd); + zero_tensor(layer.attention_norm_b); randomize_tensor_normal(layer.wq_a, rnd); - randomize_tensor_normal(layer.wq_b, rnd); + zero_tensor(layer.wq_b); randomize_tensor_normal(layer.wk_a, rnd); - randomize_tensor_normal(layer.wk_b, rnd); + zero_tensor(layer.wk_b); randomize_tensor_normal(layer.wv_a, rnd); - randomize_tensor_normal(layer.wv_b, rnd); + zero_tensor(layer.wv_b); randomize_tensor_normal(layer.wo_a, rnd); - randomize_tensor_normal(layer.wo_b, rnd); + zero_tensor(layer.wo_b); randomize_tensor_normal(layer.ffn_norm_a, rnd); - randomize_tensor_normal(layer.ffn_norm_b, rnd); + zero_tensor(layer.ffn_norm_b); randomize_tensor_normal(layer.w1_a, rnd); - randomize_tensor_normal(layer.w1_b, rnd); + zero_tensor(layer.w1_b); randomize_tensor_normal(layer.w2_a, rnd); - randomize_tensor_normal(layer.w2_b, rnd); + zero_tensor(layer.w2_b); randomize_tensor_normal(layer.w3_a, rnd); - randomize_tensor_normal(layer.w3_b, rnd); + zero_tensor(layer.w3_b); } free_random_normal_distribution(rnd);