finetune : zero the loraB initial vectors
Without this, the first iteration starts out far from the base model instead of exactly on it. Zeroing loraB is what the LoRA paper recommends. loralib also zeroes at least one vector in each init pair (though in some cases it departs from the paper by using a different distribution for the other vector).
This commit is contained in:
parent
34b0a08207
commit
91eb33585b
3 changed files with 58 additions and 12 deletions
|
@ -68,6 +68,51 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)
|
|||
free(rnd);
|
||||
}
|
||||
|
||||
struct ggml_tensor * zero_tensor(struct ggml_tensor * tensor) {
|
||||
float scale = 1.0f; // xavier
|
||||
switch (tensor->n_dims) {
|
||||
case 1:
|
||||
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
||||
float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
|
||||
*dst = 0.0f;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
||||
float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
||||
*dst = 0.0f;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
|
||||
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
||||
float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
|
||||
*dst = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
|
||||
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
||||
float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
|
||||
*dst = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
die("Unsupported tensor->n_dims");
|
||||
};
|
||||
return tensor;
|
||||
}
|
||||
|
||||
struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
|
||||
float scale = 1.0f; // xavier
|
||||
switch (tensor->n_dims) {
|
||||
|
|
|
@ -127,6 +127,7 @@ struct random_uniform_distribution * init_random_uniform_distribution(int seed,
|
|||
void free_random_normal_distribution (struct random_normal_distribution * rnd);
|
||||
void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
|
||||
|
||||
// Writes 0.0f to every element of a 1- to 4-dimensional tensor (elements are
// accessed as float) and returns the same tensor pointer; any other n_dims
// value is reported through die().
struct ggml_tensor * zero_tensor (struct ggml_tensor * tensor);
|
||||
struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
|
||||
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
|
||||
|
||||
|
|
|
@ -548,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
|
|||
struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
|
||||
|
||||
randomize_tensor_normal(lora->tok_embeddings_a, rnd);
|
||||
randomize_tensor_normal(lora->tok_embeddings_b, rnd);
|
||||
zero_tensor(lora->tok_embeddings_b);
|
||||
randomize_tensor_normal(lora->norm_a, rnd);
|
||||
randomize_tensor_normal(lora->norm_b, rnd);
|
||||
zero_tensor(lora->norm_b);
|
||||
randomize_tensor_normal(lora->output_a, rnd);
|
||||
randomize_tensor_normal(lora->output_b, rnd);
|
||||
zero_tensor(lora->output_b);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
auto & layer = lora->layers[i];
|
||||
randomize_tensor_normal(layer.attention_norm_a, rnd);
|
||||
randomize_tensor_normal(layer.attention_norm_b, rnd);
|
||||
zero_tensor(layer.attention_norm_b);
|
||||
|
||||
randomize_tensor_normal(layer.wq_a, rnd);
|
||||
randomize_tensor_normal(layer.wq_b, rnd);
|
||||
zero_tensor(layer.wq_b);
|
||||
randomize_tensor_normal(layer.wk_a, rnd);
|
||||
randomize_tensor_normal(layer.wk_b, rnd);
|
||||
zero_tensor(layer.wk_b);
|
||||
randomize_tensor_normal(layer.wv_a, rnd);
|
||||
randomize_tensor_normal(layer.wv_b, rnd);
|
||||
zero_tensor(layer.wv_b);
|
||||
randomize_tensor_normal(layer.wo_a, rnd);
|
||||
randomize_tensor_normal(layer.wo_b, rnd);
|
||||
zero_tensor(layer.wo_b);
|
||||
|
||||
randomize_tensor_normal(layer.ffn_norm_a, rnd);
|
||||
randomize_tensor_normal(layer.ffn_norm_b, rnd);
|
||||
zero_tensor(layer.ffn_norm_b);
|
||||
|
||||
randomize_tensor_normal(layer.w1_a, rnd);
|
||||
randomize_tensor_normal(layer.w1_b, rnd);
|
||||
zero_tensor(layer.w1_b);
|
||||
randomize_tensor_normal(layer.w2_a, rnd);
|
||||
randomize_tensor_normal(layer.w2_b, rnd);
|
||||
zero_tensor(layer.w2_b);
|
||||
randomize_tensor_normal(layer.w3_a, rnd);
|
||||
randomize_tensor_normal(layer.w3_b, rnd);
|
||||
zero_tensor(layer.w3_b);
|
||||
}
|
||||
|
||||
free_random_normal_distribution(rnd);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue