From aebccdbf00843277b6017178710d78b9973370fc Mon Sep 17 00:00:00 2001
From: Aniket
Date: Mon, 31 Jul 2023 09:33:57 -0400
Subject: [PATCH] fixing bug that didnt unroll the 1d karpathy arrays

---
 .../convert-llama2c-to-ggml.cpp               | 120 ++++++++++--------
 1 file changed, 70 insertions(+), 50 deletions(-)

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 28f74066e..a2a12de65 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -45,8 +45,8 @@ typedef struct {
     // final rmsnorm
     float* rms_final_weight; // (dim,)
     // freq_cis for RoPE relatively positional embeddings
-    float* freq_cis_real; // (seq_len, dim/2)
-    float* freq_cis_imag; // (seq_len, dim/2)
+    // float* freq_cis_real; // (seq_len, dim/2)
+    // float* freq_cis_imag; // (seq_len, dim/2)
     // (optional) classifier weights for the logits, on the last layer
     float* wcls;
 } TransformerWeights;
@@ -63,9 +63,9 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
     if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
     if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
     if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
-    int head_size = p->dim / p->n_heads;
-    if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1;
-    if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1;
+    //int head_size = p->dim / p->n_heads;
+    // if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1;
+    // if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1;
     return 0;
 }
 
@@ -96,7 +96,7 @@ void malloc_weights(TransformerWeights* w, Config* p) {
     printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
 
     w->w2 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->dim * p->hidden_dim, sizeof(float));
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
 
     w->w3 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float));
     printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
@@ -104,11 +104,11 @@ void malloc_weights(TransformerWeights* w, Config* p) {
     w->rms_final_weight = new float[p->dim](); //calloc(p->dim, sizeof(float));
     printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
 
-    w->freq_cis_real = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float));
-    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->freq_cis_real\n",__func__,p->seq_len, p->dim / 2, p->seq_len * p->dim / 2);
+    // w->freq_cis_real = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float));
+    // printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->freq_cis_real\n",__func__,p->seq_len, p->dim / 2, p->seq_len * p->dim / 2);
 
-    w->freq_cis_imag = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float));
-    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->freq_cis_imag\n\n",__func__,p->seq_len, p->dim / 2, p->seq_len * p->dim / 2);
+    // w->freq_cis_imag = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float));
+    // printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->freq_cis_imag\n\n",__func__,p->seq_len, p->dim / 2, p->seq_len * p->dim / 2);
 
     // ensure all mallocs went fine
     // if (!w->token_embedding_table || !w->rms_att_weight || !w->rms_ffn_weight
@@ -131,8 +131,8 @@ void free_weights(TransformerWeights* w) {
     free(w->w2);
     free(w->w3);
     free(w->rms_final_weight);
-    free(w->freq_cis_real);
-    free(w->freq_cis_imag);
+    // free(w->freq_cis_real);
+    // free(w->freq_cis_imag);
 }
 
 void print_sample_weights(TransformerWeights *w){
@@ -149,8 +149,8 @@ void print_sample_weights(TransformerWeights *w){
     printf("%f\n", w->w2[0]);
     printf("%f\n", w->w3[0]);
     printf("%f\n", w->rms_att_weight[0]);
-    printf("%f\n", w->freq_cis_real[0]);
-    printf("%f\n", w->freq_cis_imag[0]);
+    // printf("%f\n", w->freq_cis_real[0]);
+    // printf("%f\n", w->freq_cis_imag[0]);
     printf("------------------------------------------------------------------\n");
 
 
@@ -213,10 +213,10 @@ struct my_llama_model {
     struct ggml_tensor * tok_embeddings;
 
     struct ggml_tensor * norm;
-    // struct ggml_tensor * output;
+    struct ggml_tensor * output;
 
-    struct ggml_tensor * freq_cis_real;
-    struct ggml_tensor * freq_cis_imag;
+    // struct ggml_tensor * freq_cis_real;
+    // struct ggml_tensor * freq_cis_imag;
 
     std::vector<my_llama_layer> layers;
 
@@ -262,33 +262,33 @@ void init_model(struct my_llama_model * model) {
     model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
     printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
 
-    // model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    // printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+    model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
 
-    model->freq_cis_real = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_real\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2);
+    // model->freq_cis_real = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
+    // printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_real\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2);
 
-    model->freq_cis_imag = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_imag\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2);
+    // model->freq_cis_imag = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
+    // printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_imag\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2);
 
     // printing the per-layer allocations here so we dont print in the for loop.
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
 
     printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
 
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__,n_embd, n_ff, n_embd * n_ff, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__,n_ff, n_embd, n_ff * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__,n_embd, n_ff, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
 
     ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
     ggml_set_name(model->norm, "norm.weight");
-    // ggml_set_name(model->output, "output.weight");
-    ggml_set_name(model->freq_cis_real, "output.freq_cis_real");
-    ggml_set_name(model->freq_cis_imag, "output.freq_cis_imag");
+    ggml_set_name(model->output, "output.weight");
+    // ggml_set_name(model->freq_cis_real, "output.freq_cis_real");
+    // ggml_set_name(model->freq_cis_imag, "output.freq_cis_imag");
 
     model->layers.resize(n_layer);
     for (uint32_t i = 0; i < n_layer; ++i) {
@@ -305,10 +305,14 @@ void init_model(struct my_llama_model * model) {
 
         layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
-        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
-        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
-        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
+        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        // layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
+        // layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        // layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
+
 
         ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());
 
         ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
@@ -352,7 +356,7 @@ int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
 void print_row(struct ggml_tensor * probs, int i) {
     for (int k = 0; k < probs->ne[0]; ++k) {
         float p = get_f32_2d(probs, k, i);
-        printf(" %.2f", p);
+        printf(" %f", p);
     }
     printf("\n");
 }
@@ -656,34 +660,50 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     // w->token_embedding_table -> model->tok_embeddings
     // float* -> struct ggml_tensor
     stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
     // print_row(model->tok_embeddings, 0);
 
     stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-    stuff_karpathy_weights_into_gg(model->freq_cis_real, w->freq_cis_real);
-    stuff_karpathy_weights_into_gg(model->freq_cis_imag, w->freq_cis_imag);
+    print_row(model->norm, 0);
+    //stuff_karpathy_weights_into_gg(model->freq_cis_real, w->freq_cis_real);
+    //stuff_karpathy_weights_into_gg(model->freq_cis_imag, w->freq_cis_imag);
 
     // for rms-att-weight
     int row_length = model->hparams.n_embd;
+    const auto & hparams = model->hparams;
+    int n_ff = get_n_ff(&hparams);
+    //int n_ff = model->hparams.n_embd;
+    //const auto & hparams = model->hparams;
+    //int row_length = get_n_ff(&hparams);
+
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
         auto & layer = model->layers[i];
-        // 2d
+        // 1d
         stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
         stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length]);
+
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+
+        //stuff_karpathy_weights_into_gg(layer.w1          , &w->w1[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+
+        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+        //stuff_karpathy_weights_into_gg(layer.w2          , &w->w2[i*n_ff]);
+
+        //stuff_karpathy_weights_into_gg(layer.w3          , &w->w3[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*n_ff*row_length]);
     }
 
     // write tensors
     write_tensor(&file, model->tok_embeddings);
     write_tensor(&file, model->norm);
-    // write_tensor(&file, model->output); // ?
-    write_tensor(&file, model->freq_cis_real);
-    write_tensor(&file, model->freq_cis_imag);
+    write_tensor(&file, model->output); // ?
+    // write_tensor(&file, model->freq_cis_real);
+    // write_tensor(&file, model->freq_cis_imag);
 
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
         auto & layer = model->layers[i];
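
Note on the core fix (not part of the patch): llama2.c stores each weight class as one flat float array spanning all layers, so converting layer i has to advance the pointer by a whole matrix (rows x cols) per layer, not by a single row as the old code did. The following is a minimal standalone sketch of that offset arithmetic; the names dim, hidden_dim and n_layers follow the llama2.c Config convention, the numeric values are made-up examples, and the program is illustrative only.

    // Illustrative sketch: per-layer offsets into flat llama2.c weight arrays,
    // matching the indexing used after this patch (example values only).
    #include <cstdio>

    int main() {
        const int n_layers   = 2;  // hypothetical example values
        const int dim        = 4;  // "row_length" / n_embd in the converter
        const int hidden_dim = 6;  // n_ff in the converter

        for (int i = 0; i < n_layers; ++i) {
            // wq/wk/wv/wo: n_layers x dim x dim -> advance by dim*dim per layer
            const long att_offset  = (long) i * dim * dim;
            // w1/w3: n_layers x hidden_dim x dim and w2: n_layers x dim x hidden_dim
            // -> both advance by dim*hidden_dim floats per layer
            const long ffn_offset  = (long) i * dim * hidden_dim;
            // rms_att_weight / rms_ffn_weight are one row per layer -> advance by dim
            const long norm_offset = (long) i * dim;
            printf("layer %d: att %ld, ffn %ld, norm %ld\n",
                   i, att_offset, ffn_offset, norm_offset);
        }
        return 0;
    }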