diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp
index a65ceba0e..07086a635 100644
--- a/examples/control-vector-generator/control-vector-generator.cpp
+++ b/examples/control-vector-generator/control-vector-generator.cpp
@@ -70,7 +70,7 @@ struct callback_data {
         t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
         ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
         ggml_set_name(t_layer, ggml_get_name(t));
-        print_debug_tensor(t_layer);
+        //print_debug_tensor(t_layer);
 
         if (is_eval_pos) {
             v_pos.push_back(t_layer);
@@ -99,7 +99,7 @@ struct callback_data {
 
     // delete zero rows from a given 2D tensor
     struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
-        printf("filter_nonzero_rows\n");
+        //printf("filter_nonzero_rows\n");
         auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
             // check if given row containing all zero elements
             int n_cols = t->ne[0]; // hint: should be equal to n_embd
@@ -119,7 +119,7 @@ struct callback_data {
 
         // get "n_nonzero_rows" for the output "diff_filtered"
         int n_nonzero_rows = rows_to_copy.size();
-        printf("n_nonzero_rows: %d\n", n_nonzero_rows);
+        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
         int n_embd = a->ne[0];
         GGML_ASSERT(n_nonzero_rows > 0);
 
@@ -138,7 +138,7 @@ struct callback_data {
             }
         }
 
-        print_debug_tensor(diff_filtered);
+        //print_debug_tensor(diff_filtered);
 
         return diff_filtered;
     }
@@ -169,7 +169,8 @@ struct train_context {
 
     // each element of the vector correspond to one layer
    // NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
-    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [n_embd, m] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
+    // NOTE (2): v_diff is transposed from v_diff_tmp
+    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
     std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
 
     // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
@@ -196,7 +197,7 @@ struct train_context {
 
     // add new rows into existing tensor in v_diff_tmp
     void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
-        GGML_ASSERT(diff_filtered.size() == n_layers - 1);
+        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
         for (int il = 0; il < n_layers - 1; il++) {
             auto t = diff_filtered[il];
             auto & diff_tmp = v_diff_tmp[il];
@@ -206,32 +207,46 @@ struct train_context {
         }
     }
 
-    // build the v_diff tensors from v_diff_tmp
+    // build the v_diff tensors from v_diff_tmp (v_diff needs to be transposed)
     void build_v_diff() {
+        printf("build_v_diff\n");
         for (int il = 0; il < n_layers - 1; il++) {
             auto & diff_tmp = v_diff_tmp[il];
             int n_elem = diff_tmp.size() / sizeof(float);
+            GGML_ASSERT(n_elem % n_embd == 0);
             int n_rows = n_elem / n_embd;
             struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
             ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
-            // TODO: IMPORTANT!! transpose diff
-            diff->data = diff_tmp.data();
+            // copy data & transpose
+            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
+            float * arr = (float *) diff_tmp.data();
+            for (int ir = 0; ir < n_rows; ++ir) {
+                for (int ic = 0; ic < n_embd; ++ic) {
+                    float f = arr[ir*n_embd + ic];
+                    //std::cout << ir << "," << ic << " = " << f << "\n";
+                    ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+                }
+            }
             v_diff.push_back(diff);
+            print_debug_tensor(diff);
+            // free memory of diff_tmp
+            diff_tmp.resize(0);
         }
     }
 
     ~train_context() {
         for (auto ptr : v_final) free(ptr->data);
-        // no need to free v_diff_tmp or v_diff, since we didn't use malloc
+        for (auto ptr : v_diff) free(ptr->data);
+        // no need to free v_diff_tmp, since we didn't use malloc
         ggml_free(ctx_ggml);
     }
 };
 
 struct ctrl_params {
     /* default meta parameters */
-    bool always_reload = false;
     int n_completions = 64;
-    int n_threads = 8;
+    int n_pca_batch = 5;
+    int n_pca_iterations = 1000;
 
     /* default filepaths */
     std::string outfile = "control_vector.gguf";
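
Note: the nested loop added in build_v_diff performs the transpose purely through index order. diff is allocated with ne = {n_rows, n_embd}, and ggml's ne[0] is the fastest-varying dimension, so ggml_set_f32_nd(diff, ir, ic, 0, 0, f) stores element (ir, ic) at flat offset ic * n_rows + ir. A minimal flat-buffer sketch of the same operation, assuming contiguous F32 data (transpose_rows, src and dst are illustrative names, not part of the patch):

    // flat row-major transpose: src is [n_rows x n_embd], dst receives the
    // transpose, i.e. dst viewed as [n_embd x n_rows] row-major
    static void transpose_rows(const float * src, float * dst, int n_rows, int n_embd) {
        for (int ir = 0; ir < n_rows; ++ir) {
            for (int ic = 0; ic < n_embd; ++ic) {
                dst[ic * n_rows + ir] = src[ir * n_embd + ic];
            }
        }
    }

Going through ggml_set_f32_nd is slower than the raw loop above, but it stays correct even if the destination tensor is ever made non-contiguous.
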
@@ -295,9 +310,10 @@ static void print_usage(const char * executable) {
     printf("          default: 'examples/control-vector-generator/completions.txt'\n");
     printf("  -nc, --num-completions N   number of lines of completions file to use\n");
     printf("          default: 64\n");
-    printf("  -t, --num-threads N        number of threads to use (do not confuse with gpt-opts -t)\n");
-    printf("          default: 8\n");
-    printf("  --always-reload            reload the model for every new template to parse (not recommended)\n");
+    printf("  --pca-batch N              batch size used for PCA\n");
+    printf("          default: 5\n");
+    printf("  --pca-iter N               number of iterations used for PCA\n");
+    printf("          default: 1000\n");
     printf("\n");
     printf("gpt-opts:\n");
     printf("  other options from main\n");
@@ -370,10 +386,10 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params)
                 throw std::invalid_argument("error: missing argument for " + arg);
             }
         }
-        if (arg == "--num-threads" || arg == "-t") {
+        if (arg == "--pca-batch") {
             if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) {
                 try {
-                    params.n_threads = std::stoi(argv[arg_idx]);
+                    params.n_pca_batch = std::stoi(argv[arg_idx]);
                 }
                 catch (const std::invalid_argument & ex) {
                     throw std::invalid_argument("error: invalid argument for " + arg);
@@ -383,9 +399,18 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params)
                 throw std::invalid_argument("error: missing argument for " + arg);
             }
         }
-        if (arg == "--always-reload") {
-            params.always_reload = true;
-            skipme += 1;
+        if (arg == "--pca-iter") {
+            if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) {
+                try {
+                    params.n_pca_iterations = std::stoi(argv[arg_idx]);
+                }
+                catch (const std::invalid_argument & ex) {
+                    throw std::invalid_argument("error: invalid argument for " + arg);
+                }
+                skipme += 2;
+            } else {
+                throw std::invalid_argument("error: missing argument for " + arg);
+            }
         }
         // TODO it might be nice QoL to have single positive/negative args
         // we do not handle any other unknown arguments here because they will be handled by gpt_parse_params
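
Note: the usage text has been aligned with the parser here, which accepts --pca-batch and --pca-iter. Both new flags follow the same two-token pattern: the flag and its value are consumed together (hence skipme += 2), and a following token that begins with the option prefix is rejected as a missing value. A self-contained sketch of the pattern, assuming a hypothetical standalone parser (the real one also forwards unrecognized arguments to the gpt-opts parser mentioned above):

    #include <cstring>
    #include <stdexcept>
    #include <string>

    // parse "--pca-iter N" at position idx; returns how many argv tokens were consumed
    static int parse_pca_iter(int argc, char ** argv, int idx, int & n_pca_iterations) {
        std::string arg = argv[idx];
        if (idx + 1 >= argc || strncmp(argv[idx + 1], "--", 2) == 0) {
            throw std::invalid_argument("error: missing argument for " + arg);
        }
        try {
            n_pca_iterations = std::stoi(argv[idx + 1]);
        } catch (const std::invalid_argument &) {
            throw std::invalid_argument("error: invalid argument for " + arg);
        }
        return 2; // flag + value, mirroring skipme += 2
    }

A typical invocation would then pass, for example, --pca-batch 10 --pca-iter 2000: a larger batch uses more memory but needs fewer graph executions for the same iteration count.
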
@@ -427,7 +452,7 @@ static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool
 
 static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
     auto * cb_data = (callback_data *) user_data;
-    auto ggml_ne_string = [](const ggml_tensor * t) -> std::string {
+    /*auto ggml_ne_string = [](const ggml_tensor * t) -> std::string {
         std::string str;
         for (int i = 0; i < GGML_MAX_DIMS; ++i) {
             str += std::to_string(t->ne[i]);
@@ -436,7 +461,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
             }
         }
         return str;
-    };
+    };*/
 
     static const char * l_out_name = "l_out";
     const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
@@ -473,6 +498,7 @@ static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const
 
     for (size_t i = 0; i < v_ctrl.size(); ++i) {
         gguf_add_tensor(ctx, v_ctrl[i]);
+        print_debug_tensor(v_ctrl[i]);
         printf("Added tensor: %s\n", v_ctrl[i]->name);
     }
 
@@ -489,7 +515,7 @@ static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const
  * Load prompt files and completion file.
  * Then format each pair of prompt + completion to make an entry.
  */
-int prepare_entries(ctrl_params & cparams) {
+static int prepare_entries(ctrl_params & cparams) {
     // load prompts
     std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(cparams.positive_prompts_file);
     std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(cparams.negative_prompts_file);
@@ -511,7 +537,7 @@ int prepare_entries(ctrl_params & cparams) {
         // TODO make this dynamic - allow the user to change it somehow - and adapt based on model
         return persona + " " + suffix; // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
     };
-    for (int i = 0; i < positive_prompts.size(); ++i) {
+    for (size_t i = 0; i < positive_prompts.size(); ++i) {
         for (auto & cmpl : completions) {
             // TODO replicate the truncations done by the python implementation
             cparams.positive_entries.push_back(format_template(positive_prompts[i], cmpl));
@@ -553,7 +579,7 @@ int main(int argc, char ** argv) {
     llama_context * ctx;
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
-    int n_ctx = llama_n_ctx(ctx);
+    // int n_ctx = llama_n_ctx(ctx);
     int n_layers = llama_n_layer(model);
     int n_embd = llama_n_embd(model);
     // get model hint param (a.k.a model arch name)
@@ -574,29 +600,13 @@ int main(int argc, char ** argv) {
     // init train_context
     train_context ctx_train(n_embd, n_layers);
 
-    int token_ct = 0;
-
     for(size_t i = 0; i < cparams.positive_entries.size(); ++i) {
         tokenized_prompt t = tokenized_prompts[i];
         cb_data.n_layers = n_layers;
         cb_data.n_tokens = t.max_seq_len;
 
-        // need to reload the model so it doesn't run out of context
-        // this should scale with -c option passed by main
-        token_ct += 2 * t.max_seq_len;
-        if (token_ct > n_ctx || cparams.always_reload) {
-            //break;
-            llama_free(ctx);
-            llama_free_model(model);
-            std::tie(model, ctx) = llama_init_from_gpt_params(params);
-            token_ct = 2 * t.max_seq_len;
-        }
-        if (token_ct > n_ctx) {
-            fprintf(stderr, "context size exceeded on iteration %zu\n", i);
-            break;
-        }
-
-        printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n",
+        printf("Evaluating prompt[%ld/%ld]: \"%s\" - \"%s\" (%ld tokens)\n",
+            i+1, cparams.positive_entries.size(),
             tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
             tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
             t.max_seq_len);
@@ -610,12 +620,10 @@ int main(int argc, char ** argv) {
         auto v_diff_filtered = cb_data.calc_diff();
 
         // save & concat the filtered v_diff to ctx_train
-        printf("concat_diff_tmp\n");
         ctx_train.concat_diff_tmp(v_diff_filtered);
 
         // reset for next iteration
         cb_data.reset();
-        printf("reset\n");
     }
 
     // done with the model, we can now free it to make gain some memory
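
Note: cb_data.calc_diff() is defined outside the hunks shown here. Conceptually it takes, for each layer, the hidden states captured by cb_eval for the positive and the negative variant of the same prompt and subtracts them; filter_nonzero_rows then drops the all-zero rows before the result is appended via concat_diff_tmp. A hypothetical sketch of the per-layer subtraction, assuming contiguous F32 tensors of identical shape (diff_layer is an illustrative name, not the actual implementation):

    #include "ggml.h"

    // pos/neg hold the captured l_out states of one layer for the two prompts
    static void diff_layer(struct ggml_tensor * pos, struct ggml_tensor * neg) {
        float * a = (float *) pos->data;
        float * b = (float *) neg->data;
        for (int64_t i = 0; i < ggml_nelements(pos); ++i) {
            a[i] -= b[i]; // difference accumulated in place
        }
    }
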
@@ -628,8 +636,10 @@ int main(int argc, char ** argv) {
 
     // run PCA
     PCA::pca_params pca_params;
+    pca_params.n_threads    = params.n_threads;
+    pca_params.n_batch      = cparams.n_pca_batch;
+    pca_params.n_iterations = cparams.n_pca_iterations;
     PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
-    exit(0); // TODO: REMOVE ME !!!!!!!!!!!!!!!!!!!!!!!!
 
     // write output vectors to gguf
     export_gguf(ctx_train.v_final, cparams.outfile, model_hint);
diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp
index 67b914a34..cd1760de9 100644
--- a/examples/control-vector-generator/pca.hpp
+++ b/examples/control-vector-generator/pca.hpp
@@ -38,10 +38,15 @@ struct pca_params {
     int n_batch = 5; // number of iterations do to in one batch. larger the batch, more memory is used
     int n_iterations = 1000;
     float tolerance = 1e-7;
+
+    // for debugging
+    int i_layer = 0;
+    int n_layers = 0;
 };
 
 // result from each iteration
 struct pca_result {
+    struct ggml_tensor * calculated_square = NULL;
     std::vector<struct ggml_tensor *> eigenvectors;
     std::vector<float> distances;
 };
@@ -162,7 +167,6 @@ static struct ggml_cgraph * build_graph_piter(
     // turn v_diff_original into square matrix if needed
     struct ggml_tensor * tmp_square;
     if (calc_square) {
-        print_debug_tensor(model.dev_input);
         tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
         ggml_set_name(tmp_square, "tmp_square");
     }
@@ -229,17 +233,17 @@ static ggml_status compute_piter(
         }
         return i;
     };
-    // get output nodes
+    result.calculated_square = NULL;
     result.eigenvectors.clear();
     result.distances.clear();
     result.eigenvectors.resize(params.n_batch);
     result.distances.resize(params.n_batch);
+    // get output nodes
     for (int i = 0; i < gf->n_nodes; ++i) {
         auto node = gf->nodes[i];
         int iter = -1;
         // find b_tensor (without copying data from device)
         if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
-            print_debug_tensor(node, false);
             result.eigenvectors[iter] = node;
         }
         // find distances, then copy data from device
@@ -247,7 +251,11 @@ static ggml_status compute_piter(
             float d;
             ggml_backend_tensor_get(node, &d, 0, sizeof(float));
             result.distances[iter] = d;
-            std::cout << node->name << " = " << d << "\n";
+            // std::cout << node->name << " = " << d << "\n";
+        }
+        // find tmp_square if it exists (without copying data from device)
+        if (std::string(node->name) == "tmp_square") {
+            result.calculated_square = node;
         }
     }
 }
@@ -258,23 +266,22 @@ static void power_iteration(
         const struct pca_params & params,
         struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
         struct ggml_tensor * output) {
-    printf("in power iteration\n");
-    //int n_embd = input->ne[1];
+    //printf("in power iteration\n");
     struct pca_model model(input);
 
     ggml_gallocr_t allocr = NULL;
     struct pca_result result;
-    struct ggml_tensor * last_eigenvector;
+    struct ggml_tensor * last_eigenvector = NULL;
 
-    int n_iter = params.n_iterations / params.n_batch; // more batch, fewer iterations
-    for (int iter = 0; iter < n_iter; ++iter) {
+    int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations
+    for (int iter = 0; iter < n_iters; ++iter) {
         bool calc_square = (iter == 0); // only need to calculate square for first iteration
         if (allocr) {
             ggml_gallocr_free(allocr);
         }
         allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
         struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
-        ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
+        // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
         compute_piter(params, model, gf, allocr, result);
 
         for (size_t k = 0; k < result.distances.size(); ++k) {
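
Note: the graph built by build_graph_piter implements classic power iteration. With the sample matrix X (one hidden-state difference per row), the method repeatedly computes b <- normalize(A * b) on the n_embd x n_embd square matrix A = X^T * X; b converges to the dominant eigenvector, i.e. the first principal component of the diffs. n_batch steps are fused into one graph, so only n_iterations / n_batch graph executions are needed, and calc_square ensures A is computed once and reused. A plain-CPU sketch of one batched step, assuming a dense row-major matrix (illustrative only; the patch runs this on the ggml backend):

    #include <cmath>
    #include <vector>

    // b <- normalize(A * b), repeated n_batch times; A is n x n, b has n elements
    static void piter_batch(const std::vector<float> & A, std::vector<float> & b,
                            int n, int n_batch) {
        std::vector<float> tmp(n);
        for (int it = 0; it < n_batch; ++it) {
            for (int i = 0; i < n; ++i) {
                float s = 0.0f;
                for (int j = 0; j < n; ++j) {
                    s += A[(size_t) i * n + j] * b[j];
                }
                tmp[i] = s;
            }
            float norm = 0.0f;
            for (int i = 0; i < n; ++i) norm += tmp[i] * tmp[i];
            norm = std::sqrt(norm);
            for (int i = 0; i < n; ++i) b[i] = tmp[i] / norm;
        }
    }
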
@@ -283,31 +290,44 @@ static void power_iteration(
             last_eigenvector = result.eigenvectors[k];
             if (result.distances[k] < params.tolerance) {
                 break; // done
             }
         }
-
-        break; // FIXME
+
+        if (calc_square) {
+            // copy and store the square matrix if needed
+            GGML_ASSERT(result.calculated_square != NULL);
+            std::vector<uint8_t> tmp_buf(ggml_nbytes(model.dev_square));
+            ggml_backend_tensor_get(result.calculated_square, tmp_buf.data(), 0, tmp_buf.size());
+            ggml_backend_tensor_set(model.dev_square, tmp_buf.data(), 0, tmp_buf.size());
+        }
+
+        printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
+            __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
     }
 
+    // get output tensor
+    GGML_ASSERT(last_eigenvector);
     ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
-    print_debug_tensor(output);
+    //print_debug_tensor(output);
     ggml_gallocr_free(allocr);
 }
 
 static void run_pca(
-        const struct pca_params & params,
-        const std::vector<struct ggml_tensor *> & v_input,
+        struct pca_params & params,
+        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
         const std::vector<struct ggml_tensor *> & v_output) {
     printf("Running PCA...\n");
-    int n_embd = v_input[0]->ne[0]; // shape of v_input[0]: [n_embd, m]
 
     for (size_t il = 0; il < v_input.size(); ++il) {
-        print_debug_tensor(v_input[il]);
+        // prepare output vector
         struct ggml_tensor * ctrl_out = v_output[il];
         auto name = std::string("direction.") + std::to_string(il + 1);
         ggml_set_name(ctrl_out, name.c_str());
+        // run power_iteration
+        params.i_layer = il;
+        params.n_layers = v_input.size();
         power_iteration(params, v_input[il], ctrl_out);
-        printf("Done with layer %d\n", il);
-        print_debug_tensor(ctrl_out);
+        printf("DONE layer %ld / %ld\n", il+1, v_input.size());
+        //print_debug_tensor(ctrl_out);
     }
     printf("Done with PCA.\n");
 }
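
Note: for each layer, run_pca leaves the dominant eigenvector in v_output[il] under the name direction.<il+1>, and export_gguf writes those tensors to the output GGUF. A consumer applies such a control vector by adding each layer's direction, scaled by a signed strength, onto that layer's hidden state; a positive strength steers generation toward the behavior of the positive prompts, a negative one away from it. A hypothetical sketch of the application step (llama.cpp performs the equivalent internally when a control vector is loaded; apply_direction is an illustrative name):

    // add a scaled per-layer direction onto one token's hidden state
    static void apply_direction(float * hidden, const float * direction,
                                int n_embd, float strength) {
        for (int i = 0; i < n_embd; ++i) {
            hidden[i] += strength * direction[i];
        }
    }
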