diff --git a/examples/cvector-generator/mini-tests/test-vanilla-pca.cpp b/examples/cvector-generator/mini-tests/test-vanilla-pca.cpp
index 72405762b..7a78e2452 100644
--- a/examples/cvector-generator/mini-tests/test-vanilla-pca.cpp
+++ b/examples/cvector-generator/mini-tests/test-vanilla-pca.cpp
@@ -4,20 +4,25 @@
 #include "ggml.h"
 #include "../pca.hpp"
 
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
+#include "ggml-cpp.h"
+#include "ggml-backend.h"
 
 #include <cstdio>
 #include <cstring>
 
 // Function to run PCA and print results
-static void run_pca_test(struct ggml_context *ctx, float *matrix, int rows, int cols) {
-    // struct ggml_tensor *input_tensor = create_tensor(ctx, matrix, rows, cols);
+static void run_pca_test(float *matrix, int rows, int cols) {
+    // Initialize ggml context
+    size_t ctx_size = 0;
+    ctx_size += rows * cols * ggml_type_size(GGML_TYPE_F32);
+    ctx_size += 1 * ggml_tensor_overhead();
+
+    struct ggml_init_params ctx_params {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+    struct ggml_context * ctx = ggml_init(ctx_params);
 
     struct ggml_tensor *input_tensor = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, rows, cols);
     memcpy(input_tensor->data, matrix, rows * cols * sizeof(float));
@@ -37,32 +42,10 @@ static void run_pca_test(struct ggml_context *ctx, float *matrix, int
     printf("\nEigenvalue: %f\n", result.explained_variance);
 
     free(result.principal_component);
+    ggml_free(ctx);
 }
 
 int main() {
-    // Initialize ggml context
-    size_t ctx_size = 0;
-    ctx_size += 4 * 4 * ggml_type_size(GGML_TYPE_F32);
-    ctx_size += 10 * 10 * ggml_type_size(GGML_TYPE_F32);
-    ctx_size += 3 * 3 * ggml_type_size(GGML_TYPE_F32);
-    ctx_size += 3 * 3 * ggml_type_size(GGML_TYPE_F32);
-    ctx_size += 4 * ggml_tensor_overhead();
-    ctx_size += 1024;
-
-    // Step 2. Initialize GGML Context
-    struct ggml_init_params ctx_params {
-        ctx_size,   // mem_size
-        NULL,       // mem_buffer
-        false,      // no_alloc
-    };
-    struct ggml_context * ctx = ggml_init(ctx_params);
-
-
-    if (ctx == NULL) {
-        printf("Failed to initialize ggml context\n");
-        return 1;
-    }
-
     // Define matrices
     float input_matrix1[16] = {
         -0.124132, 0.740341, -0.452462, 0.777050,
@@ -98,19 +81,18 @@ int main() {
 
     // Run PCA for each matrix
     printf("Testing Matrix 1:\n");
-    run_pca_test(ctx, input_matrix1, 4, 4);
+    run_pca_test(input_matrix1, 4, 4);
 
     printf("\nTesting Matrix 2:\n");
-    run_pca_test(ctx, input_matrix2, 10, 10);
+    run_pca_test(input_matrix2, 10, 10);
 
     printf("\nTesting Matrix 3:\n");
-    run_pca_test(ctx, input_matrix3, 3, 3);
+    run_pca_test(input_matrix3, 3, 3);
 
     printf("\nTesting Matrix 4:\n");
-    run_pca_test(ctx, input_matrix4, 3, 3);
+    run_pca_test(input_matrix4, 3, 3);
 
     // Cleanup
-    ggml_free(ctx);
 
     return 0;
 }
diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp
index 7a84ac1d5..a6ecd22ae 100644
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@@ -51,7 +51,6 @@ static void compute_covariance(struct pca_params &pca_params,
                                struct ggml_backend * backend) {
 
     size_t ctx_size = 0;
-    ctx_size += 7 * X->ne[0] * X->ne[1] * ggml_type_size(GGML_TYPE_F32);
     ctx_size += 7 * ggml_tensor_overhead();
     ctx_size += ggml_graph_overhead();
     ctx_size += 1024;
@@ -105,7 +104,6 @@ static void compute_cross_covariance(struct pca_params &pca_params,
                                      struct ggml_backend * backend) {
 
     size_t ctx_size = 0;
-    ctx_size += 9 * A->ne[0] * B->ne[1] * ggml_type_size(GGML_TYPE_F32);
     ctx_size += 9 * ggml_tensor_overhead();
     ctx_size += ggml_graph_overhead();
     ctx_size += 1024;
@@ -280,7 +278,6 @@ static void run_single_pca(struct pca_params &pca_params,
 
     // Compute the context size needed
     size_t ctx_size = 0;
-    ctx_size += m * m * ggml_type_size(GGML_TYPE_F32);
     ctx_size += 1 * ggml_tensor_overhead();
 
     // Step 2. Initialize GGML Context
diff --git a/examples/cvector-generator/vanilla_pca.hpp b/examples/cvector-generator/vanilla_pca.hpp
deleted file mode 100644
index b4350db82..000000000
--- a/examples/cvector-generator/vanilla_pca.hpp
+++ /dev/null
@@ -1,314 +0,0 @@
-#include "common.h"
-#include "llama.h"
-#include "ggml.h"
-
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <random>
-#include <string>
-#include <vector>
-
-#define DEBUG_POS 5
-
-static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
-    printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
-    if (!with_data) return;
-    printf("%s: %s[0] = [", __func__, t->name);
-    for (size_t i = 0; i <= DEBUG_POS; i++) {
-        printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
-    }
-    printf(" ... ]\n");
-}
-
-// begin vanilla pca namespace
-namespace PCA {
-
-// input params for PCA computations
-struct pca_params {
-    int n_threads    = 1;
-    int n_batch      = 20;   // number of iterations to do in one batch; the larger the batch, the more memory is used
-    int n_iterations = 1000;
-    float tolerance  = 1e-7;
-};
-
-// result from each iteration
-struct pca_result {
-    struct ggml_tensor * principal_component; // eigenvectors of the covariance matrix
-    float explained_variance;                 // eigenvalues of the covariance matrix
-};
-
-void compute_covariance(struct pca_params &pca_params,
-                        struct ggml_tensor * X,
-                        struct ggml_tensor * covariance,
-                        struct ggml_backend * backend) {
-
-    // Memory allocation
-    struct ggml_cgraph  * gf  = NULL;
-    struct ggml_context * ctx = NULL;
-    struct ggml_init_params ctx_params = {
-        ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-        NULL,
-        true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-    };
-    ctx = ggml_init(ctx_params);
-    gf  = ggml_new_graph(ctx);
-
-    // Step 0: Transpose the input because of row-major
-    X = ggml_cont(ctx, ggml_transpose(ctx, X));
-
-    // Step 1: Compute the mean for each feature
-    struct ggml_tensor * mean = ggml_repeat(ctx, ggml_mean(ctx, X), X); // mean with trick to make it easier to sub
-    struct ggml_tensor * centered_data = ggml_sub(ctx, X, mean);
-
-    // Step 2: Compute the covariance matrix
-    struct ggml_tensor * cov = ggml_mul_mat(ctx, centered_data, centered_data); // C = X * X^T
-    cov = ggml_scale(ctx, cov, 1.0/(X->ne[0]-1));
-    ggml_build_forward_expand(gf, cov);
-
-    // Step 3: Create ggml_gallocr for graph computation
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    // Step 4: Check if CPU and compute the result of the graph
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, pca_params.n_threads);
-    }
-    ggml_backend_graph_compute(backend, gf);
-
-    // Step 5: Store covariance matrix in the data pointer
-    struct ggml_tensor * result = ggml_graph_node(gf, ggml_graph_n_nodes(gf)-1);
-    float * result_data = (float*) malloc(ggml_nbytes(result));
-    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
-    covariance->data = result_data;
-
-    // Step 6: Free memory
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
-}
-
-static void compute_cross_covariance(struct pca_params &pca_params,
-                                     struct ggml_tensor * A,
-                                     struct ggml_tensor * B,
-                                     struct ggml_tensor * cross_covariance,
-                                     struct ggml_backend * backend) {
-
-    // Memory allocation
-    struct ggml_cgraph  * gf  = NULL;
-    struct ggml_context * ctx = NULL;
-    struct ggml_init_params ctx_params = {
-        ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-        NULL,
-        true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-    };
-    ctx = ggml_init(ctx_params);
-    gf  = ggml_new_graph(ctx);
-
-    // Step 1: Compute matrices of cross_covariance
-    struct ggml_tensor * AT   = ggml_cont(ctx, ggml_transpose(ctx, A));
-    struct ggml_tensor * BT   = ggml_cont(ctx, ggml_transpose(ctx, B));
-    struct ggml_tensor * AT_B = ggml_mul_mat(ctx, AT, BT);
-    struct ggml_tensor * BT_A = ggml_cont(ctx, ggml_transpose(ctx, AT_B));
-
-    // Step 2: Compute the covariance matrix
-    struct ggml_tensor * cross_cov = ggml_add(ctx, AT_B, BT_A);
-    cross_cov = ggml_scale(ctx, cross_cov, 0.5);
-    ggml_build_forward_expand(gf, cross_cov);
-
-    // Step 3: Create ggml_gallocr for graph computation
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    // Step 4: Check if CPU and compute the result of the graph
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, pca_params.n_threads);
-    }
-    ggml_backend_graph_compute(backend, gf);
-
-    // Step 5: Store covariance matrix in the data pointer
-    struct ggml_tensor * result = ggml_graph_node(gf, ggml_graph_n_nodes(gf)-1);
-    float * result_data = (float*) malloc(ggml_nbytes(result));
-    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
-    cross_covariance->data = result_data;
-
-    // Step 6: Free memory
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
-}
-
-// Find the dominant eigenvector of tensor M
-static void power_iteration(struct pca_params &pca_params,
-                            struct ggml_tensor * M,
-                            struct pca_result &result,
-                            struct ggml_backend * backend) {
-
-    int m = M->ne[1];
-
-    // Initialize random vector
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
-    float * b = (float*) malloc(m * sizeof(float));
-    for (int i = 0; i < m; i++) {
-        b[i] = dist(gen);
-    };
-    float eigenvalue = 0;
-
-    // Iterate
-    int n_rounds = pca_params.n_iterations / pca_params.n_batch;
-    for(int i = 0; i < n_rounds; i++) {
-
-        // Memory allocation
-        struct ggml_cgraph  * gf  = NULL;
-        struct ggml_context * ctx = NULL;
-        struct ggml_init_params params = {
-            ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-            NULL,
-            true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-        };
-        ctx = ggml_init(params);
-        gf  = ggml_new_graph(ctx);
-
-        // Fill current eigen vector
-        struct ggml_tensor * e_curr = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, m);
-        struct ggml_tensor * e_prev = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, m);
-
-        ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-        ggml_backend_tensor_set(e_curr, b, 0, ggml_nbytes(e_curr));
-        ggml_backend_tensor_set(e_prev, b, 0, ggml_nbytes(e_curr));
-
-        struct ggml_tensor * e_next = NULL;
-        struct ggml_tensor * e_norm = NULL;
-        for(int j = 0; j < pca_params.n_batch; j++) {
-            // Compute next candidate vector by multiplying M with the current vector
-            e_next = ggml_mul_mat(ctx, M, e_curr);
-
-            // Compute the norm of the new vector (and normalize it)
-            // this will give us the next eigenvector and eigenvalue
-            e_norm = ggml_sqrt_inplace(ctx, ggml_sum_rows(ctx, ggml_sqr(ctx, e_next)));
-            e_curr = ggml_div_inplace(ctx, e_next, e_norm);
-            ggml_format_name(e_norm, "eigenvalue_%d", j);
-            ggml_format_name(e_curr, "eigenvector_%d", j);
-
-            // Update graph
-            ggml_build_forward_expand(gf, e_curr);
-        }
-
-        // Compute the similarity between the current eigenvector and the previous (dot product)
-        struct ggml_tensor * similarity = ggml_mul_mat(ctx, e_curr, e_prev);
-        ggml_build_forward_expand(gf, similarity);
-
-        // Create ggml_gallocr for graph computation
-        ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-        ggml_gallocr_alloc_graph(allocr, gf);
-
-        // Check if CPU and compute the result of the graph
-        if (ggml_backend_is_cpu(backend)) {
-            ggml_backend_cpu_set_n_threads(backend, pca_params.n_threads);
-        }
-        ggml_status graph_status = ggml_backend_graph_compute(backend, gf);
-
-        // Get graph results (eigenvector and eigenvalue) and store them in b and eigenvalue
-        if(graph_status == GGML_STATUS_SUCCESS){
-
-            // Similarity is the last node in the graph
-            struct ggml_tensor * similarity_tensor = ggml_graph_node(gf, ggml_graph_n_nodes(gf)-1);
-            float similarity = (float)((float*) similarity_tensor->data)[0];
-
-            // Eigenvector is the second last node in the graph
-            // struct ggml_tensor * eigenvector_tensor = gf->nodes[gf->n_nodes-2];
-            struct ggml_tensor * eigenvector_tensor = ggml_graph_node(gf, ggml_graph_n_nodes(gf)-2);
-            float * eigenvector_data = (float*) malloc(ggml_nbytes(eigenvector_tensor));
-            ggml_backend_tensor_get(eigenvector_tensor, eigenvector_data, 0, ggml_nbytes(eigenvector_tensor));
-            b = eigenvector_data;
-
-            // Eigenvalue computation is 1 operation before eigenvector computation
-            // struct ggml_tensor * eigenvalue_tensor = gf->nodes[gf->n_nodes-3];
-            struct ggml_tensor * eigenvalue_tensor = ggml_graph_node(gf, ggml_graph_n_nodes(gf)-3);
-            eigenvalue = (float)((float*) eigenvalue_tensor->data)[0];
-
-            // Check if the similarity is close enough to 1, if so we converged and should break
-            if(1 - similarity < pca_params.tolerance)
-                break;
-        }
-
-        // Free memory
-        ggml_gallocr_free(allocr);
-        ggml_free(ctx);
-    }
-
-    // Store result
-    result.principal_component->data = b;
-    result.explained_variance = eigenvalue;
-    return;
-}
-
-static void run_single_pca(struct pca_params &pca_params,
-                           struct ggml_tensor * X,
-                           struct pca_result &result
-                           ) {
-
-    ggml_set_name(X, "input_tensor");
-
-    int m = X->ne[1]; // Number of features
-
-    // Step 1. Initialize GGML Backend
-    ggml_backend_t backend = NULL;
-    #ifdef GGML_USE_CUDA
-    fprintf(stderr, "%s: using CUDA backend\n", __func__);
-    backend = ggml_backend_cuda_init(0); // init device 0
-    if (!backend) { fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); }
-    #endif
-    // If there is no GPU backend, fall back to the CPU backend
-    if (!backend) { backend = ggml_backend_cpu_init(); }
-
-    // Compute the context size needed
-    size_t ctx_size = 2 * ggml_tensor_overhead();
-
-    // Step 2. Initialize GGML Context
-    struct ggml_init_params ctx_params {
-        ctx_size,   // mem_size
-        NULL,       // mem_buffer
-        true,       // no_alloc
-    };
-    struct ggml_context * ctx = ggml_init(ctx_params);
-
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-    // Step 3. Compute the data covariance matrix
-    struct ggml_tensor * covariance = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, m, m);
-    ggml_set_name(covariance, "covariance_tensor");
-    compute_covariance(pca_params, X, covariance, backend);
-
-    // Step 4. Power iteration
-    result.principal_component = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, m);
-    power_iteration(pca_params, covariance, result, backend);
-
-    // Free ggml context and backend
-    ggml_free(ctx);
-    ggml_backend_free(backend);
-}
-
-
-static void run_pca(
-        struct pca_params & params,
-        const std::vector<struct ggml_tensor *> & v_input,  // shape of v_input[0]: [n_samples, n_embd]
-        const std::vector<struct ggml_tensor *> & v_output) {
-
-    for (size_t i = 0; i < v_input.size(); i++) {
-        struct pca_result result;
-        run_single_pca(params, v_input[i], result);
-        ggml_backend_tensor_get(result.principal_component, v_output[i]->data, 0, ggml_nbytes(result.principal_component));
-    }
-}
-
-// end namespace
-}
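
Note on the algorithm in this patch: both the retained pca.hpp and the deleted vanilla_pca.hpp compute PCA by forming the covariance matrix and then running power iteration as a ggml graph (repeated matrix-vector products, normalization, and a dot-product "similarity" node as the convergence test). The following is a minimal plain-C++ sketch of that same loop on a small symmetric matrix; it is illustrative only, and power_step plus the hard-coded 2x2 matrix are assumptions for the example, not code from this patch.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // One step of power iteration: b <- M*b / ||M*b||.
    // For a normalized b, ||M*b|| converges to the dominant eigenvalue of M.
    static float power_step(const std::vector<float> & M, std::vector<float> & b, int m) {
        std::vector<float> next(m, 0.0f);
        for (int r = 0; r < m; r++) {
            for (int c = 0; c < m; c++) {
                next[r] += M[r * m + c] * b[c];     // next = M * b (row-major mat-vec)
            }
        }
        float norm = 0.0f;
        for (int r = 0; r < m; r++) { norm += next[r] * next[r]; }
        norm = std::sqrt(norm);
        for (int r = 0; r < m; r++) { b[r] = next[r] / norm; }   // normalize
        return norm;
    }

    int main() {
        const int m = 2;
        std::vector<float> M = { 2.0f, 1.0f,
                                 1.0f, 2.0f };   // symmetric; eigenvalues are 3 and 1
        std::vector<float> b = { 1.0f, 0.0f };   // any non-zero start vector works
        float eigenvalue = 0.0f;
        for (int i = 0; i < 1000; i++) {
            std::vector<float> prev = b;
            eigenvalue = power_step(M, b, m);
            // dot(b, prev) plays the role of the "similarity" graph node above
            float similarity = b[0] * prev[0] + b[1] * prev[1];
            if (1.0f - similarity < 1e-7f) { break; }   // converged
        }
        printf("eigenvalue ~ %f, eigenvector ~ [%f, %f]\n", eigenvalue, b[0], b[1]);
        return 0;
    }

On this 2x2 example the loop converges to eigenvalue 3 with eigenvector (1,1)/sqrt(2), which is the same result power_iteration in the deleted file produces batch-by-batch on the ggml backend.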