From 9e39571fc2ee5a76230215dcde85b216b0a3e086 Mon Sep 17 00:00:00 2001
From: ngxson
Date: Tue, 11 Jun 2024 11:45:16 +0200
Subject: [PATCH] add n_batch for pca

---
 .../control-vector-generator.cpp          |   8 +-
 examples/control-vector-generator/pca.hpp | 300 +++++++++++-------
 2 files changed, 187 insertions(+), 121 deletions(-)

diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp
index 2e7f36635..a65ceba0e 100644
--- a/examples/control-vector-generator/control-vector-generator.cpp
+++ b/examples/control-vector-generator/control-vector-generator.cpp
@@ -188,7 +188,9 @@ struct train_context {
         for (int il = 0; il < n_layers - 1; il++) {
             std::vector<uint8_t> empty;
             v_diff_tmp.push_back(empty);
-            v_final.push_back(ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd));
+            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
+            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
+            v_final.push_back(t);
         }
     }

@@ -625,7 +627,9 @@ int main(int argc, char ** argv) {
     ctx_train.build_v_diff();

     // run PCA
-    PCA::run_pca(ctx_train.v_diff, ctx_train.v_final);
+    PCA::pca_params pca_params;
+    PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+    exit(0); // TODO: REMOVE ME !!!!!!!!!!!!!!!!!!!!!!!!

     // write output vectors to gguf
     export_gguf(ctx_train.v_final, cparams.outfile, model_hint);
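A note on the malloc workaround above: the TODO suggests ctx_ggml does not allocate tensor data itself, so ggml_free(ctx_ggml) will not release these manually malloc'ed blocks; they would have to be freed by hand. A hypothetical cleanup helper, not part of this patch and named here only for illustration, could look like:

    // hypothetical helper (not in this patch): release the manually
    // malloc'ed buffers of v_final before the context is freed
    static void free_v_final(std::vector<struct ggml_tensor *> & v_final) {
        for (auto * t : v_final) {
            free(t->data); // pairs with the malloc in the train_context constructor
            t->data = NULL;
        }
    }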
diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp
index a7c76e561..67b914a34 100644
--- a/examples/control-vector-generator/pca.hpp
+++ b/examples/control-vector-generator/pca.hpp
@@ -20,8 +20,9 @@

 #define DEBUG_POS 5

-static void print_debug_tensor(struct ggml_tensor * t) {
+static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
     printf("%s: %s (%s): [%ld, %ld]\n", __func__, t->name, ggml_type_name(t->type), t->ne[0], t->ne[1]);
+    if (!with_data) return;
     printf("%s: %s[0] = [", __func__, t->name);
     for (size_t i = 0; i <= DEBUG_POS; i++) {
         printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
@@ -31,79 +32,121 @@

 namespace PCA {

-struct pca_model {
-    struct ggml_tensor * v_diff_original;
-    struct ggml_tensor * square;
-    struct ggml_tensor * eigenvector;
-
-    ggml_backend_t backend = NULL;
-    ggml_backend_buffer_t buffer;
-    struct ggml_context * ctx;
+// input params for PCA computations
+struct pca_params {
+    int n_threads = 1;
+    int n_batch = 5; // number of iterations to do in one batch; the larger the batch, the more memory is used
+    int n_iterations = 1000;
+    float tolerance = 1e-7;
 };

-void load_pca_model(pca_model & model, struct ggml_tensor * input) {
+// result from each iteration
+struct pca_result {
+    std::vector<struct ggml_tensor *> eigenvectors;
+    std::vector<float> distances;
+};
+
+struct pca_model {
+    ggml_backend_t backend = NULL;
+    ggml_backend_buffer_t buffer;
+    struct ggml_context * ctx;      // context to compute graph on target device
+    struct ggml_context * ctx_host; // host context to store results
+
+    // tensors on target device
+    struct ggml_tensor * dev_input;
+    struct ggml_tensor * dev_square;
+    struct ggml_tensor * dev_eigenvector;
+
+    // tensors to store output data on host
+    struct ggml_tensor * host_eigenvector;
+
+    pca_model(struct ggml_tensor * t_input) {
 #ifdef GGML_USE_CUDA
-    fprintf(stderr, "%s: using CUDA backend\n", __func__);
-    model.backend = ggml_backend_cuda_init(0); // init device 0
-    if (!model.backend) {
-        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-    }
+        fprintf(stderr, "%s: using CUDA backend\n", __func__);
+        backend = ggml_backend_cuda_init(0); // init device 0
+        if (!backend) {
+            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+        }
 #endif

 #ifdef GGML_USE_METAL
-    fprintf(stderr, "%s: using Metal backend\n", __func__);
-    ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
-    model.backend = ggml_backend_metal_init();
-    if (!model.backend) {
-        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-    }
+        fprintf(stderr, "%s: using Metal backend\n", __func__);
+        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
+        backend = ggml_backend_metal_init();
+        if (!backend) {
+            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+        }
 #endif

     // if there is no GPU backend, fall back to the CPU backend
-    if (!model.backend) {
-        model.backend = ggml_backend_cpu_init();
+        if (!backend) {
+            backend = ggml_backend_cpu_init();
+        }
+
+        const int num_tensors = 4;
+        struct ggml_init_params params {
+            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ctx = ggml_init(params);
+
+        auto n_samples = t_input->ne[0];
+        auto n_embd    = t_input->ne[1];
+
+        dev_input       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
+        dev_square      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        ggml_set_name(dev_input, "dev_input");
+        ggml_set_name(dev_square, "dev_square");
+        ggml_set_name(dev_eigenvector, "dev_eigenvector");
+        buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+        ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
+
+        // initialize eigenvector to random normalized vector
+        {
+            std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
+            std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
+            std::uniform_real_distribution<float> distribution(0.0, 1.0);
+            float sum_sqr = 0.0; // for normalizing random_vec
+            for (size_t i = 0; i < random_vec.size(); ++i) {
+                float f = distribution(generator);
+                sum_sqr += f * f;
+                random_vec[i] = f;
+            }
+            // normalize it
+            float random_vec_norm = std::sqrt(sum_sqr);
+            for (size_t i = 0; i < random_vec.size(); ++i) {
+                random_vec[i] /= random_vec_norm;
+            }
+            ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
+        }
+
+        // init host context
+        struct ggml_init_params host_params = {
+            /*.mem_size   =*/ (n_embd * sizeof(float) + ggml_tensor_overhead()) * 2u,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ false,
+        };
+        ctx_host = ggml_init(host_params);
+        host_eigenvector = ggml_new_tensor_1d(ctx_host, GGML_TYPE_F32, n_embd);
     }

-    const int num_tensors = 4;
-    struct ggml_init_params params {
-        /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true,
-    };
-    model.ctx = ggml_init(params);
-
-    auto n_embd = input->ne[1];
-    auto n_samples = input->ne[0];
-
-    model.v_diff_original = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_samples, n_embd);
-    model.square = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_embd, n_embd);
-    model.eigenvector = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, n_embd);
-
-    ggml_set_name(model.v_diff_original, "v_diff_original");
-    ggml_set_name(model.square, "square");
-    ggml_set_name(model.eigenvector, "eigenvector");
-
-    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);
-
-    ggml_backend_tensor_set(model.v_diff_original, input->data, 0, ggml_nbytes(input));
-
-    // initialize model.eigenvector to random vector
-    std::vector<float> random_vec;
-    std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
-    std::uniform_real_distribution<float> distribution(0.0, 1.0);
-    for (int i = 0; i < ggml_nelements(model.eigenvector); ++i) {
-        random_vec.push_back(distribution(generator));
+    ~pca_model() {
+        ggml_free(ctx_host);
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buffer);
+        ggml_backend_free(backend);
     }
-
-    // we don't normalize it at first but that shouldn't be a problem
-    ggml_backend_tensor_set(model.eigenvector, random_vec.data(), 0, ggml_nbytes(model.eigenvector));
-}
+};
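The graph that build_graph_piter constructs below encodes textbook power iteration: repeatedly set b <- S*b, renormalize, and measure how far the estimate moved. A minimal host-side sketch of one such step, plain C++ over std::vector with illustrative names and no ggml, assuming a row-major square matrix:

    #include <cmath>
    #include <vector>

    // one power-iteration step on a row-major n x n matrix S:
    // b <- normalize(S * b); returns the L2 distance between the old and
    // new estimates, i.e. the same "distance" the ggml graph computes
    static float power_iter_step(const std::vector<float> & S, std::vector<float> & b, int n) {
        std::vector<float> nb(n, 0.0f);
        for (int i = 0; i < n; ++i) {
            for (int j = 0; j < n; ++j) {
                nb[i] += S[i * n + j] * b[j];
            }
        }
        float norm = 0.0f;
        for (int i = 0; i < n; ++i) {
            norm += nb[i] * nb[i];
        }
        norm = std::sqrt(norm);
        float dist = 0.0f;
        for (int i = 0; i < n; ++i) {
            nb[i] /= norm;
            dist += (nb[i] - b[i]) * (nb[i] - b[i]);
            b[i]  = nb[i];
        }
        return std::sqrt(dist);
    }

The dominant eigenvector of S = input * input^T is the first principal component of the input rows, which is why the first batched graph below also computes the square matrix before iterating.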
 static struct ggml_cgraph * build_graph_piter(
+        const struct pca_params & params,
         const pca_model & model,
-        bool calc_square = false,
-        int nb_iterations = 1) {
-    GGML_ASSERT(nb_iterations > 0);
+        bool calc_square = false) {
+    GGML_ASSERT(params.n_batch > 0);
+    // TODO: buf_size must be able to scale with params.n_batch
     static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
     static std::vector<uint8_t> buf(buf_size);

@@ -117,20 +160,21 @@ static struct ggml_cgraph * build_graph_piter(
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);

     // turn v_diff_original into square matrix if needed
-    struct ggml_tensor * square;
+    struct ggml_tensor * tmp_square;
     if (calc_square) {
-        //struct ggml_tensor * v_diff_transposed = ggml_transpose(ctx0, model.v_diff_original);
-        print_debug_tensor(model.v_diff_original);
-        square = ggml_mul_mat(ctx0, model.v_diff_original, model.v_diff_original);
-        ggml_set_name(square, "square");
-        //model.square = ggml_scale_inplace(ctx0, model.square, 0.0);
+        print_debug_tensor(model.dev_input);
+        tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
+        ggml_set_name(tmp_square, "tmp_square");
     }

     struct ggml_tensor * b_tensor;
+    struct ggml_tensor * distance;
+    struct ggml_tensor * old_eigen    = model.dev_eigenvector;
+    struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;

-    for (int i = 0; i < nb_iterations; ++i) {
+    for (int i = 0; i < params.n_batch; ++i) {
         // b_tensor = square * eigenvector^T
-        b_tensor = ggml_mul_mat(ctx0, square, model.eigenvector);
+        b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
         ggml_set_name(b_tensor, "b_tensor");

         // normalize
@@ -138,104 +182,122 @@ static struct ggml_cgraph * build_graph_piter(
         b_tensor = ggml_div_inplace(ctx0,
             b_tensor,
             ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
         );
-    }
+        ggml_set_name(b_tensor, ("b_tensor_norm_" + std::to_string(i)).c_str());

-    // calculate distance
-    struct ggml_tensor * distance;
-    {
-        distance = ggml_sub(ctx0, model.eigenvector, b_tensor);
-        ggml_set_name(distance, "distance");
+        // calculate the distance between the new and the old eigenvectors
+        struct ggml_tensor * new_sub_old = ggml_sub(ctx0, old_eigen, b_tensor);
         distance = ggml_sqrt_inplace(ctx0,
-            ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, distance)));
-    }
+            ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
+        ggml_set_name(distance, ("distance_" + std::to_string(i)).c_str());

-    // build operations nodes
-    ggml_build_forward_expand(gf, distance);
+        old_eigen = b_tensor;
+
+        // build operations nodes
+        ggml_build_forward_expand(gf, distance);
+    }

     // delete the temporary context used to build the graph
     ggml_free(ctx0);
     return gf;
 }

-struct ggml_tensor * compute_piter(
+static ggml_status compute_piter(
+        const struct pca_params & params,
         const pca_model & model,
         struct ggml_cgraph * gf,
         ggml_gallocr_t allocr,
-        int n_threads) {
+        struct pca_result & result) {
     // allocate tensors
     ggml_gallocr_alloc_graph(allocr, gf);

     if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
     }

 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
+        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
     }
 #endif

-    ggml_backend_graph_compute(model.backend, gf);
-
-    // in this case, the output tensor is the last one in the graph
-    return gf->nodes[gf->n_nodes - 1];
+    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
+    if (res == GGML_STATUS_SUCCESS) {
+        auto extract_i = [](std::string prefix, std::string str) -> int {
+            int i = -1;
+            if (str.rfind(prefix, 0) == 0) {
+                sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
+            }
+            return i;
+        };
+        // get output nodes
+        result.eigenvectors.clear();
+        result.distances.clear();
+        result.eigenvectors.resize(params.n_batch);
+        result.distances.resize(params.n_batch);
+        for (int i = 0; i < gf->n_nodes; ++i) {
+            auto node = gf->nodes[i];
+            int iter = -1;
+            // find b_tensor (without copying data from device)
+            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
+                print_debug_tensor(node, false);
+                result.eigenvectors[iter] = node;
+            }
+            // find distances, then copy data from device
+            if ((iter = extract_i("distance_", node->name)) > -1) {
+                float d;
+                ggml_backend_tensor_get(node, &d, 0, sizeof(float));
+                result.distances[iter] = d;
+                std::cout << node->name << " = " << d << "\n";
+            }
+        }
+    }
+    return res;
 }
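One way the buf_size TODO in build_graph_piter could be addressed, sketched under the assumption of roughly 16 graph nodes per batched iteration (a generous bound chosen here, not a number the patch specifies): derive the node budget from params.n_batch and reserve a custom-sized graph instead of relying on GGML_DEFAULT_GRAPH_SIZE.

    // untested sketch for the buf_size TODO: let the allocation scale with
    // n_batch; ~16 nodes per iteration is an assumed upper bound covering
    // mul_mat, sqr, sum_rows, sqrt, div, sub and their inplace variants
    static size_t piter_n_nodes(const struct pca_params & params) {
        return 16u * (size_t) params.n_batch + 2u;
    }
    static size_t piter_buf_size(const struct pca_params & params) {
        size_t n_nodes = piter_n_nodes(params);
        return ggml_tensor_overhead() * n_nodes + ggml_graph_overhead_custom(n_nodes, false);
    }

The graph itself would then be created with ggml_new_graph_custom(ctx0, piter_n_nodes(params), false) rather than ggml_new_graph(ctx0).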
 static void power_iteration(
-        struct ggml_tensor * input,
-        struct ggml_tensor * output,
-        int n_threads,
-        int maxIterations = 1000,
-        float tolerance = 1e-7) {
+        const struct pca_params & params,
+        struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
+        struct ggml_tensor * output) {
     printf("in power iteration\n");
-    int n_embd = input->ne[0]; // shape of input: [n_embd, m]
-
-    pca_model model;
-    load_pca_model(model, input);
+    //int n_embd = input->ne[1];
+    struct pca_model model(input);

     ggml_gallocr_t allocr = NULL;
+    struct pca_result result;
+    struct ggml_tensor * last_eigenvector;

-    struct ggml_init_params host_params = {
-        /*.mem_size   =*/ (n_embd * sizeof(float) + ggml_tensor_overhead()) * 4u,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ false,
-    };
-    struct ggml_context * host_ctx = ggml_init(host_params);
-
-    struct ggml_tensor * host_old_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, n_embd);
-    struct ggml_tensor * host_new_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, n_embd);
-
-    for (int iter = 0; iter < maxIterations; ++iter) {
+    int n_iter = params.n_iterations / params.n_batch; // the bigger the batch, the fewer graphs to run
+    for (int iter = 0; iter < n_iter; ++iter) {
+        bool calc_square = (iter == 0); // only need to calculate square for first iteration
         if (allocr) {
             ggml_gallocr_free(allocr);
         }
         allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-        struct ggml_cgraph * gf = build_graph_piter(model, iter == 0);
+        struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
         ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
-        struct ggml_tensor * distance = compute_piter(model, gf, allocr, n_threads);
+        compute_piter(params, model, gf, allocr, result);

-        ggml_backend_tensor_get(distance, host_new_eigenvector->data, 0, ggml_nbytes(distance));
-        print_debug_tensor(host_new_eigenvector);
+        for (size_t k = 0; k < result.distances.size(); ++k) {
+            last_eigenvector = result.eigenvectors[k];
+            if (result.distances[k] < params.tolerance) {
+                break; // done
+            }
+        }

         break; // FIXME
     }

-    ggml_backend_tensor_get(model.eigenvector, output->data, 0, ggml_nbytes(model.eigenvector));
-
+    ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
+    print_debug_tensor(output);
     ggml_gallocr_free(allocr);
-    ggml_free(host_ctx);
-    ggml_free(model.ctx);
-    ggml_backend_buffer_free(model.buffer);
-    ggml_backend_free(model.backend);
-    exit(0);
 }

 static void run_pca(
+        const struct pca_params & params,
         const std::vector<struct ggml_tensor *> & v_input,
         const std::vector<struct ggml_tensor *> & v_output) {
     printf("Running PCA...\n");
     int n_embd = v_input[0]->ne[0]; // shape of v_input[0]: [n_embd, m]
-    int n_threads = 8; // TODO: change me
     for (size_t il = 0; il < v_input.size(); ++il) {
         print_debug_tensor(v_input[il]);
         // prepare output vector
@@ -243,7 +305,7 @@ static void run_pca(
         auto name = std::string("direction.") + std::to_string(il + 1);
         ggml_set_name(ctrl_out, name.c_str());
         // run power_iteration
-        power_iteration(v_input[il], ctrl_out, n_threads);
+        power_iteration(params, v_input[il], ctrl_out);
         printf("Done with layer %d\n", il);
         print_debug_tensor(ctrl_out);
     }
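For context, this is how the reworked entry point is meant to be driven, mirroring the main() hunk at the top of this patch. The values below are illustrative overrides rather than anything the patch sets, and note that the integer division in power_iteration silently rounds n_iterations down to a multiple of n_batch:

    PCA::pca_params pca_params;
    pca_params.n_threads    = 8;    // replaces the n_threads previously hard-coded in run_pca
    pca_params.n_batch      = 10;   // iterations chained into one graph per compute call
    pca_params.n_iterations = 1000; // effectively (1000 / 10) * 10 iterations
    PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);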