add n_batch for pca

ngxson 2024-06-11 11:45:16 +02:00
parent 6a5adf3d7c
commit 9e39571fc2
2 changed files with 187 additions and 121 deletions

View file

@@ -188,7 +188,9 @@ struct train_context {
         for (int il = 0; il < n_layers - 1; il++) {
             std::vector<uint8_t> empty;
             v_diff_tmp.push_back(empty);
-            v_final.push_back(ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd));
+            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
+            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
+            v_final.push_back(t);
         }
     }
@@ -625,7 +627,9 @@ int main(int argc, char ** argv) {
     ctx_train.build_v_diff();
 
     // run PCA
-    PCA::run_pca(ctx_train.v_diff, ctx_train.v_final);
+    PCA::pca_params pca_params;
+    PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+    exit(0); // TODO: REMOVE ME !!!!!!!!!!!!!!!!!!!!!!!!
 
     // write output vectors to gguf
     export_gguf(ctx_train.v_final, cparams.outfile, model_hint);
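
For anyone trying the branch, the new entry point only needs a default-constructed PCA::pca_params. As a purely illustrative call-site sketch (not part of this commit), the fields declared in the PCA header changed below could be overridden before the call:

    PCA::pca_params pca_params;
    pca_params.n_threads    = 8;     // threads used by the CPU (or Metal) backend
    pca_params.n_batch      = 20;    // power-iteration steps fused into one compute graph
    pca_params.n_iterations = 1000;  // total steps; power_iteration runs n_iterations / n_batch graphs
    pca_params.tolerance    = 1e-7;  // early-stop threshold on the eigenvector distance
    PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);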

View file

@@ -20,8 +20,9 @@
 #define DEBUG_POS 5
 
-static void print_debug_tensor(struct ggml_tensor * t) {
+static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
     printf("%s: %s (%s): [%ld, %ld]\n", __func__, t->name, ggml_type_name(t->type), t->ne[0], t->ne[1]);
+    if (!with_data) return;
     printf("%s: %s[0] = [", __func__, t->name);
     for (size_t i = 0; i <= DEBUG_POS; i++) {
         printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
@@ -31,21 +32,39 @@ static void print_debug_tensor(struct ggml_tensor * t) {
 
 namespace PCA {
 
-struct pca_model {
-    struct ggml_tensor * v_diff_original;
-    struct ggml_tensor * square;
-    struct ggml_tensor * eigenvector;
+// input params for PCA computations
+struct pca_params {
+    int n_threads = 1;
+    int n_batch = 5; // number of iterations do to in one batch. larger the batch, more memory is used
+    int n_iterations = 1000;
+    float tolerance = 1e-7;
+};
 
-    ggml_backend_t backend = NULL;
-    ggml_backend_buffer_t buffer;
-    struct ggml_context * ctx;
+// result from each iteration
+struct pca_result {
+    std::vector<struct ggml_tensor *> eigenvectors;
+    std::vector<float> distances;
 };
 
-void load_pca_model(pca_model & model, struct ggml_tensor * input) {
+struct pca_model {
+    ggml_backend_t backend = NULL;
+    ggml_backend_buffer_t buffer;
+    struct ggml_context * ctx;      // context to compute graph on target device
+    struct ggml_context * ctx_host; // host context to store results
+
+    // tensors on target device
+    struct ggml_tensor * dev_input;
+    struct ggml_tensor * dev_square;
+    struct ggml_tensor * dev_eigenvector;
+
+    // tensors to store output data on host
+    struct ggml_tensor * host_eigenvector;
+
+    pca_model(struct ggml_tensor * t_input) {
 #ifdef GGML_USE_CUDA
-    fprintf(stderr, "%s: using CUDA backend\n", __func__);
-    model.backend = ggml_backend_cuda_init(0); // init device 0
-    if (!model.backend) {
-        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-    }
+        fprintf(stderr, "%s: using CUDA backend\n", __func__);
+        backend = ggml_backend_cuda_init(0); // init device 0
+        if (!backend) {
+            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+        }
 #endif
@@ -53,15 +72,15 @@ void load_pca_model(pca_model & model, struct ggml_tensor * input) {
 #ifdef GGML_USE_METAL
         fprintf(stderr, "%s: using Metal backend\n", __func__);
         ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
-    model.backend = ggml_backend_metal_init();
-    if (!model.backend) {
+        backend = ggml_backend_metal_init();
+        if (!backend) {
             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
         }
 #endif
 
         // if there aren't GPU Backends fallback to CPU backend
-    if (!model.backend) {
-        model.backend = ggml_backend_cpu_init();
+        if (!backend) {
+            backend = ggml_backend_cpu_init();
         }
 
         const int num_tensors = 4;
@@ -70,40 +89,64 @@ void load_pca_model(pca_model & model, struct ggml_tensor * input) {
             /*.mem_buffer =*/ NULL,
             /*.no_alloc =*/ true,
         };
-    model.ctx = ggml_init(params);
+        ctx = ggml_init(params);
 
-    auto n_embd = input->ne[1];
-    auto n_samples = input->ne[0];
+        auto n_samples = t_input->ne[0];
+        auto n_embd    = t_input->ne[1];
 
-    model.v_diff_original = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_samples, n_embd);
-    model.square          = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_embd, n_embd);
-    model.eigenvector     = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, n_embd);
+        dev_input       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
+        dev_square      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
-    ggml_set_name(model.v_diff_original, "v_diff_original");
-    ggml_set_name(model.square, "square");
-    ggml_set_name(model.eigenvector, "eigenvector");
+        ggml_set_name(dev_input,       "dev_input");
+        ggml_set_name(dev_square,      "dev_square");
+        ggml_set_name(dev_eigenvector, "dev_eigenvector");
+        buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+        ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
 
-    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);
-
-    ggml_backend_tensor_set(model.v_diff_original, input->data, 0, ggml_nbytes(input));
-
-    // initialize model.eigenvector to random vector
-    std::vector<float> random_vec;
-    std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
-    std::uniform_real_distribution<float> distribution(0.0, 1.0);
-    for (int i = 0; i < ggml_nelements(model.eigenvector); ++i) {
-        random_vec.push_back(distribution(generator));
+        // initialize eigenvector to random normalized vector
+        {
+            std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
+            std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
+            std::uniform_real_distribution<float> distribution(0.0, 1.0);
+            float sum_sqr = 0.0; // for normalizing random_vec
+            for (size_t i = 0; i < random_vec.size(); ++i) {
+                float f = distribution(generator);
+                sum_sqr += f * f;
+                random_vec[i] = f;
+            }
+            // normalize it
+            float random_vec_norm = std::sqrt(sum_sqr);
+            for (size_t i = 0; i < random_vec.size(); ++i) {
+                random_vec[i] /= random_vec_norm;
+            }
+            ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
+        }
+
+        // init host context
+        struct ggml_init_params host_params = {
+            /*.mem_size =*/ (n_embd * sizeof(float) + ggml_tensor_overhead()) * 2u,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc =*/ false,
+        };
+        ctx_host = ggml_init(host_params);
+        host_eigenvector = ggml_new_tensor_1d(ctx_host, GGML_TYPE_F32, n_embd);
     }
-    // we don't normalize it at first but that shouldn't be a problem
-    ggml_backend_tensor_set(model.eigenvector, random_vec.data(), 0, ggml_nbytes(model.eigenvector));
-}
+
+    ~pca_model() {
+        ggml_free(ctx_host);
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buffer);
+        ggml_backend_free(backend);
+    }
+};
 
 static struct ggml_cgraph * build_graph_piter(
+        const struct pca_params & params,
         const pca_model & model,
-        bool calc_square = false,
-        int nb_iterations = 1) {
-    GGML_ASSERT(nb_iterations > 0);
+        bool calc_square = false) {
+    GGML_ASSERT(params.n_batch > 0);
 
+    // TODO: buf_size must be able to scale with params.n_batch
     static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
     static std::vector<uint8_t> buf(buf_size);
@@ -117,20 +160,21 @@ static struct ggml_cgraph * build_graph_piter(
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
     // turn v_diff_original into square matrix if needed
-    struct ggml_tensor * square;
+    struct ggml_tensor * tmp_square;
     if (calc_square) {
-        //struct ggml_tensor * v_diff_transposed = ggml_transpose(ctx0, model.v_diff_original);
-        print_debug_tensor(model.v_diff_original);
-        square = ggml_mul_mat(ctx0, model.v_diff_original, model.v_diff_original);
-        ggml_set_name(square, "square");
-        //model.square = ggml_scale_inplace(ctx0, model.square, 0.0);
+        print_debug_tensor(model.dev_input);
+        tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
+        ggml_set_name(tmp_square, "tmp_square");
     }
 
     struct ggml_tensor * b_tensor;
+    struct ggml_tensor * distance;
+    struct ggml_tensor * old_eigen    = model.dev_eigenvector;
+    struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
 
-    for (int i = 0; i < nb_iterations; ++i) {
+    for (int i = 0; i < params.n_batch; ++i) {
         // b_tensor = square * eigenvector^T
-        b_tensor = ggml_mul_mat(ctx0, square, model.eigenvector);
+        b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
         ggml_set_name(b_tensor, "b_tensor");
 
         // normalize
@@ -138,104 +182,122 @@ static struct ggml_cgraph * build_graph_piter(
             b_tensor,
             ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
         );
-    }
-
-    // calculate distance
-    struct ggml_tensor * distance;
-    {
-        distance = ggml_sub(ctx0, model.eigenvector, b_tensor);
-        ggml_set_name(distance, "distance");
-        distance = ggml_sqrt_inplace(ctx0,
-            ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, distance)));
-    }
+        ggml_set_name(b_tensor, ("b_tensor_norm_" + std::to_string(i)).c_str());
 
-    // build operations nodes
-    ggml_build_forward_expand(gf, distance);
+        // calculate distance(new eigenvector - old eigenvector)
+        struct ggml_tensor * new_sub_old = ggml_sub(ctx0, old_eigen, b_tensor);
+        distance = ggml_sqrt_inplace(ctx0,
+            ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
+        ggml_set_name(distance, ("distance_" + std::to_string(i)).c_str());
+
+        old_eigen = b_tensor;
+
+        // build operations nodes
+        ggml_build_forward_expand(gf, distance);
+    }
 
     // delete the temporally context used to build the graph
     ggml_free(ctx0);
     return gf;
 }
 
-struct ggml_tensor * compute_piter(
+static ggml_status compute_piter(
+        const struct pca_params & params,
         const pca_model & model,
         struct ggml_cgraph * gf,
         ggml_gallocr_t allocr,
-        int n_threads) {
+        struct pca_result & result) {
     // allocate tensors
     ggml_gallocr_alloc_graph(allocr, gf);
 
     if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
     }
 
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
+        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
     }
 #endif
 
-    ggml_backend_graph_compute(model.backend, gf);
-
-    // in this case, the output tensor is the last one in the graph
-    return gf->nodes[gf->n_nodes - 1];
+    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
+    if (res == GGML_STATUS_SUCCESS) {
+        auto extract_i = [](std::string prefix, std::string str) -> int {
+            int i = -1;
+            if (str.rfind(prefix, 0) == 0) {
+                sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
+            }
+            return i;
+        };
+
+        // get output nodes
+        result.eigenvectors.clear();
+        result.distances.clear();
+        result.eigenvectors.resize(params.n_batch);
+        result.distances.resize(params.n_batch);
+        for (int i = 0; i < gf->n_nodes; ++i) {
+            auto node = gf->nodes[i];
+            int iter = -1;
+            // find b_tensor (without copying data from device)
+            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
+                print_debug_tensor(node, false);
+                result.eigenvectors[iter] = node;
+            }
+            // find distances, then copy data from device
+            if ((iter = extract_i("distance_", node->name)) > -1) {
+                float d;
+                ggml_backend_tensor_get(node, &d, 0, sizeof(float));
+                result.distances[iter] = d;
+                std::cout << node->name << " = " << d << "\n";
+            }
+        }
+    }
+    return res;
 }
 
 static void power_iteration(
-        struct ggml_tensor * input,
-        struct ggml_tensor * output,
-        int n_threads,
-        int maxIterations = 1000,
-        float tolerance = 1e-7) {
+        const struct pca_params & params,
+        struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
+        struct ggml_tensor * output) {
     printf("in power iteration\n");
-    int n_embd = input->ne[0]; // shape of input: [n_embd, m]
-
-    pca_model model;
-    load_pca_model(model, input);
+    //int n_embd = input->ne[1];
+    struct pca_model model(input);
 
     ggml_gallocr_t allocr = NULL;
+    struct pca_result result;
+    struct ggml_tensor * last_eigenvector;
 
-    struct ggml_init_params host_params = {
-        /*.mem_size =*/ (n_embd * sizeof(float) + ggml_tensor_overhead()) * 4u,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc =*/ false,
-    };
-    struct ggml_context * host_ctx = ggml_init(host_params);
-
-    struct ggml_tensor * host_old_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, n_embd);
-    struct ggml_tensor * host_new_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, n_embd);
-
-    for (int iter = 0; iter < maxIterations; ++iter) {
+    int n_iter = params.n_iterations / params.n_batch; // more batch, fewer iterations
+    for (int iter = 0; iter < n_iter; ++iter) {
+        bool calc_square = (iter == 0); // only need to calculate square for first iteration
         if (allocr) {
             ggml_gallocr_free(allocr);
         }
         allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-        struct ggml_cgraph * gf = build_graph_piter(model, iter == 0);
+        struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
         ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
-        struct ggml_tensor * distance = compute_piter(model, gf, allocr, n_threads);
+        compute_piter(params, model, gf, allocr, result);
 
-        ggml_backend_tensor_get(distance, host_new_eigenvector->data, 0, ggml_nbytes(distance));
-        print_debug_tensor(host_new_eigenvector);
+        for (size_t k = 0; k < result.distances.size(); ++k) {
+            last_eigenvector = result.eigenvectors[k];
+            if (result.distances[k] < params.tolerance) {
+                break; // done
+            }
+        }
 
         break; // FIXME
     }
 
-    ggml_backend_tensor_get(model.eigenvector, output->data, 0, ggml_nbytes(model.eigenvector));
-    print_debug_tensor(output);
+    ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
 
     ggml_gallocr_free(allocr);
-    ggml_free(host_ctx);
-    ggml_free(model.ctx);
-    ggml_backend_buffer_free(model.buffer);
-    ggml_backend_free(model.backend);
-    exit(0);
 }
 
 static void run_pca(
+        const struct pca_params & params,
         const std::vector<struct ggml_tensor *> & v_input,
         const std::vector<struct ggml_tensor *> & v_output) {
     printf("Running PCA...\n");
     int n_embd = v_input[0]->ne[0]; // shape of v_input[0]: [n_embd, m]
-    int n_threads = 8; // TODO: change me
 
     for (size_t il = 0; il < v_input.size(); ++il) {
         print_debug_tensor(v_input[il]);
 
         // prepare output vector
@@ -243,7 +305,7 @@ static void run_pca(
         auto name = std::string("direction.") + std::to_string(il + 1);
         ggml_set_name(ctrl_out, name.c_str());
         // run power_iteration
-        power_iteration(v_input[il], ctrl_out, n_threads);
+        power_iteration(params, v_input[il], ctrl_out);
         printf("Done with layer %d\n", il);
         print_debug_tensor(ctrl_out);
     }
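
For context, the graph built in build_graph_piter is ordinary power iteration: form the square matrix from the input rows once, then repeatedly multiply it into the current eigenvector, normalize, and measure how far the vector moved; n_batch such steps are fused into a single graph and compute_piter reads one distance per step. Below is a minimal, CPU-only sketch of the same procedure in plain C++ (no ggml; function and variable names are illustrative, not part of this commit):

    #include <cmath>
    #include <random>
    #include <vector>

    // Reference power iteration over C = V^T * V, where v is [n_samples][n_embd].
    // Mirrors the batched ggml graph: multiply, normalize, check the distance
    // between consecutive eigenvectors, stop once it drops below tolerance.
    static std::vector<float> power_iteration_ref(
            const std::vector<std::vector<float>> & v,
            int n_batch, int n_iterations, float tolerance) {
        const size_t n_samples = v.size();
        const size_t n_embd    = v[0].size();

        // square matrix C[i][j] = sum_s v[s][i] * v[s][j]  (n_embd x n_embd)
        std::vector<std::vector<float>> c(n_embd, std::vector<float>(n_embd, 0.0f));
        for (size_t s = 0; s < n_samples; ++s) {
            for (size_t i = 0; i < n_embd; ++i) {
                for (size_t j = 0; j < n_embd; ++j) {
                    c[i][j] += v[s][i] * v[s][j];
                }
            }
        }

        // random normalized starting vector
        std::vector<float> eig(n_embd);
        std::default_random_engine gen(1234);
        std::uniform_real_distribution<float> dist(0.0f, 1.0f);
        float norm = 0.0f;
        for (auto & x : eig) { x = dist(gen); norm += x * x; }
        norm = std::sqrt(norm);
        for (auto & x : eig) { x /= norm; }

        for (int iter = 0; iter < n_iterations / n_batch; ++iter) {
            for (int k = 0; k < n_batch; ++k) {
                // b = C * eig, then normalize
                std::vector<float> b(n_embd, 0.0f);
                float b_norm = 0.0f;
                for (size_t i = 0; i < n_embd; ++i) {
                    for (size_t j = 0; j < n_embd; ++j) {
                        b[i] += c[i][j] * eig[j];
                    }
                    b_norm += b[i] * b[i];
                }
                b_norm = std::sqrt(b_norm);

                // distance between old and new eigenvector
                float d = 0.0f;
                for (size_t i = 0; i < n_embd; ++i) {
                    b[i] /= b_norm;
                    d += (eig[i] - b[i]) * (eig[i] - b[i]);
                }
                d = std::sqrt(d);
                eig = b;
                if (d < tolerance) {
                    return eig; // converged
                }
            }
        }
        return eig;
    }

The returned vector is the dominant eigenvector of C, which is what power_iteration writes into each direction.N output tensor.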