ggml : remove ggml_cplan + rework ggml_cgraph

ggml-ci
Georgi Gerganov 2024-09-11 13:05:10 +03:00
parent ee154457dd
commit 119e0bc9ae
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
10 changed files with 248 additions and 175 deletions
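For orientation: the hunks below replace the caller-managed ggml_cplan / ggml_graph_compute_helper pattern with calls made directly on the graph, which now carries its own plan and work buffer. A minimal sketch of the reworked sequence, based only on the call sites visible in this diff (the wrapper name is hypothetical and the exact signatures/return values are assumptions):

    #include "ggml.h"

    // Hypothetical wrapper illustrating the reworked flow seen in the hunks below:
    // the cgraph owns its plan/work data, so no separate ggml_cplan is passed around.
    static void compute_graph(struct ggml_cgraph * gf, int n_threads) {
        ggml_graph_prepare  (gf, n_threads, nullptr); // plan threading/work size (replaces ggml_graph_plan)
        ggml_graph_work_init(gf, nullptr);            // allocate the work buffer (a ggml_context may be passed instead)
        ggml_graph_compute  (gf);                     // run the graph; no ggml_cplan argument anymore
        ggml_graph_work_free(gf);                     // release the work buffer
    }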

View file

@@ -17,17 +17,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 constexpr float rms_norm_eps = 5e-6f;
 #endif
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-    ggml_graph_compute(graph, &plan);
-}
 static struct ggml_tensor * randomize_tensor(
     struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
 ) {
@@ -1514,8 +1503,6 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;
-    std::vector<uint8_t> work_buffer;
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size =*/ compute_size,
@@ -1542,7 +1529,10 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
         ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
         float error_before_opt = ggml_get_f32_1d(e, 0);
@@ -1553,7 +1543,10 @@ int main(int argc, char ** argv) {
         ggml_opt(ctx0, opt_params_lbfgs, e);
         //
         ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
         float error_after_opt = ggml_get_f32_1d(e, 0);
@@ -1607,7 +1600,10 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
         ggml_build_forward_expand(gf, logits);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
         struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);

View file

@@ -20,17 +20,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-    ggml_graph_compute(graph, &plan);
-}
 static float tensor_sum_elements(const ggml_tensor * tensor) {
     double sum = 0;
     if (tensor->type == GGML_TYPE_F32) {
@@ -179,9 +168,8 @@ int main(int argc, char ** argv) {
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
-    std::vector<uint8_t> work_buffer;
-    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
+    ggml_graph_prepare(gf, benchmark_params.n_threads, nullptr);
+    ggml_graph_work_init(gf, nullptr);
     TENSOR_DUMP(ggml_graph_node(gf, 0));
@@ -234,7 +222,7 @@ int main(int argc, char ** argv) {
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
+        ggml_graph_compute(gf31);
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
@@ -267,8 +255,11 @@ int main(int argc, char ** argv) {
         }
         // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
+        ggml_graph_compute(gf32);
     }
+    ggml_graph_work_free(gf);
     printf("\n");
     printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
     printf("=====================================================================================\n");

View file

@@ -183,7 +183,9 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
-    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
+    ggml_graph_prepare(gf, 1, nullptr);
+    ggml_graph_work_init(gf, model.ctx);
+    ggml_graph_compute(gf);
     struct ggml_tensor* result = ggml_graph_node(gf, -1);
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
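The hunk above passes model.ctx to ggml_graph_work_init instead of nullptr, mirroring the removed ggml_graph_compute_with_ctx. A hedged sketch of that variant, assuming (as this call site suggests) that the work buffer is then allocated from the given ggml_context and released together with it, which would be why no ggml_graph_work_free appears here:

    #include "ggml.h"

    // Hypothetical wrapper for the context-backed variant used above (assumption:
    // work data allocated from ctx is freed together with ctx).
    static void compute_graph_in_ctx(struct ggml_context * ctx, struct ggml_cgraph * gf, int n_threads) {
        ggml_graph_prepare  (gf, n_threads, nullptr);
        ggml_graph_work_init(gf, ctx);  // work buffer comes from ctx instead of an internal allocation
        ggml_graph_compute  (gf);
    }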