diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index 785e7e886..5d66089b1 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -1569,7 +1569,7 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab  = model.hparams.n_vocab;
 
-    auto compute_plan_buffer = std::vector<uint8_t>();
+    std::vector<uint8_t> work_buffer;
 
     for (int ex=0; ex<n_examples; ++ex) {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
             }
 
-            ggml_graph_compute(&plan, &gf);
+            ggml_graph_compute(&gf, &pf);
         }
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
@@ -1622,12 +1622,12 @@ int main(int argc, char ** argv) {
         ggml_build_forward_expand(&gf, e);
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
             }
 
-            ggml_graph_compute(&plan, &gf);
+            ggml_graph_compute(&gf, &pf);
         }
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
@@ -1683,12 +1683,12 @@ int main(int argc, char ** argv) {
         ggml_build_forward_expand(&gf, logits);
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
             }
 
-            ggml_graph_compute(&plan, &gf);
+            ggml_graph_compute(&gf, &pf);
         }
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index e7d75c9ae..840f4fe52 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -164,15 +164,15 @@ int main(int argc, char ** argv) {
    TENSOR_DUMP(m11);
    TENSOR_DUMP(m2);
 
-    auto compute_plan_buffer = std::vector<uint8_t>();
+    std::vector<uint8_t> work_buffer;
 
    {
-        auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
-        if (plan.work_size > 0) {
-            compute_plan_buffer.resize(plan.work_size);
-            plan.work_data = compute_plan_buffer.data();
+        ggml_cplan pf = ggml_graph_plan(&gf, benchmark_params.n_threads);
+        if (pf.work_size > 0) {
+            work_buffer.resize(pf.work_size);
+            pf.work_data = work_buffer.data();
        }
 
-        ggml_graph_compute(&plan, &gf);
+        ggml_graph_compute(&gf, &pf);
    }
 
    TENSOR_DUMP(gf.nodes[0]);
@@ -228,12 +228,12 @@ int main(int argc, char ** argv) {
        long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
        {
-            auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf31 = ggml_graph_plan(&gf31, benchmark_params.n_threads);
+            if (pf31.work_size > 0) {
+                work_buffer.resize(pf31.work_size);
+                pf31.work_data = work_buffer.data();
            }
 
-            ggml_graph_compute(&plan, &gf31);
+            ggml_graph_compute(&gf31, &pf31);
        }
 
        long long int stop = ggml_time_us();
@@ -268,12 +268,12 @@ int main(int argc, char ** argv) {
        // Running a different graph computation to make sure we override the CPU cache lines
        {
-            auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf32 = ggml_graph_plan(&gf32, benchmark_params.n_threads);
+            if (pf32.work_size > 0) {
+                work_buffer.resize(pf32.work_size);
+                pf32.work_data = work_buffer.data();
            }
 
-            ggml_graph_compute(&plan, &gf32);
+            ggml_graph_compute(&gf32, &pf32);
        }
    }
    printf("\n");
diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp
index cdfe4bfe9..7438defde 100644
--- a/examples/metal/metal.cpp
+++ b/examples/metal/metal.cpp
@@ -35,10 +35,9 @@ int main(int argc, char ** argv) {
    struct ggml_context * ctx_eval = NULL;
 
    struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
-    gf.n_threads = 1;
 
    // this allocates all Metal resources and memory buffers
-    auto * ctx_metal = ggml_metal_init();
+    auto * ctx_metal = ggml_metal_init(1);
 
    const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
    const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 0345b8dc0..11ffbe2e1 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3160,6 +3160,7 @@ int main(int argc, char ** argv) {
    printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
    // ggml_print_tensor_objects(model.ctx);
 
+    // TODO: use std::vector<uint8_t> intead of "new"
    size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
    uint8_t * compute_addr = new uint8_t[compute_size];
 
@@ -3181,7 +3182,7 @@ int main(int argc, char ** argv) {
        GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
    }
 
-    auto compute_plan_buffer = std::vector<uint8_t>();
+    std::vector<uint8_t> work_buffer;
 
    printf("%s: begin training\n", __func__);
 
@@ -3246,12 +3247,12 @@ int main(int argc, char ** argv) {
        }
 
        {
-            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf = ggml_graph_plan(gf, params.n_threads);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
            }
 
-            ggml_graph_compute(&plan, gf);
+            ggml_graph_compute(gf, &pf);
        }
 
        size_t used_mem_before_opt = ggml_used_mem(ctx0);
@@ -3277,12 +3278,12 @@ int main(int argc, char ** argv) {
        model.train_tokens += n_batch * n_tokens;
 
        {
-            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf = ggml_graph_plan(gf, params.n_threads);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
            }
 
-            ggml_graph_compute(&plan, gf);
+            ggml_graph_compute(gf, &pf);
        }
 
        float error_after_opt = ggml_get_f32_1d(loss, 0);
@@ -3372,12 +3373,12 @@ int main(int argc, char ** argv) {
        ggml_build_forward_expand(&gf, logits);
 
        {
-            auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf = ggml_graph_plan(&gf, params.n_threads);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
            }
 
-            ggml_graph_compute(&plan, &gf);
+            ggml_graph_compute(&gf, &pf);
        }
 
        //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
@@ -3404,6 +3405,7 @@ int main(int argc, char ** argv) {
    delete[] compute_addr;
    delete[] compute_buf_0;
    delete[] compute_buf_1;
+    llama_free(lctx);
    llama_free_model(lmodel);
    ggml_free(model.ctx);
 
diff --git a/ggml-metal.h b/ggml-metal.h
index b9e50ac74..928f1705c 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -34,9 +34,13 @@ extern "C" {
 
struct ggml_metal_context;
 
-struct ggml_metal_context * ggml_metal_init(void);
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);
 
+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+
// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
diff --git a/ggml-metal.m b/ggml-metal.m
index fd69c41fe..3f15f791f 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -25,6 +25,8 @@ struct ggml_metal_buffer {
};
 
struct ggml_metal_context {
+    int n_cb;
+
    float * logits;
 
    id<MTLDevice> device;
@@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal";
@implementation GGMLMetalClass
@end
 
-struct ggml_metal_context * ggml_metal_init(void) {
+struct ggml_metal_context * ggml_metal_init(int n_cb) {
    fprintf(stderr, "%s: allocating\n", __func__);
 
    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
 
+    ctx->n_cb = n_cb;
    ctx->device = MTLCreateSystemDefaultDevice();
    ctx->queue = [ctx->device newCommandQueue];
    ctx->n_buffers = 0;
@@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    free(ctx);
}
 
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
+    ctx->n_cb = n_cb;
+}
+
// finds the Metal buffer that contains the tensor data on the GPU device
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
// Metal buffer based on the host memory pointer
@@ -354,7 +361,7 @@ void ggml_metal_graph_compute(
    // create multiple command buffers and enqueue them
    // then, we encode the graph into the command buffers in parallel
 
-    const int n_cb = gf->n_threads;
+    const int n_cb = ctx->n_cb;
 
    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
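With this change the Metal backend no longer reads `gf.n_threads`; callers choose the number of command buffers explicitly. A rough sketch of the updated host-side call sequence (assuming a graph `gf` whose data buffers have already been mapped to the device as described in the ggml-metal.h comments above; the variable names and the value passed to `ggml_metal_set_n_cb` are illustrative, not taken from the patch):

```cpp
// sketch only: exercise the new n_cb API of the Metal backend (illustrative values)
struct ggml_metal_context * ctx_metal = ggml_metal_init(1);   // start with a single command buffer

// ... map all tensor buffers used by `gf` to the device here ...

ggml_metal_set_n_cb     (ctx_metal, n_cb);   // e.g. the CPU thread count, as llama_eval_internal() now does
ggml_metal_graph_compute(ctx_metal, &gf);    // encode the graph into n_cb command buffers and run it

ggml_metal_free(ctx_metal);
```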
diff --git a/ggml.c b/ggml.c
index 94a710706..23938fc5f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -15942,7 +15942,7 @@ void clear_numa_thread_affinity(void) {}
 
struct ggml_compute_state_shared {
    const struct ggml_cgraph * cgraph;
-    const struct ggml_graph_compute_plan * plan;
+    const struct ggml_cplan * cplan;
 
    int64_t perf_node_start_cycles;
    int64_t perf_node_start_time_us;
@@ -15971,12 +15971,13 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
static thread_ret_t ggml_graph_compute_thread(void * data) {
    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
+    const struct ggml_cgraph * cgraph = state->shared->cgraph;
+    const struct ggml_cplan  * cplan  = state->shared->cplan;
 
-    const struct ggml_graph_compute_plan * plan = state->shared->plan;
-    const int * n_tasks_arr = plan->n_tasks;
+    const int * n_tasks_arr = cplan->n_tasks;
+    const int   n_threads   = state->shared->n_threads;
 
-    const int n_threads = state->shared->n_threads;
    set_numa_thread_affinity(state->ith, n_threads);
 
    int node_n = -1;
@@ -15989,8 +15990,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                /*.type =*/ GGML_TASK_FINALIZE,
                /*.ith =*/ 0,
                /*.nth =*/ 0,
-                /*.wsize =*/ plan->work_size,
-                /*.wdata =*/ plan->work_data,
+                /*.wsize =*/ cplan->work_size,
+                /*.wdata =*/ cplan->work_data,
            };
 
            if (node_n != -1) {
@@ -16059,8 +16060,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            /*.type =*/ GGML_TASK_COMPUTE,
            /*.ith =*/ state->ith,
            /*.nth =*/ n_tasks,
-            /*.wsize =*/ plan->work_size,
-            /*.wdata =*/ plan->work_data,
+            /*.wsize =*/ cplan->work_size,
+            /*.wdata =*/ cplan->work_data,
        };
 
        if (state->ith < n_tasks) {
@@ -16072,14 +16073,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
 
// Prepare for graph computing.
-struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, int n_threads) {
+struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
    if (n_threads <= 0) {
        n_threads = GGML_DEFAULT_N_THREADS;
    }
 
-    struct ggml_graph_compute_plan plan;
-    memset(&plan, 0, sizeof(struct ggml_graph_compute_plan));
-    int * n_tasks = plan.n_tasks;
+    struct ggml_cplan cplan;
+    memset(&cplan, 0, sizeof(struct ggml_cplan));
+
+    int * n_tasks = cplan.n_tasks;
+    size_t work_size = 0;
 
    // initialize tasks + work buffer
@@ -16403,34 +16406,34 @@ struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph *
        work_size += CACHE_LINE_SIZE*(n_threads - 1);
    }
 
-    plan.n_threads = n_threads;
-    plan.work_size = work_size;
-    plan.work_data = NULL;
+    cplan.n_threads = n_threads;
+    cplan.work_size = work_size;
+    cplan.work_data = NULL;
 
-    return plan;
+    return cplan;
}
 
-void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph) {
+void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
    {
-        GGML_ASSERT(plan);
-        GGML_ASSERT(plan->n_threads > 0);
+        GGML_ASSERT(cplan);
+        GGML_ASSERT(cplan->n_threads > 0);
 
-        if (plan->work_size > 0) {
-            GGML_ASSERT(plan->work_data);
+        if (cplan->work_size > 0) {
+            GGML_ASSERT(cplan->work_data);
        }
 
        for (int i = 0; i < cgraph->n_nodes; ++i) {
            if (cgraph->nodes[i]->op != GGML_OP_NONE) {
-                GGML_ASSERT(plan->n_tasks[i] > 0);
+                GGML_ASSERT(cplan->n_tasks[i] > 0);
            }
        }
    }
 
-    const int n_threads = plan->n_threads;
+    const int n_threads = cplan->n_threads;
 
    struct ggml_compute_state_shared state_shared = {
        /*.cgraph =*/ cgraph,
-        /*.cgraph_plan =*/ plan,
+        /*.cgraph_plan =*/ cplan,
        /*.perf_node_start_cycles =*/ 0,
        /*.perf_node_start_time_us =*/ 0,
        /*.n_threads =*/ n_threads,
@@ -16491,17 +16494,19 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap
}
 
// TODO: avoid allocating memory frequently.
-static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) {
-    struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads);
-    if (plan.work_size > 0) {
-        plan.work_data = malloc(plan.work_size);
-        GGML_ASSERT(plan.work_data);
+// TODO: make part of public API - use different name and put warning that it makes allocations
+static void ggml_graph_compute_helper(struct ggml_cgraph * cgraph, int n_threads) {
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
+
+    if (cplan.work_size > 0) {
+        cplan.work_data = malloc(cplan.work_size);
+        GGML_ASSERT(cplan.work_data);
    }
 
-    ggml_graph_compute(&plan, cgraph);
+    ggml_graph_compute(cgraph, &cplan);
 
-    if (plan.work_data) {
-        free(plan.work_data);
+    if (cplan.work_data) {
+        free(cplan.work_data);
    }
}
 
@@ -17341,7 +17346,7 @@ static enum ggml_opt_result ggml_opt_adam(
        ggml_graph_reset (gf);
        ggml_set_f32     (f->grad, 1.0f);
 
-        ggml_graph_compute_sugar(gb, params.n_threads);
+        ggml_graph_compute_helper(gb, params.n_threads);
 
        opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
        opt->adam.fx_best = opt->adam.fx_prev;
@@ -17422,7 +17427,7 @@ static enum ggml_opt_result ggml_opt_adam(
        ggml_graph_reset (gf);
        ggml_set_f32     (f->grad, 1.0f);
 
-        ggml_graph_compute_sugar(gb, params.n_threads);
+        ggml_graph_compute_helper(gb, params.n_threads);
 
        const float fx = ggml_get_f32_1d(f, 0);
 
@@ -17544,7 +17549,7 @@ static enum ggml_opt_result linesearch_backtracking(
        ggml_graph_reset (gf);
        ggml_set_f32     (f->grad, 1.0f);
 
-        ggml_graph_compute_sugar(gb, params->n_threads);
+        ggml_graph_compute_helper(gb, params->n_threads);
 
        ggml_opt_get_grad(np, ps, g);
 
@@ -17664,7 +17669,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        ggml_graph_reset (gf);
        ggml_set_f32     (f->grad, 1.0f);
 
-        ggml_graph_compute_sugar(gb, params.n_threads);
+        ggml_graph_compute_helper(gb, params.n_threads);
 
        ggml_opt_get_grad(np, ps, g);
 
diff --git a/ggml.h b/ggml.h
index 1b50ab866..901c701ea 100644
--- a/ggml.h
+++ b/ggml.h
@@ -443,17 +443,15 @@ extern "C" {
 
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
-    // The default graph compute plan that needs to be prepared for ggml_graph_compute().
-    // Since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_graph_compute_plan {
-        // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
-        size_t work_size;
-        // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`.
-        uint8_t * work_data;
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
 
        int n_threads;
 
-        // The `n_tasks` of nodes, 1:1 mapping to cgraph nodes.
+        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
        int n_tasks[GGML_MAX_NODES];
    };
 
@@ -1313,11 +1311,11 @@ extern "C" {
    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
-    // ggml_graph_compute_make_plan() needs to be called before ggml_graph_compute().
-    // Returns a plan object. When plan.work_size > 0, caller must allocate memory for plan.work_data.
-    GGML_API struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, const int n_threads/*=GGML_DEFAULT_N_THREADS*/);
-    GGML_API void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph);
-    GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
 
    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
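For callers migrating from `ggml_graph_compute_make_plan()`, the renamed API above is used in two steps: `ggml_graph_plan()` reports `work_size`, the caller allocates `work_data`, and only then calls `ggml_graph_compute()`. A minimal self-contained sketch of that pattern (the toy graph, buffer type, and thread count are illustrative, not taken from the patch):

```cpp
#include "ggml.h"

#include <cstdio>
#include <vector>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx = ggml_init(params);

    // f = a*b, element-wise
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * f = ggml_mul(ctx, a, b);

    ggml_set_f32(a, 2.0f);
    ggml_set_f32(b, 3.0f);

    struct ggml_cgraph gf = ggml_build_forward(f);

    // 1) plan: reports how much scratch memory the graph needs
    struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 4);

    // 2) the caller owns the work buffer (same role as the work_buffer members above)
    std::vector<uint8_t> work_buffer;
    if (pf.work_size > 0) {
        work_buffer.resize(pf.work_size);
        pf.work_data = work_buffer.data();
    }

    // 3) compute with the caller-provided plan
    ggml_graph_compute(&gf, &pf);

    printf("f[0] = %f\n", (double) ggml_get_f32_1d(f, 0));

    ggml_free(ctx);

    return 0;
}
```

The static helper added to ggml.c performs the same three steps with `malloc`/`free`, which is why its TODO comment warns that it allocates on every call.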
diff --git a/llama.cpp b/llama.cpp
index c29d46d8d..e68beb7c5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -321,9 +321,8 @@ struct llama_context {
    // input embedding (1-dimensional array: [n_embd])
    std::vector<float> embedding;
 
-    // reusable buffer for `struct ggml_graph_compute_plan.work_data`
-    // std::vector guarantees the elements are stored contiguously.
-    std::vector<uint8_t> compute_plan_buffer;
+    // reusable buffer for `struct ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
 
    // memory buffers used to evaluate the model
    // TODO: move in llama_state
@@ -1599,6 +1598,7 @@ static bool llama_eval_internal(
 
#ifdef GGML_USE_METAL
    if (lctx.ctx_metal && N == 1) {
+        ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
        call_ggml_graph_compute = false;
@@ -1622,12 +1622,12 @@ static bool llama_eval_internal(
#endif
 
    if (call_ggml_graph_compute) {
-        auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
-        if (plan.work_size > 0) {
-            lctx.compute_plan_buffer.resize(plan.work_size);
-            plan.work_data = lctx.compute_plan_buffer.data();
+        ggml_cplan pf = ggml_graph_plan(&gf, actual_n_threads);
+        if (pf.work_size > 0) {
+            lctx.work_buffer.resize(pf.work_size);
+            pf.work_data = lctx.work_buffer.data();
        }
 
-        ggml_graph_compute(&plan, &gf);
+        ggml_graph_compute(&gf, &pf);
    }
 
    if (cgraph_fname) {
@@ -2587,8 +2587,8 @@ void llama_free_model(struct llama_model * model) {
}
 
struct llama_context * llama_new_context_with_model(
-                        struct llama_model * model,
-                        struct llama_context_params params) {
+        struct llama_model * model,
+        struct llama_context_params params) {
 
    if (!model) {
        return nullptr;
@@ -2657,7 +2657,7 @@ struct llama_context * llama_new_context_with_model(
#ifdef GGML_USE_METAL
    if (params.n_gpu_layers > 0) {
        // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init();
+        ctx->ctx_metal = ggml_metal_init(1);
 
        void * data_ptr = NULL;
        size_t data_size = 0;
@@ -2815,7 +2815,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
    bool warned = false;
    int n_tensors = 0;
 
-    auto compute_plan_buffer = std::vector<uint8_t>();
+    std::vector<uint8_t> work_buffer;
 
    while (true) {
        int32_t n_dims;
@@ -2983,12 +2983,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
            struct ggml_cgraph gf = ggml_build_forward(r);
 
            {
-                auto plan = ggml_graph_compute_make_plan(&gf, n_threads);
-                if (plan.work_size > 0) {
-                    compute_plan_buffer.resize(plan.work_size);
-                    plan.work_data = compute_plan_buffer.data();
+                ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+                if (pf.work_size > 0) {
+                    work_buffer.resize(pf.work_size);
+                    pf.work_data = work_buffer.data();
                }
 
-                ggml_graph_compute(&plan, &gf);
+                ggml_graph_compute(&gf, &pf);
            }
 
            // we won't need these tensors again, reset the context to save memory
@@ -3163,12 +3163,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
 
            {
-                auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-                if (plan.work_size > 0) {
-                    ctx->compute_plan_buffer.resize(plan.work_size);
-                    plan.work_data = ctx->compute_plan_buffer.data();
+                ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+                if (pf.work_size > 0) {
+                    ctx->work_buffer.resize(pf.work_size);
+                    pf.work_data = ctx->work_buffer.data();
                }
 
-                ggml_graph_compute(&plan, &gf);
+                ggml_graph_compute(&gf, &pf);
            }
 
            ggml_free(cpy_ctx);
@@ -3276,12 +3276,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
 
            {
-                auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-                if (plan.work_size > 0) {
-                    ctx->compute_plan_buffer.resize(plan.work_size);
-                    plan.work_data = ctx->compute_plan_buffer.data();
+                ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+                if (pf.work_size > 0) {
+                    ctx->work_buffer.resize(pf.work_size);
+                    pf.work_data = ctx->work_buffer.data();
                }
 
-                ggml_graph_compute(&plan, &gf);
+                ggml_graph_compute(&gf, &pf);
            }
 
            ggml_free(cpy_ctx);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 4171c126c..dd989c5c0 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp)
llama_add_test(test-quantize-perf.cpp)
llama_add_test(test-sampling.cpp)
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
-# llama_add_test(test-grad0.c) # SLOW
-# llama_add_test(test-opt.c) # SLOW
+llama_add_test(test-grad0.c) # SLOW
+llama_add_test(test-opt.c) # SLOW
diff --git a/tests/test-grad0.c b/tests/test-grad0.c
index 548547727..9c27e603e 100644
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@@ -10,6 +10,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
 
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+
#define MAX_NARGS 3
 
#undef MIN
@@ -49,7 +51,7 @@ float frand(void) {
 
int irand(int n) {
    if (n == 0) return 0;
-    else return rand()%n;
+    return rand()%n;
}
 
void get_random_dims(int64_t * dims, int ndims) {
@@ -159,12 +161,14 @@ struct ggml_tensor * get_random_tensor_int(
 
float get_element(const struct ggml_tensor * t, int idx) {
    if (t->type == GGML_TYPE_F32) {
        return ((float *)t->data)[idx];
-    } else if (t->type == GGML_TYPE_I32) {
-        return ((int32_t *)t->data)[idx];
-    } else {
-        assert(false);
-        return INFINITY;
    }
+
+    if (t->type == GGML_TYPE_I32) {
+        return ((int32_t *)t->data)[idx];
+    }
+
+    assert(false);
+    return INFINITY;
}
 
void set_element(struct ggml_tensor * t, int idx, float value) {
@@ -191,12 +195,12 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
 
}
 
-struct compute_plan_buffer {
+struct work_buffer {
    size_t size;
    uint8_t * data;
};
 
-static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) {
+static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) {
    if (size == 0) {
        return NULL;
    }
@@ -241,20 +245,19 @@ bool check_gradient(
    }
 
    struct ggml_cgraph gf = ggml_build_forward (f);
-
    struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
 
-    struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL };
+    struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL };
 
    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-        if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+        struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+        if (pf.work_size > 0) {
+            pf.work_data = malloc(pf.work_size);
+            GGML_ASSERT(pf.work_data);
        }
 
-        ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
+        ggml_graph_compute(&gf, &pf);
+        if (pf.work_data) {
+            free(pf.work_data);
        }
    }
 
@@ -262,9 +265,9 @@ bool check_gradient(
    ggml_set_f32 (f->grad, 1.0f);
 
    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
-        plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-        ggml_graph_compute(&plan, &gb);
+        struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads);
+        pf.work_data = work_buffer_resize(&buf, pf.work_size);
+        ggml_graph_compute(&gf, &pf);
    }
 
    // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
@@ -280,9 +283,9 @@ bool check_gradient(
            set_element(x[i], k, xp);
 
            {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-                plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-                ggml_graph_compute(&plan, &gf);
+                struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+                pf.work_data = work_buffer_resize(&buf, pf.work_size);
+                ggml_graph_compute(&gf, &pf);
            }
 
            const float f0 = ggml_get_f32_1d(f, 0);
@@ -290,9 +293,9 @@ bool check_gradient(
            set_element(x[i], k, xm);
 
            {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-                plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-                ggml_graph_compute(&plan, &gf);
+                struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+                pf.work_data = work_buffer_resize(&buf, pf.work_size);
+                ggml_graph_compute(&gf, &pf);
            }
 
            const float f1 = ggml_get_f32_1d(f, 0);
@@ -306,15 +309,15 @@ bool check_gradient(
            ggml_set_f32 (f->grad, 1.0f);
 
            {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
-                plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-                ggml_graph_compute(&plan, &gb);
+                struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads);
+                pf.work_data = work_buffer_resize(&buf, pf.work_size);
+                ggml_graph_compute(&gf, &pf);
            }
 
            const float g1 = get_element(x[i]->grad, k);
 
            const float error_abs = fabsf(g0 - g1);
-            const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0;
+            const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
 
            if (error_abs > max_error_abs || error_rel > max_error_rel) {
                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
@@ -325,8 +328,8 @@ bool check_gradient(
        }
    }
 
-    if (plan_buf.data) {
-        free(plan_buf.data);
+    if (buf.data) {
+        free(buf.data);
    }
 
    return true;
diff --git a/tests/test-opt.c b/tests/test-opt.c
index 35d070dc7..3ed246b3b 100644
--- a/tests/test-opt.c
+++ b/tests/test-opt.c
@@ -7,6 +7,7 @@
 
#define MAX_NARGS 2
 
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
 
//
// logging
@@ -33,7 +34,7 @@
#define GGML_PRINT(...) printf(__VA_ARGS__)
 
-float frand() {
+float frand(void) {
    return (float)rand()/(float)RAND_MAX;
}
 
@@ -115,12 +116,12 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
}
 
-struct compute_plan_buffer {
+struct work_buffer {
    size_t size;
    uint8_t * data;
};
 
-static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) {
+static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) {
    if (size == 0) {
        return NULL;
    }
@@ -139,7 +140,7 @@ static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t s
    return buf->data;
}
 
-int main(int argc, const char ** argv) {
+int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 1024*1024*1024,
        .mem_buffer = NULL,
    };
@@ -166,11 +167,11 @@ int main(int argc, const char ** argv) {
    struct ggml_cgraph ge = ggml_build_forward(e);
    ggml_graph_reset (&ge);
 
-    struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL };
+    struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL };
    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
-        plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-        ggml_graph_compute(&plan, &ge);
+        struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1);
+        pe.work_data = work_buffer_resize(&buf, pe.work_size);
+        ggml_graph_compute(&ge, &pe);
    }
 
    const float fe = ggml_get_f32_1d(e, 0);
@@ -183,13 +184,13 @@ int main(int argc, const char ** argv) {
    ggml_graph_reset (&ge);
 
    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
-        plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-        ggml_graph_compute(&plan, &ge);
+        struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1);
+        pe.work_data = work_buffer_resize(&buf, pe.work_size);
+        ggml_graph_compute(&ge, &pe);
    }
 
-    if (plan_buf.data) {
-        free(plan_buf.data);
+    if (buf.data) {
+        free(buf.data);
    }
 
    const float fe_opt = ggml_get_f32_1d(e, 0);