diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 3ce91070b..d786dc94d 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -17,17 +17,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; constexpr float rms_norm_eps = 5e-6f; #endif -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - static struct ggml_tensor * randomize_tensor( struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax ) { @@ -1514,8 +1503,6 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; - std::vector work_buffer; - for (int ex=0; ex & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - static float tensor_sum_elements(const ggml_tensor * tensor) { double sum = 0; if (tensor->type == GGML_TYPE_F32) { @@ -179,11 +168,10 @@ int main(int argc, char ** argv) { TENSOR_DUMP(m11); TENSOR_DUMP(m2); - std::vector work_buffer; + ggml_graph_prepare(gf, benchmark_params.n_threads, nullptr); + ggml_graph_work_init(gf, nullptr); - ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); - - TENSOR_DUMP(gf->nodes[0]); + TENSOR_DUMP(ggml_graph_node(gf, 0)); printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); @@ -224,7 +212,7 @@ int main(int argc, char ** argv) { // Let's use the F32 result from above as a reference for the quantized multiplication - float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]); + float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0)); printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); printf("=====================================================================================\n"); @@ -234,7 +222,7 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); - ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads); + ggml_graph_compute(gf31); long long int stop = ggml_time_us(); long long int usec = stop-start; @@ -252,7 +240,7 @@ int main(int argc, char ** argv) { // Check that the matrix multiplication result is in the right ballpark // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different - float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]); + float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0)); float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6 @@ -267,8 +255,11 @@ int main(int argc, char ** argv) { } // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads); + ggml_graph_compute(gf32); } + + ggml_graph_work_free(gf); + printf("\n"); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); printf("=====================================================================================\n"); diff 
--git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index 05c66856c..a969c486d 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -226,8 +226,8 @@ static ggml_status compute_piter( result.eigenvectors.resize(params.n_batch); result.distances.resize(params.n_batch); // get output nodes - for (int i = 0; i < gf->n_nodes; ++i) { - auto node = gf->nodes[i]; + for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { + auto node = ggml_graph_node(gf, i); int iter = -1; // find b_tensor (without copying data from device) if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index ff324926a..90126ad1e 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -370,7 +370,7 @@ struct lora_merge_ctx { // write data to output file { - auto result = gf->nodes[gf->n_nodes - 1]; + auto * result = ggml_graph_node(gf, -1); size_t len = ggml_nbytes(result); if (read_buf.size() < len) { read_buf.resize(len); diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9b890571e..5dfb333d1 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_graph_compute(ctx->backend, gf); // the last node is the embedding tensor - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 851af0f00..46413c901 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -183,8 +183,10 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0); // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); ggml_build_forward_expand(gf, flatten); - ggml_graph_compute_with_ctx(model.ctx, gf, 1); - struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; + ggml_graph_prepare(gf, 1, nullptr); + ggml_graph_work_init(gf, model.ctx); + ggml_graph_compute(gf); + struct ggml_tensor* result = ggml_graph_node(gf, -1); memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 536018b66..684578f98 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -358,6 +358,7 @@ extern "C" { struct ggml_object; struct ggml_context; + struct ggml_cgraph; // NOTE: always add types at the end of the enum to keep backward compatibility enum ggml_type { @@ -575,23 +576,9 @@ extern "C" { GGML_TENSOR_FLAG_PARAM = 4, }; - // ggml object - struct ggml_object { - size_t offs; - size_t size; - - struct ggml_object * next; - - enum ggml_object_type type; - - char padding[4]; - }; - - static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - // n-dimensional tensor struct ggml_tensor { - enum ggml_type type; + enum ggml_type type; GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage 
location of the tensor"); @@ -655,50 +642,7 @@ extern "C" { struct ggml_threadpool; // forward declaration, see ggml.c - typedef struct ggml_threadpool * ggml_threadpool_t; - - // the compute plan that needs to be prepared for ggml_graph_compute() - // since https://github.com/ggerganov/ggml/issues/287 - struct ggml_cplan { - size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` - uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` - - int n_threads; - struct ggml_threadpool * threadpool; - - // abort ggml_graph_compute when true - ggml_abort_callback abort_callback; - void * abort_callback_data; - }; - - enum ggml_cgraph_eval_order { - GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, - GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, - GGML_CGRAPH_EVAL_ORDER_COUNT - }; - - typedef uint32_t ggml_bitset_t; - - struct ggml_hash_set { - size_t size; - ggml_bitset_t * used; // whether or not the keys are in use i.e. set - struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) - }; - - // computation graph - struct ggml_cgraph { - int size; - int n_nodes; - int n_leafs; - - struct ggml_tensor ** nodes; - struct ggml_tensor ** grads; - struct ggml_tensor ** leafs; - - struct ggml_hash_set visited_hash_set; - - enum ggml_cgraph_eval_order order; - }; + typedef struct ggml_threadpool * ggml_threadpool_t; // scratch buffer struct ggml_scratch { @@ -2017,8 +1961,6 @@ extern "C" { typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); - #define GGML_N_TASKS_MAX -1 - GGML_API struct ggml_tensor * ggml_map_custom1( struct ggml_context * ctx, struct ggml_tensor * a, @@ -2088,42 +2030,92 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * tensor); - GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); - // graph allocation in a context - GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false - GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads); - GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); - GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1); - GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads - GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); + GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false + GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); + GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); + GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads + GGML_API void ggml_graph_clear (struct ggml_cgraph * 
cgraph);
+
+    GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+    GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
+    GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
+
+    GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

-    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
-    GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
-    GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
-    GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
-    GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
-    GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
+    // TODO: move these declarations above before the ggml_graph API and reorder the implementation order in ggml.c
+    // (unless the code has been moved to a separate source file)
+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+    GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
+    GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
+    GGML_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+    GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
+    GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);

-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(
-            const struct ggml_cgraph * cgraph,
-            int n_threads, /* = GGML_DEFAULT_N_THREADS */
-            struct ggml_threadpool * threadpool /* = NULL */ );
-    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    // =================================================================================================
+    // CPU-only API for ggml_cgraph
+    //
+    // TODO: move to the CPU backend
+    // NOTE: avoid using, will be removed
+    //

-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    // loops through the graph and determines:
+    //
+    // - work size needed for CPU computation
+    // - number of threads to start
+    //
+    GGML_API enum ggml_status ggml_graph_prepare(
+            struct ggml_cgraph * cgraph,
+            int n_threads, /* = GGML_DEFAULT_N_THREADS */
+            struct ggml_threadpool * threadpool /* = NULL */ );
+
+    // get the estimated work size for the graph from ggml_graph_prepare()
+    GGML_API size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph);
+
+    // if ctx is NULL, the work buffer will be dynamically allocated. in this case, call ggml_graph_work_free() to free the buffer
+    // otherwise, the work buffer will be allocated in the context. no need to free it
+    GGML_API enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx);
+    GGML_API void ggml_graph_work_free(struct ggml_cgraph * cgraph);
+
+    // note: call ggml_graph_prepare() and ggml_graph_work_init() first
+    //
+    // sample usages:
+    //
+    //  - no dynamic allocations:
+    //
+    //      ... prepare ggml_context ctx ...
+    //
+    //      ggml_graph_prepare (cgraph, n_threads, threadpool);
+    //      ggml_graph_work_init(cgraph, ctx);
+    //
+    //      ggml_graph_compute (cgraph); // can call many times
+    //
+    //      // no need to call ggml_graph_work_free() because it is allocated in ctx
+    //
+    //  - dynamic allocations:
+    //
+    //      ggml_graph_prepare (cgraph, n_threads, threadpool);
+    //      ggml_graph_work_init(cgraph, NULL); // will allocate memory
+    //
+    //      ggml_graph_compute (cgraph); // can call many times
+    //
+    //      ggml_graph_work_free(cgraph);
+    //
+    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph);
+
+    // end of CPU-only API
+    // =================================================================================================
+
+    GGML_API void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data);

     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

@@ -2146,6 +2138,7 @@ extern "C" {
             struct ggml_cgraph * gb_tmp,
             struct ggml_tensor * * checkpoints,
             int n_checkpoints);
+
     //
     // optimization
     //

diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index b5d9301a7..4787a0bd1 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -752,7 +752,8 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_
 }

 struct ggml_backend_plan_cpu {
-    struct ggml_cplan cplan;
+    // TODO: move member from ggml_cgraph here when the public CPU-only API is removed
+    struct ggml_cgraph cgraph;
 };

@@ -761,19 +762,19 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+    ggml_graph_prepare(&cpu_plan->cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);

-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
-        if (cpu_plan->cplan.work_data == NULL) {
+    if (cpu_plan->cgraph.work_size > 0) {
+        cpu_plan->cgraph.work_data = malloc(cpu_plan->cgraph.work_size);
+        if (cpu_plan->cgraph.work_data == NULL) {
             free(cpu_plan);
             return NULL;
         }
     }

-    cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
-    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cpu_plan->cgraph.abort_callback = cpu_ctx->abort_callback;
+    cpu_plan->cgraph.abort_callback_data = cpu_ctx->abort_callback_data;

     return cpu_plan;
 }

@@ -781,7 +782,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

-    free(cpu_plan->cplan.work_data);
+
free(cpu_plan->cgraph.work_data); free(cpu_plan); GGML_UNUSED(backend); @@ -790,7 +791,7 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; - return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); + return ggml_graph_compute(&cpu_plan->cgraph); GGML_UNUSED(backend); } @@ -798,23 +799,24 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); + ggml_graph_prepare(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); - if (cpu_ctx->work_size < cplan.work_size) { + if (cpu_ctx->work_size < cgraph->work_size) { free(cpu_ctx->work_data); - cpu_ctx->work_data = malloc(cplan.work_size); + cpu_ctx->work_data = malloc(cgraph->work_size); if (cpu_ctx->work_data == NULL) { cpu_ctx->work_size = 0; return GGML_STATUS_ALLOC_FAILED; } - cpu_ctx->work_size = cplan.work_size; + cpu_ctx->work_size = cgraph->work_size; } - cplan.work_data = cpu_ctx->work_data; + cgraph->work_data = cpu_ctx->work_data; + cgraph->work_own = false; // always freed by ggml_backend_cpu_graph_plan_free - cplan.abort_callback = cpu_ctx->abort_callback; - cplan.abort_callback_data = cpu_ctx->abort_callback_data; + cgraph->abort_callback = cpu_ctx->abort_callback; + cgraph->abort_callback_data = cpu_ctx->abort_callback_data; - return ggml_graph_compute(cgraph, &cplan); + return ggml_graph_compute(cgraph); } GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index 713731735..6d99c6bea 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -1,3 +1,4 @@ +#include "ggml-impl.h" #include "ggml-blas.h" #include "ggml-backend-impl.h" diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index 06930ba2e..530b700e3 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -30,6 +30,7 @@ #include #include +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-cann/aclnn_ops.h" #include "ggml-cann/common.h" diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index d53de4edd..54f1a7c2d 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1,5 +1,5 @@ #include "ggml-cuda.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-cuda/common.cuh" diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 961f3c67b..31a46ea3e 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #endif +enum ggml_cgraph_eval_order { + GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + GGML_CGRAPH_EVAL_ORDER_COUNT +}; + // bitset +typedef uint32_t ggml_bitset_t; + static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated"); #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8) #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) @@ -656,6 +664,12 @@ static inline void 
ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) { #define GGML_HASHSET_FULL ((size_t)-1) #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) +struct ggml_hash_set { + size_t size; + ggml_bitset_t * used; // whether or not the keys are in use i.e. set + struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) +}; + struct ggml_hash_set ggml_hash_set_new(size_t size); void ggml_hash_set_free(struct ggml_hash_set * hash_set); @@ -745,6 +759,37 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g GGML_ABORT("fatal error"); } +// computation graph + +struct ggml_cgraph { + int size; + int n_nodes; + int n_leafs; + + struct ggml_tensor ** nodes; + struct ggml_tensor ** grads; + struct ggml_tensor ** leafs; + + struct ggml_hash_set visited_hash_set; + + enum ggml_cgraph_eval_order order; + + // TODO: after the CPU-only API is removed, we can move the members below to ggml_backend_plan_cpu + + bool work_own; + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + + int n_threads; + struct ggml_threadpool * threadpool; + + // abort ggml_graph_compute when true + ggml_abort_callback abort_callback; + void * abort_callback_data; +}; + +struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index 41ac63fa4..7f0bd82d5 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend.h" #include "ggml-backend-impl.h" #include "ggml-kompute.h" diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 6d8a7c898..6c85acfec 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -1,7 +1,7 @@ #import "ggml-metal.h" +#import "ggml-impl.h" #import "ggml-backend-impl.h" -#import "ggml.h" #import @@ -882,7 +882,7 @@ static enum ggml_status ggml_metal_graph_compute( // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel - const int n_nodes = gf->n_nodes; + const int n_nodes = gf->n_nodes; const int n_cb = ctx->n_cb; const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index 9c600c7ca..a8a2eb85a 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -1,5 +1,5 @@ #include "ggml-rpc.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 4f03b01e7..be56f8180 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -33,7 +33,7 @@ #include #include "ggml-sycl.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-sycl/backend.hpp" diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 83737c1d9..bad960510 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -21,7 +21,7 @@ #include #include -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-vulkan-shaders.hpp" diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d7157ca6d..643821b5b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) 
{ #define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 +#define GGML_N_TASKS_MAX (-1) #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 @@ -1120,21 +1121,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F32x4_ADD vaddq_f32 #define GGML_F32x4_MUL vmulq_f32 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - res = GGML_F32x4_REDUCE_ONE(x[0]); \ +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + (res) = GGML_F32x4_REDUCE_ONE((x)[0]); \ } #define GGML_F32_VEC GGML_F32x4 @@ -1161,30 +1162,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) #define GGML_F16x8_ADD vaddq_f16 #define GGML_F16x8_MUL vmulq_f16 - #define GGML_F16x8_REDUCE(res, x) \ - do { \ - int offset = GGML_F16_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ - const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ - res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + #define GGML_F16x8_REDUCE(res, x) \ + do { \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ + const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ + (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ } while (0) #define GGML_F16_VEC GGML_F16x8 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i]) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i]) #define GGML_F16_VEC_FMA GGML_F16x8_FMA #define GGML_F16_VEC_ADD GGML_F16x8_ADD #define GGML_F16_VEC_MUL GGML_F16x8_MUL @@ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) #endif +// +// ggml object +// + +struct ggml_object { + size_t offs; + size_t size; + + struct ggml_object * next; + + enum ggml_object_type type; + + char padding[4]; +}; + +static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + // // ggml context // @@ -1983,7 +2001,6 @@ struct ggml_threadpool { ggml_cond_t cond; 
// cond.var for waiting for new work struct ggml_cgraph * cgraph; - struct ggml_cplan * cplan; // synchronization primitives atomic_int n_graph; // incremented when there is work to be done (i.e each graph) @@ -19071,14 +19088,21 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz assert(obj_size == (size_t)((char *)p - (char *)cgraph)); *cgraph = (struct ggml_cgraph) { - /*.size =*/ size, - /*.n_nodes =*/ 0, - /*.n_leafs =*/ 0, - /*.nodes =*/ nodes_ptr, - /*.grads =*/ grads_ptr, - /*.leafs =*/ leafs_ptr, - /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, - /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, + /*.size =*/ size, + /*.n_nodes =*/ 0, + /*.n_leafs =*/ 0, + /*.nodes =*/ nodes_ptr, + /*.grads =*/ grads_ptr, + /*.leafs =*/ leafs_ptr, + /*.visited_hash_set =*/ { hash_size, hash_used, hash_keys_ptr }, + /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, + /*.work_own =*/ false, + /*.work_size =*/ 0, + /*.work_data =*/ NULL, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.threadpool =*/ NULL, + /*.abort_callback =*/ NULL, + /*.abort_callback_data =*/ NULL, }; ggml_hash_set_reset(&cgraph->visited_hash_set); @@ -19092,14 +19116,21 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) { struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) { struct ggml_cgraph cgraph = { - /*.size =*/ 0, - /*.n_nodes =*/ i1 - i0, - /*.n_leafs =*/ 0, - /*.nodes =*/ cgraph0->nodes + i0, - /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL, - /*.leafs =*/ NULL, - /*.hash_table =*/ { 0, NULL, NULL }, - /*.order =*/ cgraph0->order, + /*.size =*/ 0, + /*.n_nodes =*/ i1 - i0, + /*.n_leafs =*/ 0, + /*.nodes =*/ cgraph0->nodes + i0, + /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL, + /*.leafs =*/ NULL, + /*.hash_table =*/ { 0, NULL, NULL }, + /*.order =*/ cgraph0->order, + /*.work_own =*/ false, + /*.work_size =*/ 0, + /*.work_data =*/ NULL, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.threadpool =*/ NULL, + /*.abort_callback =*/ NULL, + /*.abort_callback_data =*/ NULL, }; return cgraph; @@ -19161,6 +19192,33 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { ggml_hash_set_reset(&cgraph->visited_hash_set); } +int ggml_graph_size(struct ggml_cgraph * cgraph) { + return cgraph->size; +} + +struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) { + if (i < 0) { + GGML_ASSERT(cgraph->n_nodes + i >= 0); + return cgraph->nodes[cgraph->n_nodes + i]; + } + + GGML_ASSERT(i < cgraph->n_nodes); + return cgraph->nodes[i]; +} + +struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) { + return cgraph->nodes; +} + +int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) { + return cgraph->n_nodes; +} + +void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { + cgraph->nodes[cgraph->n_nodes] = tensor; + cgraph->n_nodes++; +} + // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) static void set_numa_thread_affinity(int thread_n) { @@ -19708,11 +19766,10 @@ void ggml_threadpool_resume(struct ggml_threadpool * threadpool) { #endif } -struct ggml_cplan ggml_graph_plan( - const struct ggml_cgraph * cgraph, - int n_threads, - struct ggml_threadpool * threadpool) { - +enum ggml_status ggml_graph_prepare( + struct ggml_cgraph * cgraph, + int n_threads, + struct ggml_threadpool * threadpool) { if (threadpool == NULL) { GGML_PRINT_DEBUG("Threadpool is not specified. 
Will create a disposable threadpool : n_threads %d\n", n_threads); } @@ -19722,9 +19779,6 @@ struct ggml_cplan ggml_graph_plan( size_t work_size = 0; - struct ggml_cplan cplan; - memset(&cplan, 0, sizeof(struct ggml_cplan)); - int max_tasks = 1; // thread scheduling for the different operations + work buffer size estimation @@ -19876,28 +19930,63 @@ struct ggml_cplan ggml_graph_plan( work_size += CACHE_LINE_SIZE*(n_threads); } - cplan.threadpool = threadpool; - cplan.n_threads = MIN(max_tasks, n_threads); - cplan.work_size = work_size; - cplan.work_data = NULL; + cgraph->threadpool = threadpool; + cgraph->n_threads = MIN(max_tasks, n_threads); + cgraph->work_size = work_size; - return cplan; + ggml_graph_work_free(cgraph); + + return GGML_STATUS_SUCCESS; +} + +size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph) { + return cgraph->work_size; +} + +enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx) { + GGML_ASSERT(cgraph->n_threads > 0 && "call ggml_graph_prepare first"); + + ggml_graph_work_free(cgraph); + + if (cgraph->work_size > 0) { + if (ctx == NULL) { + cgraph->work_data = GGML_ALIGNED_MALLOC(cgraph->work_size); + if (cgraph->work_data == NULL) { + return GGML_STATUS_ALLOC_FAILED; + } + + cgraph->work_own = true; + } else { + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cgraph->work_size); + + cgraph->work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + cgraph->work_own = false; + } + } + + return GGML_STATUS_SUCCESS; +} + +void ggml_graph_work_free(struct ggml_cgraph * cgraph) { + if (cgraph->work_data && cgraph->work_own) { + GGML_ALIGNED_FREE(cgraph->work_data); + cgraph->work_data = NULL; + } } static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; const struct ggml_cgraph * cgraph = state->threadpool->cgraph; - const struct ggml_cplan * cplan = state->threadpool->cplan; set_numa_thread_affinity(state->ith); struct ggml_compute_params params = { - /*.ith =*/ state->ith, - /*.nth =*/ state->threadpool->n_threads_cur, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - /*.threadpool=*/ state->threadpool, + /*.ith =*/ state->ith, + /*.nth =*/ state->threadpool->n_threads_cur, + /*.wsize =*/ cgraph->work_size, + /*.wdata =*/ cgraph->work_data, + /*.threadpool =*/ state->threadpool, }; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { @@ -19905,7 +19994,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); - if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + if (state->ith == 0 && cgraph->abort_callback && cgraph->abort_callback(cgraph->abort_callback_data)) { state->threadpool->ec = GGML_STATUS_ABORTED; } @@ -20059,14 +20148,12 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons static struct ggml_threadpool * ggml_threadpool_new_impl( struct ggml_threadpool_params * tpp, - struct ggml_cgraph * cgraph, - struct ggml_cplan * cplan) { + struct ggml_cgraph * cgraph) { struct ggml_threadpool * threadpool = GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool)); { threadpool->cgraph = cgraph; - threadpool->cplan = cplan; threadpool->n_graph = 0; threadpool->n_barrier = 0; threadpool->n_barrier_passed = 0; @@ -20124,16 +20211,15 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( } struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) { - 
return ggml_threadpool_new_impl(tpp, NULL, NULL); + return ggml_threadpool_new_impl(tpp, NULL); } -enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { - GGML_ASSERT(cplan); - GGML_ASSERT(cplan->n_threads > 0); - GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL); +enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph) { + GGML_ASSERT((cgraph->n_threads > 0 ) && "call ggml_graph_prepare first"); + GGML_ASSERT((cgraph->work_size == 0 || cgraph->work_data != NULL) && "call ggml_graph_work_init first"); - int n_threads = cplan->n_threads; - struct ggml_threadpool * threadpool = cplan->threadpool; + int n_threads = cgraph->n_threads; + struct ggml_threadpool * threadpool = cgraph->threadpool; bool disposable_threadpool = false; @@ -20142,19 +20228,18 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl disposable_threadpool = true; struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads); - threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan); + threadpool = ggml_threadpool_new_impl(&ttp, cgraph); } else { // Reset some of the parameters that need resetting // No worker threads should be accessing the parameters below at this stage - threadpool->cgraph = cgraph; - threadpool->cplan = cplan; - threadpool->n_threads_cur = n_threads; - threadpool->current_chunk = 0; - threadpool->ec = GGML_STATUS_SUCCESS; + threadpool->cgraph = cgraph; + threadpool->n_threads_cur = n_threads; + threadpool->current_chunk = 0; + threadpool->ec = GGML_STATUS_SUCCESS; } if (n_threads > threadpool->n_threads_max) { - GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n"); + GGML_PRINT("WARNING: cgraph is requesting more threads than the threadpool contains. Expect a bad time!\n"); } #ifdef GGML_USE_OPENMP @@ -20193,14 +20278,9 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl return ret; } -enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { - struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL); - - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); - - cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - - return ggml_graph_compute(cgraph, &cplan); +void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data) { + cgraph->abort_callback = abort_callback; + cgraph->abort_callback_data = abort_data; } struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { @@ -21010,9 +21090,8 @@ static enum ggml_opt_result ggml_opt_adam( float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); - cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + ggml_graph_prepare (gb, params.n_threads, NULL); + ggml_graph_work_init(gb, ctx); bool cancel = false; @@ -21028,7 +21107,7 @@ static enum ggml_opt_result ggml_opt_adam( } // ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(gb, &cplan); + ggml_graph_compute(gb); ggml_opt_acc_grad(np, ps, g, accum_norm); fx += ggml_get_f32_1d(f, 0); } @@ -21119,7 +21198,7 @@ static enum ggml_opt_result ggml_opt_adam( } // ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(gb, &cplan); + ggml_graph_compute(gb); ggml_opt_acc_grad(np, ps, g, accum_norm); fx += ggml_get_f32_1d(f, 0); } @@ -21204,7 +21283,6 @@ static enum ggml_opt_result linesearch_backtracking( const float * xp, struct ggml_tensor * f, struct ggml_cgraph * gb, - struct ggml_cplan * cplan, const int np, struct ggml_tensor * ps[], bool * cancel, @@ -21261,7 +21339,7 @@ static enum ggml_opt_result linesearch_backtracking( } // ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(gb, cplan); + ggml_graph_compute(gb); ggml_opt_acc_grad(np, ps, g, accum_norm); *fx += ggml_get_f32_1d(f, 0); } @@ -21357,9 +21435,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( opt->iter = iter; } - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); - cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + ggml_graph_prepare (gb, params.n_threads, NULL); + ggml_graph_work_init(gb, ctx); float * x = opt->lbfgs.x->data; // current parameters float * xp = opt->lbfgs.xp->data; // previous parameters @@ -21404,7 +21481,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( } // ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(gb, &cplan); + ggml_graph_compute(gb); ggml_opt_acc_grad(np, ps, g, accum_norm); fx += ggml_get_f32_1d(f, 0); } @@ -21470,7 +21547,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // to determine if the optimization should be cancelled // this is a simple change, but not doing this atm, since I don't have a nice // way to test and don't want to break something with so many changes lined up - ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, np, ps, &cancel, callback, callback_data); if (cancel) { return GGML_OPT_RESULT_CANCEL; } diff --git a/src/llama.cpp b/src/llama.cpp index 40db03517..a10341b73 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9877,8 +9877,8 @@ struct llm_build_context { struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { // find result_norm tensor for input struct ggml_tensor * inp = nullptr; - for (int i = gf->n_nodes - 1; i >= 0; --i) { - inp = gf->nodes[i]; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + inp = ggml_graph_node(gf, i); if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { break; } else { @@ -16205,8 +16205,8 @@ static int llama_decode_internal( ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); // the output is always the last tensor in the graph - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embd 
= gf->nodes[gf->n_nodes - 2]; + struct ggml_tensor * res = ggml_graph_node(gf, -1); + struct ggml_tensor * embd = ggml_graph_node(gf, -2); if (lctx.n_outputs == 0) { // no output @@ -16215,9 +16215,9 @@ static int llama_decode_internal( } else if (cparams.embeddings) { res = nullptr; // do not extract logits for embedding case embd = nullptr; - for (int i = gf->n_nodes - 1; i >= 0; --i) { - if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) { - embd = gf->nodes[i]; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { + embd = ggml_graph_node(gf, i); break; } } @@ -16432,15 +16432,15 @@ static int llama_encode_internal( // there are two cases here if (llama_model_has_decoder(&lctx.model)) { // first case is an encoder-decoder T5 model where embeddings are passed to decoder - embd = gf->nodes[gf->n_nodes - 1]; + embd = ggml_graph_node(gf, -1); GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); } else { // second case is an encoder-only T5 model if (cparams.embeddings) { // only output embeddings if required - embd = gf->nodes[gf->n_nodes - 1]; + embd = ggml_graph_node(gf, -1); if (strcmp(embd->name, "result_embd_pooled") != 0) { - embd = gf->nodes[gf->n_nodes - 2]; + embd = ggml_graph_node(gf, -2); } GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); } @@ -18488,7 +18488,7 @@ struct llama_context * llama_new_context_with_model( // note: the number of splits during measure is higher than during inference due to the kv shift int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes); + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(gf)); LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits); } } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 635de01d7..aa7896def 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -519,7 +519,7 @@ struct test_case { // add sentinels as graph nodes so that they are checked in the callback for (ggml_tensor * sentinel : sentinels) { - gf->nodes[gf->n_nodes++] = sentinel; + ggml_graph_add_node(gf, sentinel); } // randomize tensors @@ -679,9 +679,9 @@ struct test_case { // duplicate the op size_t target_size = ggml_backend_is_cpu(backend) ? 
1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU - int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1; + int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1; for (int i = 1; i < n_runs; i++) { - gf->nodes[gf->n_nodes++] = out; + ggml_graph_add_node(gf, out); } // calculate memory @@ -696,11 +696,11 @@ struct test_case { } return size; }; - for (int i = 0; i < gf->n_nodes; i++) { - if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) { + for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { + if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) { continue; } - mem += tensor_op_size(gf->nodes[i]); + mem += tensor_op_size(ggml_graph_node(gf, i)); } // run @@ -804,7 +804,7 @@ struct test_case { ggml_graph_cpy(gf, gb); ggml_build_backward_expand(ctx, gf, gb, false); if (expect.size() != 1 || expect[0] != 0.0f) { - GGML_ASSERT(gb->n_nodes > gf->n_nodes); + GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf)); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE); } diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 1834c11d8..f47cc5299 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -242,12 +242,16 @@ static bool check_gradient( ggml_graph_cpy(gf, gb); ggml_build_backward_expand(ctx0, gf, gb, false); - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); + ggml_graph_prepare(gf, n_threads, nullptr); + ggml_graph_work_init(gf, ctx0); + ggml_graph_compute(gf); ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx0, gb, n_threads); + ggml_graph_prepare(gb, n_threads, nullptr); + ggml_graph_work_init(gb, ctx0); + ggml_graph_compute(gb); // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot"); // ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot"); @@ -262,13 +266,17 @@ static bool check_gradient( const float xp = x0 + eps; ggml_set_f32_1d(x[i], k, xp); - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); + ggml_graph_prepare(gf, n_threads, nullptr); + ggml_graph_work_init(gf, ctx0); + ggml_graph_compute(gf); const double f0 = ggml_get_f32_1d(f, 0); ggml_set_f32_1d(x[i], k, xm); - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); + ggml_graph_prepare(gf, n_threads, nullptr); + ggml_graph_work_init(gf, ctx0); + ggml_graph_compute(gf); const double f1 = ggml_get_f32_1d(f, 0); const double g0 = (f0 - f1)/(2.0*(double) eps); @@ -301,7 +309,9 @@ static bool check_gradient( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx0, gb, n_threads); + ggml_graph_prepare(gb, n_threads, nullptr); + ggml_graph_work_init(gb, ctx0); + ggml_graph_compute(gb); const double g1 = ggml_get_f32_1d(x[i]->grad, k); diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp index 546ca230b..8bc10da7f 100644 --- a/tests/test-opt.cpp +++ b/tests/test-opt.cpp @@ -113,7 +113,10 @@ int main(void) { ggml_build_forward_expand(ge, e); ggml_graph_reset(ge); - ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1); + ggml_graph_prepare(ge, 1, nullptr); + ggml_graph_work_init(ge, nullptr); + ggml_graph_compute(ge); + ggml_graph_work_free(ge); const float fe = ggml_get_f32_1d(e, 0); printf("%s: e = %.4f\n", __func__, fe); @@ -124,7 +127,10 @@ int main(void) { ggml_graph_reset(ge); - ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1); + ggml_graph_prepare(ge, 1, nullptr); + 
ggml_graph_work_init(ge, nullptr); + ggml_graph_compute(ge); + ggml_graph_work_free(ge); const float fe_opt = ggml_get_f32_1d(e, 0); printf("%s: original e = %.4f\n", __func__, fe); diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index 246bb227d..6abcdf25c 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -112,17 +112,6 @@ static struct ggml_tensor * get_random_tensor_f32( return result; } -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - int main(int /*argc*/, const char ** /*argv*/) { struct ggml_init_params params = { /* .mem_size = */ 128*1024*1024, @@ -130,8 +119,6 @@ int main(int /*argc*/, const char ** /*argv*/) { /* .no_alloc = */ false, }; - std::vector work_buffer; - struct ggml_context * ctx0 = ggml_init(params); struct ggml_tensor * x; @@ -175,7 +162,10 @@ int main(int /*argc*/, const char ** /*argv*/) { ggml_build_forward_expand(gf, r1); ggml_build_forward_expand(gf, r2); - ggml_graph_compute_helper(work_buffer, gf, 4); + ggml_graph_prepare(gf, 4, nullptr); + ggml_graph_work_init(gf, nullptr); + ggml_graph_compute(gf); + ggml_graph_work_free(gf); // check that r1 and r2 are the same {
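The test and example hunks above all follow the same migration: ggml_graph_plan()/ggml_cplan and ggml_graph_compute_with_ctx() are replaced by ggml_graph_prepare(), ggml_graph_work_init() and the single-argument ggml_graph_compute(). As a quick orientation aid, here is a standalone sketch of the "dynamic allocations" pattern documented in the new ggml.h comments; it is illustrative only (not part of the patch), and the tensor sizes and thread count are arbitrary:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx = ggml_init(params);

    // build a trivial graph: c = a + b
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(a, 1.0f);
    ggml_set_f32(b, 2.0f);

    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // previously: ggml_graph_plan() + a caller-managed cplan.work_data buffer
    ggml_graph_prepare  (gf, /*n_threads =*/ 4, /*threadpool =*/ NULL);
    ggml_graph_work_init(gf, NULL); // NULL ctx -> work buffer is heap-allocated

    ggml_graph_compute(gf); // can be called many times

    // the output is the last node of the graph
    struct ggml_tensor * out = ggml_graph_node(gf, -1);
    printf("out[0] = %f\n", ggml_get_f32_1d(out, 0));

    ggml_graph_work_free(gf); // required because work_init() was given a NULL context
    ggml_free(ctx);

    return 0;
}

Passing a ggml_context instead of NULL to ggml_graph_work_init() places the work buffer inside that context, in which case the ggml_graph_work_free() call can be dropped, as the test-grad0.cpp hunks above show.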