From cbab212a322b6a3f5f821624d825b360f941b1d7 Mon Sep 17 00:00:00 2001 From: fmz Date: Tue, 28 May 2024 08:47:35 -0700 Subject: [PATCH] Restrict threadpool to CPU backend --- examples/llava/clip.cpp | 2 +- ggml-backend-impl.h | 4 +-- ggml-backend.c | 72 +++++++++++++++++++------------------- ggml-backend.h | 14 ++++---- ggml-cuda.cu | 8 ++--- ggml-kompute.cpp | 7 +--- ggml-metal.m | 7 +--- ggml-opencl.cpp | 7 +--- ggml-rpc.cpp | 5 ++- ggml-sycl.cpp | 8 +---- ggml-vulkan.cpp | 7 +--- ggml.c | 2 +- ggml.h | 2 +- llama.cpp | 5 +-- tests/test-backend-ops.cpp | 4 +-- 15 files changed, 61 insertions(+), 93 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 87f2d8b19..95fbe3d02 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1915,7 +1915,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } #endif - ggml_backend_graph_compute(ctx->backend, gf, NULL); + ggml_backend_graph_compute(ctx->backend, gf); // the last node is the embedding tensor struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index b5d4d0f8d..950711dd5 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -92,14 +92,14 @@ extern "C" { void (*GGML_CALL synchronize)(ggml_backend_t backend); // compute graph with a plan (not used currently) - ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool); + ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph with a plan enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan (async) - enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool); + enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); // check if the backend supports an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); diff --git a/ggml-backend.c b/ggml-backend.c index ae185e7f9..9565e55d3 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -255,13 +255,12 @@ void ggml_backend_synchronize(ggml_backend_t backend) { } ggml_backend_graph_plan_t ggml_backend_graph_plan_create( - ggml_backend_t backend, - const struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool + ggml_backend_t backend, + const struct ggml_cgraph * cgraph ) { GGML_ASSERT(backend->iface.graph_plan_create != NULL); - return backend->iface.graph_plan_create(backend, cgraph, threadpool); + return backend->iface.graph_plan_create(backend, cgraph); } void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { @@ -281,20 +280,18 @@ enum ggml_status ggml_backend_graph_plan_compute( enum ggml_status ggml_backend_graph_compute( ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool + struct ggml_cgraph * cgraph ) { - enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, threadpool); + enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); ggml_backend_synchronize(backend); return err; } enum ggml_status ggml_backend_graph_compute_async( - ggml_backend_t backend, - struct ggml_cgraph 
* cgraph, - ggml_compute_threadpool_t threadpool + ggml_backend_t backend, + struct ggml_cgraph * cgraph ) { - return backend->iface.graph_compute(backend, cgraph, threadpool); + return backend->iface.graph_compute(backend, cgraph); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -741,7 +738,9 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { #endif struct ggml_backend_cpu_context { - int n_threads; + int n_threads; + ggml_compute_threadpool_t threadpool; + void * work_data; size_t work_size; @@ -774,15 +773,14 @@ struct ggml_backend_plan_cpu { }; GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create( - ggml_backend_t backend, - const struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool + ggml_backend_t backend, + const struct ggml_cgraph * cgraph ) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); - cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool); + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { @@ -817,13 +815,12 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe } GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute( - ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool + ggml_backend_t backend, + struct ggml_cgraph * cgraph ) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool); + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); if (cpu_ctx->work_size < cplan.work_size) { free(cpu_ctx->work_data); @@ -892,6 +889,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; ctx->work_data = NULL; ctx->work_size = 0; ctx->abort_callback = NULL; @@ -922,6 +920,13 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } +void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->threadpool = threadpool; +} + void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); @@ -1653,10 +1658,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { return true; } -static enum ggml_status ggml_backend_sched_compute_splits( - ggml_backend_sched_t sched, - ggml_compute_threadpool_t threadpool -) { +static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { struct ggml_backend_sched_split * splits = sched->splits; for (int i = 0; i < sched->n_splits; i++) { @@ -1690,7 +1692,7 @@ static enum ggml_status ggml_backend_sched_compute_splits( } if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, threadpool); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); if (ec != GGML_STATUS_SUCCESS) { 
return ec; } @@ -1712,7 +1714,7 @@ static enum ggml_status ggml_backend_sched_compute_splits( struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, threadpool); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1852,19 +1854,17 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra } enum ggml_status ggml_backend_sched_graph_compute( - ggml_backend_sched_t sched, - struct ggml_cgraph * graph, - ggml_compute_threadpool_t threadpool + ggml_backend_sched_t sched, + struct ggml_cgraph * graph ) { - enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph, threadpool); + enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph); ggml_backend_sched_synchronize(sched); return err; } enum ggml_status ggml_backend_sched_graph_compute_async( - ggml_backend_sched_t sched, - struct ggml_cgraph * graph, - ggml_compute_threadpool_t threadpool + ggml_backend_sched_t sched, + struct ggml_cgraph * graph ) { if (!sched->is_reset && !sched->is_alloc) { ggml_backend_sched_reset(sched); @@ -1876,7 +1876,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async( } } - return ggml_backend_sched_compute_splits(sched, threadpool); + return ggml_backend_sched_compute_splits(sched); } void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) { @@ -2115,8 +2115,8 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1); struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1); - ggml_backend_graph_compute(backend1, &g1v, NULL); - ggml_backend_graph_compute(backend2, &g2v, NULL); + ggml_backend_graph_compute(backend1, &g1v); + ggml_backend_graph_compute(backend2, &g2v); if (ggml_is_view_op(t1->op)) { continue; diff --git a/ggml-backend.h b/ggml-backend.h index b17d9770b..ba3a898cd 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -69,8 +69,7 @@ extern "C" { GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create( ggml_backend_t backend, - const struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool); + const struct ggml_cgraph * cgraph); GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); @@ -79,12 +78,10 @@ extern "C" { ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute( ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool); + struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute_async( ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool); + struct ggml_cgraph * cgraph); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); @@ -112,6 +109,7 @@ extern "C" { GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool); GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); // Create a backend buffer from an existing pointer @@ -205,8 +203,8 @@ extern "C" { 
// Allocate and compute graph on the backend scheduler GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); - GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool); - GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool); + GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched); // Reset all assignments and allocators - must be called before changing the node backends diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d33f8a49b..b82167cbf 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2495,13 +2495,9 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra return true; } -GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute( - ggml_backend_t backend, - ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool) { - - GGML_UNUSED(threadpool); +GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_cuda_set_device(cuda_ctx->device); #ifdef USE_CUDA_GRAPH diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 90272d5f1..6c6058b2a 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1948,12 +1948,7 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g return ggml_backend_kompute_buffer_type(ctx->device); } -static ggml_status ggml_backend_kompute_graph_compute( - ggml_backend_t backend, - struct ggml_cgraph * cgraph - ggml_compute_threadpool_t threadpool) { - - GGML_UNUSED(threadpool); +static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { auto * ctx = static_cast<ggml_kompute_context *>(backend->context); ggml_vk_graph_compute(ctx, cgraph); return GGML_STATUS_SUCCESS; diff --git a/ggml-metal.m b/ggml-metal.m index 051ade2fc..c9e570dbf 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -3103,12 +3103,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe UNUSED(backend); } -GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute( - ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool) { - - UNUSED(threadpool); +GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; return ggml_metal_graph_compute(metal_ctx, cgraph); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 079a718a1..e28566a7b 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -2235,12 +2235,7 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg GGML_UNUSED(backend); } -static ggml_status ggml_backend_opencl_graph_compute( - ggml_backend_t backend, - ggml_cgraph * graph, - ggml_compute_threadpool_t threadpool) { - - GGML_UNUSED(threadpool); +static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * node =
graph->nodes[i]; diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index f3a4fe827..cc1d3ace1 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -585,8 +585,7 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor)); } -GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, ggml_compute_threadpool * tp) { - UNUSED(tp); +GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; std::vector<uint8_t> input; serialize_graph(cgraph, input); @@ -1021,7 +1020,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u graph->nodes[i] = create_node(nodes[i], ctx, tensor_ptrs, tensor_map); } - ggml_status status = ggml_backend_graph_compute(backend, graph, NULL); + ggml_status status = ggml_backend_graph_compute(backend, graph); // output serialization format: | status (1 byte) | output.resize(1, 0); output[0] = status; diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 15d07dc7a..496ec61c3 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -17022,13 +17022,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static ggml_status ggml_backend_sycl_graph_compute( - ggml_backend_t backend, - ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool) { - - GGML_UNUSED(threadpool); - +GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_sycl_set_main_device(sycl_ctx->device); diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 1f7923d65..79ce1479f 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -6225,12 +6225,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) { return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; } -GGML_CALL static ggml_status ggml_backend_vk_graph_compute( - ggml_backend_t backend, - ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool) { - - GGML_UNUSED(threadpool); +GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl; #endif diff --git a/ggml.c b/ggml.c index 463b40f32..5a86c1717 100644 --- a/ggml.c +++ b/ggml.c @@ -19501,7 +19501,7 @@ static void __cpumask_next(const bool * global_mask, bool * local_mask, bool str int32_t base_idx = *iter; for (int32_t i = 0; i < GGML_N_CORES_MAX; i++) { int32_t idx = base_idx + i; - if (idx > GGML_N_CORES_MAX) { + if (idx >= GGML_N_CORES_MAX) { // Just a cheaper modulo idx -= GGML_N_CORES_MAX; } diff --git a/ggml.h b/ggml.h index 314ea8b55..7020cf28f 100644 --- a/ggml.h +++ b/ggml.h @@ -2051,7 +2051,7 @@ extern "C" { const struct ggml_cgraph * cgraph, int n_threads, struct ggml_compute_threadpool * threadpool); - GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); diff --git a/llama.cpp b/llama.cpp index 69b441020..1fc26a399 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11366,10 +11366,11 @@ static void llama_graph_compute( if (lctx.backend_cpu != nullptr) { ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); + ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } - ggml_backend_sched_graph_compute_async(lctx.sched, gf, threadpool); + ggml_backend_sched_graph_compute_async(lctx.sched, gf); // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); } @@ -15428,7 +15429,7 @@ static int llama_apply_lora_from_file_internal( return 1; } - ggml_backend_graph_compute(backend_cpu, gf, nullptr); + ggml_backend_graph_compute(backend_cpu, gf); ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f9d033b52..de74585da 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -587,7 +587,7 @@ struct test_case { ggml_build_forward_expand(gf, out); // warmup run - ggml_backend_graph_compute(backend, gf, nullptr); + ggml_backend_graph_compute(backend, gf); // duplicate the op size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU @@ -619,7 +619,7 @@ struct test_case { ggml_backend_synchronize(backend); int64_t start_time = ggml_time_us(); - ggml_backend_graph_compute(backend, gf, nullptr); + ggml_backend_graph_compute(backend, gf); ggml_backend_synchronize(backend); int64_t end_time = ggml_time_us(); double time_us = end_time - start_time;