Restrict threadpool to CPU backend
commit cbab212a32
parent 1d9d39a18e

15 changed files with 61 additions and 93 deletions
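In short: the `ggml_compute_threadpool_t` parameter is dropped from the generic graph-compute entry points and from the backend interface, and the threadpool instead lives in the CPU backend's context, installed through the new `ggml_backend_cpu_set_threadpool()`. A minimal caller-side sketch of the migration, assuming `tp` (an existing ggml_compute_threadpool_t), `gf` (a built graph), and the backend/scheduler handles come from elsewhere and are not part of this diff:

    // before this commit, the threadpool was passed to every compute call:
    //     ggml_backend_graph_compute(backend, gf, tp);
    //     ggml_backend_sched_graph_compute(sched, gf, tp);

    // after this commit, only the CPU backend is told about the threadpool, once:
    ggml_backend_cpu_set_threadpool(backend_cpu, tp);
    ggml_backend_graph_compute(backend_cpu, gf);   // no threadpool argument
    ggml_backend_sched_graph_compute(sched, gf);   // scheduler entry point likewise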
@@ -1915,7 +1915,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 #endif
 
-    ggml_backend_graph_compute(ctx->backend, gf, NULL);
+    ggml_backend_graph_compute(ctx->backend, gf);
 
     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
@@ -92,14 +92,14 @@ extern "C" {
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan (not used currently)
-        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool);
+        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
         // compute graph with a plan
         enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
         // compute graph without a plan (async)
-        enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool);
+        enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
@@ -256,12 +256,11 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
 
 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(
         ggml_backend_t backend,
-        const struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool
+        const struct ggml_cgraph * cgraph
 ) {
     GGML_ASSERT(backend->iface.graph_plan_create != NULL);
 
-    return backend->iface.graph_plan_create(backend, cgraph, threadpool);
+    return backend->iface.graph_plan_create(backend, cgraph);
 }
 
 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
@@ -281,20 +280,18 @@ enum ggml_status ggml_backend_graph_plan_compute(
 
 enum ggml_status ggml_backend_graph_compute(
         ggml_backend_t backend,
-        struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool
+        struct ggml_cgraph * cgraph
 ) {
-    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, threadpool);
+    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
     ggml_backend_synchronize(backend);
     return err;
 }
 
 enum ggml_status ggml_backend_graph_compute_async(
         ggml_backend_t backend,
-        struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool
+        struct ggml_cgraph * cgraph
 ) {
-    return backend->iface.graph_compute(backend, cgraph, threadpool);
+    return backend->iface.graph_compute(backend, cgraph);
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -742,6 +739,8 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 
 struct ggml_backend_cpu_context {
     int n_threads;
+    ggml_compute_threadpool_t threadpool;
+
     void * work_data;
     size_t work_size;
 
@@ -775,14 +774,13 @@ struct ggml_backend_plan_cpu {
 
 GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(
         ggml_backend_t backend,
-        const struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool
+        const struct ggml_cgraph * cgraph
 ) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool);
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
     if (cpu_plan->cplan.work_size > 0) {
@@ -818,12 +816,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
 
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(
         ggml_backend_t backend,
-        struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool
+        struct ggml_cgraph * cgraph
 ) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool);
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
     if (cpu_ctx->work_size < cplan.work_size) {
         free(cpu_ctx->work_data);
@@ -892,6 +889,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     }
 
     ctx->n_threads = GGML_DEFAULT_N_THREADS;
+    ctx->threadpool = NULL;
     ctx->work_data = NULL;
     ctx->work_size = 0;
     ctx->abort_callback = NULL;
@@ -922,6 +920,13 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }
 
+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->threadpool = threadpool;
+}
+
 void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
     GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
 
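The setter above only stores the pointer in the CPU backend context; the CPU graph_compute and graph_plan_create callbacks then hand `cpu_ctx->threadpool` to `ggml_graph_plan()` (see the earlier hunks), and `ggml_backend_cpu_init()` defaults it to NULL, which is what the removed call sites used to pass explicitly. A hedged setup sketch, with `tp` and `gf` assumed to be created elsewhere:

    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads (backend_cpu, 8);
    ggml_backend_cpu_set_threadpool(backend_cpu, tp);   // optional; leaving it unset keeps the NULL default
    ggml_backend_graph_compute(backend_cpu, gf);        // no threadpool argument anymore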
@@ -1653,10 +1658,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     return true;
 }
 
-static enum ggml_status ggml_backend_sched_compute_splits(
-        ggml_backend_sched_t sched,
-        ggml_compute_threadpool_t threadpool
-) {
+static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     struct ggml_backend_sched_split * splits = sched->splits;
 
     for (int i = 0; i < sched->n_splits; i++) {
@@ -1690,7 +1692,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(
         }
 
         if (!sched->callback_eval) {
-            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, threadpool);
+            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
             if (ec != GGML_STATUS_SUCCESS) {
                 return ec;
             }
@@ -1712,7 +1714,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(
 
                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
 
-                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, threadpool);
+                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
                 if (ec != GGML_STATUS_SUCCESS) {
                     return ec;
                 }
@@ -1853,18 +1855,16 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra
 
 enum ggml_status ggml_backend_sched_graph_compute(
         ggml_backend_sched_t sched,
-        struct ggml_cgraph * graph,
-        ggml_compute_threadpool_t threadpool
+        struct ggml_cgraph * graph
 ) {
-    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph, threadpool);
+    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
     ggml_backend_sched_synchronize(sched);
     return err;
 }
 
 enum ggml_status ggml_backend_sched_graph_compute_async(
         ggml_backend_sched_t sched,
-        struct ggml_cgraph * graph,
-        ggml_compute_threadpool_t threadpool
+        struct ggml_cgraph * graph
 ) {
     if (!sched->is_reset && !sched->is_alloc) {
         ggml_backend_sched_reset(sched);
@@ -1876,7 +1876,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(
         }
     }
 
-    return ggml_backend_sched_compute_splits(sched, threadpool);
+    return ggml_backend_sched_compute_splits(sched);
 }
 
 void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
@@ -2115,8 +2115,8 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
         struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
         struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
 
-        ggml_backend_graph_compute(backend1, &g1v, NULL);
-        ggml_backend_graph_compute(backend2, &g2v, NULL);
+        ggml_backend_graph_compute(backend1, &g1v);
+        ggml_backend_graph_compute(backend2, &g2v);
 
         if (ggml_is_view_op(t1->op)) {
             continue;
@@ -69,8 +69,7 @@ extern "C" {
 
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(
         ggml_backend_t backend,
-        const struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool);
+        const struct ggml_cgraph * cgraph);
 
     GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
@@ -79,12 +78,10 @@ extern "C" {
         ggml_backend_graph_plan_t plan);
     GGML_API enum ggml_status ggml_backend_graph_compute(
         ggml_backend_t backend,
-        struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool);
+        struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute_async(
         ggml_backend_t backend,
-        struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool);
+        struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
@@ -112,6 +109,7 @@ extern "C" {
 
     GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
     GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool);
     GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
     // Create a backend buffer from an existing pointer
@@ -205,8 +203,8 @@ extern "C" {
 
     // Allocate and compute graph on the backend scheduler
     GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool);
-    GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool);
+    GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
     GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
 
     // Reset all assignments and allocators - must be called before changing the node backends
@@ -2495,13 +2495,9 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
     return true;
 }
 
-GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(
-        ggml_backend_t backend,
-        ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool) {
-
-    GGML_UNUSED(threadpool);
+GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_device(cuda_ctx->device);
 
 #ifdef USE_CUDA_GRAPH
@@ -1948,12 +1948,7 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
     return ggml_backend_kompute_buffer_type(ctx->device);
 }
 
-static ggml_status ggml_backend_kompute_graph_compute(
-        ggml_backend_t backend,
-        struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool) {
-
-    GGML_UNUSED(threadpool);
+static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
     ggml_vk_graph_compute(ctx, cgraph);
     return GGML_STATUS_SUCCESS;
@@ -3103,12 +3103,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
     UNUSED(backend);
 }
 
-GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(
-        ggml_backend_t backend,
-        struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool) {
-
-    UNUSED(threadpool);
+GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
 
     return ggml_metal_graph_compute(metal_ctx, cgraph);
@@ -2235,12 +2235,7 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg
     GGML_UNUSED(backend);
 }
 
-static ggml_status ggml_backend_opencl_graph_compute(
-        ggml_backend_t backend,
-        ggml_cgraph * graph,
-        ggml_compute_threadpool_t threadpool) {
-
-    GGML_UNUSED(threadpool);
+static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
     for (int i = 0; i < graph->n_nodes; ++i) {
         ggml_tensor * node = graph->nodes[i];
 
@@ -585,8 +585,7 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
     memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
 }
 
-GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, ggml_compute_threadpool * tp) {
-    UNUSED(tp);
+GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
     std::vector<uint8_t> input;
     serialize_graph(cgraph, input);
@@ -1021,7 +1020,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
     for (uint32_t i = 0; i < n_nodes; i++) {
         graph->nodes[i] = create_node(nodes[i], ctx, tensor_ptrs, tensor_map);
     }
-    ggml_status status = ggml_backend_graph_compute(backend, graph, NULL);
+    ggml_status status = ggml_backend_graph_compute(backend, graph);
     // output serialization format: | status (1 byte) |
     output.resize(1, 0);
     output[0] = status;
@@ -17022,13 +17022,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
 
-GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(
-        ggml_backend_t backend,
-        ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool) {
-
-    GGML_UNUSED(threadpool);
-
+GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
     ggml_sycl_set_main_device(sycl_ctx->device);
 
@@ -6225,12 +6225,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
     return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
 }
 
-GGML_CALL static ggml_status ggml_backend_vk_graph_compute(
-        ggml_backend_t backend,
-        ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t threadpool) {
-
-    GGML_UNUSED(threadpool);
+GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
 #endif
ggml.c
@@ -19501,7 +19501,7 @@ static void __cpumask_next(const bool * global_mask, bool * local_mask, bool str
     int32_t base_idx = *iter;
     for (int32_t i = 0; i < GGML_N_CORES_MAX; i++) {
         int32_t idx = base_idx + i;
-        if (idx > GGML_N_CORES_MAX) {
+        if (idx >= GGML_N_CORES_MAX) {
             // Just a cheaper modulo
             idx -= GGML_N_CORES_MAX;
         }
@@ -11366,10 +11366,11 @@ static void llama_graph_compute(
 
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
 
-    ggml_backend_sched_graph_compute_async(lctx.sched, gf, threadpool);
+    ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
 }
@@ -15428,7 +15429,7 @@ static int llama_apply_lora_from_file_internal(
         return 1;
     }
 
-    ggml_backend_graph_compute(backend_cpu, gf, nullptr);
+    ggml_backend_graph_compute(backend_cpu, gf);
 
     ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
 
@@ -587,7 +587,7 @@ struct test_case {
         ggml_build_forward_expand(gf, out);
 
         // warmup run
-        ggml_backend_graph_compute(backend, gf, nullptr);
+        ggml_backend_graph_compute(backend, gf);
 
         // duplicate the op
         size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
@@ -619,7 +619,7 @@ struct test_case {
         ggml_backend_synchronize(backend);
 
         int64_t start_time = ggml_time_us();
-        ggml_backend_graph_compute(backend, gf, nullptr);
+        ggml_backend_graph_compute(backend, gf);
         ggml_backend_synchronize(backend);
         int64_t end_time = ggml_time_us();
         double time_us = end_time - start_time;