add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism

default increased to 4 (from 2); changing this value may improve performance for some systems, but increases memory usage
parent 00a415d19b
commit 89bfa1f2ed

5 changed files with 55 additions and 31 deletions
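The new limit can be set at configure/build time through either build system; for example (invocations illustrative):

    cmake -B build -DLLAMA_SCHED_MAX_COPIES=4
    make LLAMA_SCHED_MAX_COPIES=4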
CMakeLists.txt

@@ -118,6 +118,7 @@ option(LLAMA_SYCL "llama: use SYCL"
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
 set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
 option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF)
+set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism")
 
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -147,6 +148,8 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)
 
+add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})
+
 # enable libstdc++ assertions for debug builds
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
     add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
Makefile
@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
 MK_CPPFLAGS += -D_BSD_SOURCE
 endif
 
+ifdef LLAMA_SCHED_MAX_COPIES
+MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
+endif
+
 ifdef LLAMA_DEBUG
 MK_CFLAGS   += -O0 -g
 MK_CXXFLAGS += -O0 -g
ggml-backend.c

@@ -387,7 +387,7 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
 
 // backend registry
 
-#define GGML_MAX_BACKENDS_REG 16
+#define GGML_REG_MAX_BACKENDS 16
 
 struct ggml_backend_reg {
     char name[128];
@@ -396,7 +396,7 @@ struct ggml_backend_reg {
     void * user_data;
 };
 
-static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
+static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
 static size_t ggml_backend_registry_count = 0;
 
 GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
@@ -441,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 }
 
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
-    GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
+    GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
 
     size_t id = ggml_backend_registry_count;
 
@@ -993,16 +993,27 @@ static bool ggml_is_view_op(enum ggml_op op) {
 
 // scheduler
 
-#define GGML_MAX_BACKENDS 16
-#define GGML_MAX_SPLITS 256
-#define GGML_MAX_SPLIT_INPUTS 16
-#define GGML_MAX_COPIES 2
+#ifndef GGML_SCHED_MAX_BACKENDS
+#define GGML_SCHED_MAX_BACKENDS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLITS
+#define GGML_SCHED_MAX_SPLITS 256
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_COPIES
+#define GGML_SCHED_MAX_COPIES 4
+#endif
 
 struct ggml_backend_sched_split {
     int backend_id;
     int i_start;
     int i_end;
-    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
+    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
     // graph view of this split
     struct ggml_cgraph graph;
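Note that each scheduler limit is now wrapped in an #ifndef guard, so a build system can override it with a compile definition such as -DGGML_SCHED_MAX_COPIES=2 (restoring the previous default); this is exactly the mechanism the CMakeLists.txt and Makefile changes above use.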
@@ -1014,15 +1025,15 @@ struct ggml_backend_sched {
 
     int n_backends;
 
-    ggml_backend_t backends[GGML_MAX_BACKENDS];
-    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
+    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
+    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;
 
     // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
     // hash values
     int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS][GGML_MAX_COPIES];
+    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
 
     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]
@@ -1031,14 +1042,14 @@ struct ggml_backend_sched {
     struct ggml_cgraph * graph;
 
     // graph splits
-    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
+    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
     int n_splits;
 
     // pipeline parallelism support
     int n_copies;
     int cur_copy;
-    ggml_backend_event_t events[GGML_MAX_BACKENDS][GGML_MAX_COPIES];
-    struct ggml_tensor * graph_inputs[GGML_MAX_SPLIT_INPUTS];
+    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_graph_inputs;
 
     struct ggml_context * ctx;
@@ -1052,7 +1063,7 @@ struct ggml_backend_sched {
 #else
     __attribute__((aligned(GGML_MEM_ALIGN)))
 #endif
-    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };
 
 #define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
@@ -1089,7 +1100,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
 }
 
 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1395,7 +1406,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         if (tensor_backend_id != cur_backend_id) {
             sched->splits[cur_split].i_end = i;
             cur_split++;
-            GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
+            GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
             sched->splits[cur_split].backend_id = tensor_backend_id;
             sched->splits[cur_split].i_start = i;
             sched->splits[cur_split].n_inputs = 0;
@@ -1433,7 +1444,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     SET_CAUSE(tensor_copy, "4.cpy");
                 }
                 int n_graph_inputs = sched->n_graph_inputs++;
-                GGML_ASSERT(n_graph_inputs < GGML_MAX_SPLIT_INPUTS);
+                GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                 sched->graph_inputs[n_graph_inputs] = src;
             }
         }
@@ -1455,7 +1466,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     SET_CAUSE(tensor_copy, "4.cpy");
                 }
                 int n_inputs = sched->splits[cur_split].n_inputs++;
-                GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                 sched->splits[cur_split].inputs[n_inputs] = src;
             }
             node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
@@ -1507,7 +1518,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
     // create copies of the graph for each split
     // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
@@ -1683,13 +1694,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
         size_t graph_size,
         bool parallel) {
     GGML_ASSERT(n_backends > 0);
-    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
+    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
     GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
 
     // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
     sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
@@ -1697,9 +1708,9 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     sched->n_backends = n_backends;
 
-    sched->n_copies = parallel ? GGML_MAX_COPIES : 1;
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
-    GGML_ASSERT(sched->n_copies <= GGML_MAX_COPIES);
+    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
 
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
@@ -1764,7 +1775,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 }
 
 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
 
     ggml_backend_sched_split_graph(sched, graph);
 
@@ -1812,6 +1823,10 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }
 
+int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+    return sched->n_copies;
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
ggml-backend.h

@@ -184,6 +184,7 @@ extern "C" {
 
     // Get the number of splits of the last graph
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+    GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
 
     GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
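For illustration, a minimal sketch of the new query API (a sketch only: the backend setup is hypothetical, error handling is omitted, and passing NULL for the buffer types is assumed to select each backend's defaults):

    #include <stdio.h>
    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    int main(void) {
        // last backend must be CPU, per the assert in ggml_backend_sched_new
        ggml_backend_t backends[2] = { ggml_backend_cuda_init(0), ggml_backend_cpu_init() };
        ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, /*graph_size =*/ 2048, /*parallel =*/ true);
        printf("input copies: %d\n", ggml_backend_sched_get_n_copies(sched)); // 4 unless overridden at build time
        ggml_backend_sched_free(sched);
        return 0;
    }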
llama.cpp

@@ -13005,11 +13005,12 @@ struct llama_context * llama_new_context_with_model(
         // currently this is only implemented in the CUDA backend
         pipeline_parallel = false;
 #endif
-        if (pipeline_parallel) {
-            LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
-        }
         ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
 
+        if (pipeline_parallel) {
+            LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
+        }
+
         // build worst-case graph
         int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
         int n_past = cparams.n_ctx - n_tokens;
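With pipeline parallelism active, context creation should now report the copy count as well, e.g. "llama_new_context_with_model: pipeline parallelism enabled (n_copies=4)" (output inferred from the format string above).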