use async memcpys to copy the graph outputs to the CPU
This commit is contained in:
parent
23c14ef53e
commit
e73009ea51
3 changed files with 38 additions and 7 deletions
|
@ -772,6 +772,8 @@ struct ggml_backend_sched_split {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_sched {
|
struct ggml_backend_sched {
|
||||||
|
bool is_reset; // true if the scheduler has been reset since the last graph split
|
||||||
|
|
||||||
int n_backends;
|
int n_backends;
|
||||||
ggml_backend_t backends[GGML_MAX_BACKENDS];
|
ggml_backend_t backends[GGML_MAX_BACKENDS];
|
||||||
ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
|
ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
|
||||||
|
@ -968,6 +970,7 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
|
||||||
static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||||
// reset splits
|
// reset splits
|
||||||
sched->n_splits = 0;
|
sched->n_splits = 0;
|
||||||
|
sched->is_reset = false;
|
||||||
|
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
/* .mem_size = */ sizeof(sched->context_buffer),
|
/* .mem_size = */ sizeof(sched->context_buffer),
|
||||||
|
@ -1327,6 +1330,8 @@ static void sched_reset(ggml_backend_sched_t sched) {
|
||||||
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
|
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
|
||||||
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
|
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
|
||||||
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
||||||
|
|
||||||
|
sched->is_reset = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size) {
|
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size) {
|
||||||
|
@ -1352,6 +1357,8 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_bac
|
||||||
sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
|
sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sched_reset(sched);
|
||||||
|
|
||||||
return sched;
|
return sched;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1389,9 +1396,16 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
|
||||||
// Compute a graph with the scheduler: after checking that the hash set is large enough,
// the graph is split (sched_split_graph), the splits are allocated (sched_alloc_splits)
// and then computed (sched_compute_splits).
// NOTE: this span is a rendered diff - statements that appear twice are the pre-change and
// post-change copies of the same line.
void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||||
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
||||||

|
||||||

// post-change only: is_reset is set by sched_reset and cleared by sched_split_graph, so this
// resets the scheduler here only when it has not been reset since the previous split
if (!sched->is_reset) {
|
||||||

sched_reset(sched);
|
||||||

}
|
||||||

|
||||||
sched_split_graph(sched, graph);
|
sched_split_graph(sched, graph);
|
||||||
sched_alloc_splits(sched);
|
sched_alloc_splits(sched);
|
||||||
sched_compute_splits(sched);
|
sched_compute_splits(sched);
|
||||||

}
|
||||||

|
||||||

// post-change only: public wrapper that resets the scheduler by calling sched_reset.
// NOTE(review): the first copy of the sched_reset call and closing brace below appears to be
// the pre-change tail of ggml_backend_sched_graph_compute - confirm against the raw diff
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
||||||
sched_reset(sched);
|
sched_reset(sched);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1417,6 +1431,14 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
|
||||||
node_allocr(node) = sched->tallocs[backend_index];
|
node_allocr(node) = sched->tallocs[backend_index];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return the backend that corresponds to the allocator recorded for `node`
// (the value read through node_allocr), or NULL when no allocator is recorded.
ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
||||||

ggml_tallocr_t allocr = node_allocr(node);
|
||||||

// no allocator recorded for this node: report "no backend" instead of looking one up
if (allocr == NULL) {
|
||||||

return NULL;
|
||||||

}
|
||||||

// translate the allocator into its backend
return get_allocr_backend(sched, allocr);
|
||||||

}
|
||||||
|
|
||||||
// utils
|
// utils
|
||||||
|
|
||||||
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||||
|
|
|
@ -160,11 +160,13 @@ extern "C" {
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
|
GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||||

// Get the backend recorded for a node; returns NULL when the node has no allocator recorded
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||||
|
|
||||||
// Allocate and compute graph on the backend scheduler
|
// Allocate and compute graph on the backend scheduler
|
||||||
GGML_API void ggml_backend_sched_graph_compute(
|
GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
||||||
ggml_backend_sched_t sched,
|
|
||||||
struct ggml_cgraph * graph);
|
// Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
|
||||||
|
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Utils
|
// Utils
|
||||||
|
|
15
llama.cpp
15
llama.cpp
|
@ -6218,6 +6218,8 @@ static int llama_decode_internal(
|
||||||
|
|
||||||
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
||||||
|
|
||||||
|
ggml_backend_sched_reset(lctx.sched);
|
||||||
|
|
||||||
ggml_cgraph * gf = llama_build_graph(lctx, batch);
|
ggml_cgraph * gf = llama_build_graph(lctx, batch);
|
||||||
|
|
||||||
// the output is always the last tensor in the graph
|
// the output is always the last tensor in the graph
|
||||||
|
@ -6311,30 +6313,33 @@ static int llama_decode_internal(
|
||||||
logits_out.clear();
|
logits_out.clear();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
|
||||||
|
GGML_ASSERT(res_backend != nullptr);
|
||||||
if (batch.logits) {
|
if (batch.logits) {
|
||||||
logits_out.resize(n_vocab * n_tokens);
|
logits_out.resize(n_vocab * n_tokens);
|
||||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||||
if (batch.logits[i] == 0) {
|
if (batch.logits[i] == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
|
ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
logits_valid[i] = true;
|
logits_valid[i] = true;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
} else if (lctx.logits_all) {
|
} else if (lctx.logits_all) {
|
||||||
logits_out.resize(n_vocab * n_tokens);
|
logits_out.resize(n_vocab * n_tokens);
|
||||||
ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
|
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
std::fill(logits_valid.begin(), logits_valid.end(), true);
|
std::fill(logits_valid.begin(), logits_valid.end(), true);
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
logits_out.resize(n_vocab);
|
logits_out.resize(n_vocab);
|
||||||
ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
|
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
logits_valid[0] = true;
|
logits_valid[0] = true;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
ggml_backend_synchronize(res_backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
// extract embeddings
|
// extract embeddings
|
||||||
|
@ -6342,7 +6347,9 @@ static int llama_decode_internal(
|
||||||
auto & embedding_out = lctx.embedding;
|
auto & embedding_out = lctx.embedding;
|
||||||
|
|
||||||
embedding_out.resize(n_embd);
|
embedding_out.resize(n_embd);
|
||||||
ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
|
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
|
||||||
|
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
|
||||||
|
ggml_backend_synchronize(embeddings_backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
// measure the performance only for the single-token evals
|
// measure the performance only for the single-token evals
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue