Reset schedule earlier to allow overlap with graph computation on device
Refs #6763
This commit is contained in:
parent
637e9a86c2
commit
a2beaffec8
2 changed files with 13 additions and 5 deletions
|
@ -1780,12 +1780,15 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||||
|
|
||||||
// Reset the scheduler's per-graph state so it is ready for the next run.
//
// The hash-table clears are skipped when the scheduler is already in the
// reset state (is_reset == true), which makes calling this early/eagerly
// cheap — the caller may invoke it before a backend sync so the CPU-side
// memsets overlap with device computation.
//
// sched: the backend scheduler to reset (must be non-NULL).
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
    // reset state for the next run
    if (!sched->is_reset) {
        const size_t hash_size = sched->hash_set.size;

        // clear the tensor hash set and the per-tensor scheduling state
        memset(sched->hash_set.keys,      0, sizeof(sched->hash_set.keys[0])      * hash_size); // NOLINT
        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
        memset(sched->tensor_copies,      0, sizeof(sched->tensor_copies[0])     * hash_size);

        sched->is_reset = true;
    }

    // a reset always invalidates any previously allocated graph
    sched->is_alloc = false;
}
|
||||||
|
|
||||||
|
|
|
@ -16773,6 +16773,11 @@ float * llama_get_logits(struct llama_context * ctx) {
|
||||||
|
|
||||||
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
||||||
int32_t j = -1;
|
int32_t j = -1;
|
||||||
|
|
||||||
|
// Reset state for the next run before the following backend sync,
|
||||||
|
// to allow the CPU activities in the reset to overlap with device computation.
|
||||||
|
ggml_backend_sched_reset(ctx->sched);
|
||||||
|
|
||||||
llama_synchronize(ctx);
|
llama_synchronize(ctx);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue