diff --git a/src/llama.cpp b/src/llama.cpp index 4d89c5222..c9e11e688 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17181,7 +17181,7 @@ static void llama_output_reorder(struct llama_context * ctx) { } } -static void llama_graph_compute( +static enum ggml_status llama_graph_compute( llama_context & lctx, ggml_cgraph * gf, int n_threads, @@ -17196,12 +17196,14 @@ static void llama_graph_compute( set_n_threads_fn.second(set_n_threads_fn.first, n_threads); } - auto err = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf); - if (err != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err); + auto status = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); } // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); + + return status; } // decode a batch of tokens by evaluating the transformer @@ -17387,7 +17389,18 @@ static int llama_decode_internal( llama_set_inputs(lctx, ubatch); - llama_graph_compute(lctx, gf, n_threads, threadpool); + const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } // update the kv ring buffer { @@ -17624,7 +17637,18 @@ static int llama_encode_internal( llama_set_inputs(lctx, ubatch); - llama_graph_compute(lctx, gf, n_threads, threadpool); + const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } // extract embeddings if (embd) {