Fix multiple offloading

This commit is contained in:
JohannesGaessler 2023-08-28 17:44:10 +02:00
parent 9f5b7813c6
commit e9e8ac4c82
2 changed files with 4 additions and 1 deletion

View file

@@ -6463,6 +6463,10 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
return;
}
if (tensor->backend != GGML_BACKEND_CPU) {
return;
}
// recursively assign CUDA buffers until a compute tensor is found
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
const ggml_op src0_op = tensor->src[0]->op;

View file

@@ -2325,7 +2325,6 @@ static struct ggml_cgraph * llm_build_llama(
// Note that we do this even when N==1 so that we don't change the # nodes in the graph,
// otherwise for Metal we'd have to rebuild the concurrency list.
offload_func(cur);
cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd);
offload_func(cur);
ggml_set_name(cur, "cur-lastpos");