diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 5fd625630..f2b89fc24 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6475,6 +6475,10 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         return;
     }
 
+    if (tensor->backend != GGML_BACKEND_CPU) {
+        return;
+    }
+
     // recursively assign CUDA buffers until a compute tensor is found
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
diff --git a/llama.cpp b/llama.cpp
index 2d552e2a7..4943bce39 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2337,7 +2337,6 @@ static struct ggml_cgraph * llm_build_llama(
 
     // Note that we do this even when N==1 so that we don't change the # nodes in the graph,
     // otherwise for Metal we'd have to rebuild the concurrency list.
-    offload_func(cur);
     cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd);
     offload_func(cur);
     ggml_set_name(cur, "cur-lastpos");
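
For context, the guard added in the ggml-cuda.cu hunk returns early for any tensor whose backend is no longer GGML_BACKEND_CPU, so the recursive buffer-assignment walk does not touch tensors that have already been placed on the GPU. Below is a minimal standalone sketch of that pattern, not ggml's real implementation; the types and the assign_gpu_buffer() helper are simplified placeholders introduced only for illustration.

// Sketch of an early-return guard in a recursive buffer-assignment walk:
// tensors already moved off the CPU are skipped, so repeated calls over a
// shared subgraph are a no-op. Placeholder types, not the real ggml API.
#include <cstdio>

enum backend_t { BACKEND_CPU, BACKEND_GPU };

struct tensor_t {
    backend_t   backend = BACKEND_CPU;
    tensor_t *  src[2]  = { nullptr, nullptr };
    const char* name    = "";
};

static void assign_gpu_buffer(tensor_t * t) {
    t->backend = BACKEND_GPU;                 // stands in for the real buffer assignment
    std::printf("assigned %s\n", t->name);
}

static void assign_buffers_impl(tensor_t * tensor) {
    if (tensor->backend != BACKEND_CPU) {
        return;                               // already assigned on a previous visit: skip
    }
    // recursively assign buffers to CPU-resident sources first
    for (tensor_t * src : tensor->src) {
        if (src != nullptr && src->backend == BACKEND_CPU) {
            assign_buffers_impl(src);
        }
    }
    assign_gpu_buffer(tensor);
}

int main() {
    tensor_t a{BACKEND_CPU, {nullptr, nullptr}, "a"};
    tensor_t b{BACKEND_CPU, {&a, nullptr},      "b"};
    tensor_t c{BACKEND_CPU, {&a, &b},           "c"};  // a is reachable twice: via b and directly

    assign_buffers_impl(&c);                  // each tensor is assigned exactly once
    assign_buffers_impl(&c);                  // second call hits the guard and does nothing
}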