From 7c55447f7f25ac2bfbb6fb17dafebe652060a5df Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 27 Aug 2023 15:56:03 +0300
Subject: [PATCH] falcon : fix CUDA inference by making K and Q contiguous

ggml-ci
---
 llama.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index e956c0163..f97679e61 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2635,18 +2635,20 @@ static struct ggml_cgraph * llm_build_falcon(
 
             const size_t wsize = ggml_type_size(cur->type);
 
-            struct ggml_tensor * tmpq = ggml_view_3d(
+            // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+            //       non-contiguous views is added for the rope operator
+            struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
                 ctx0, cur, n_embd_head, n_head, N,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                0);
+                0));
             offload_func_kq(tmpq);
 
-            struct ggml_tensor * tmpk = ggml_view_3d(
+            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
                 ctx0, cur, n_embd_head, n_head_kv, N,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                wsize * n_embd_head * n_head);
+                wsize * n_embd_head * n_head));
             offload_func_kq(tmpk);
 
             struct ggml_tensor * tmpv = ggml_view_3d(
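
Note on the workaround, for reviewers: the Q and K slices of the fused Falcon QKV
projection are strided views whose outer stride spans the whole fused row, so they
are not contiguous in memory; wrapping them in ggml_cont inserts a copy into a
densely laid-out tensor that the CUDA rope kernel can consume. A minimal standalone
sketch of that pattern follows, separate from the diff above; it assumes the
2023-era ggml C API (ggml_init_params with mem_size/mem_buffer/no_alloc,
ggml_view_3d, ggml_cont, ggml_is_contiguous), and the toy dimensions are made up.

#include <assert.h>
#include "ggml.h"

int main(void) {
    // small scratch context; 16 MiB is arbitrary but plenty here
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // toy stand-in for the fused Falcon QKV output: each row holds
    // n_embd_head * (n_head + 2*n_head_kv) floats
    const int n_embd_head = 64, n_head = 8, n_head_kv = 1, N = 4;
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,
            n_embd_head * (n_head + 2*n_head_kv), N);

    const size_t wsize = ggml_type_size(cur->type);

    // the Q slice as a strided view: its outer stride (nb2) spans the whole
    // fused QKV row, so the view is NOT contiguous in memory
    struct ggml_tensor * q_view = ggml_view_3d(ctx, cur,
            n_embd_head, n_head, N,
            wsize * n_embd_head,
            wsize * n_embd_head * (n_head + 2*n_head_kv),
            0);
    assert(!ggml_is_contiguous(q_view));

    // ggml_cont adds a copy node whose result tensor has a dense layout, so
    // operators that reject non-contiguous inputs (rope on CUDA, at the time
    // of this patch) can then consume it
    struct ggml_tensor * q = ggml_cont(ctx, q_view);
    assert(ggml_is_contiguous(q));

    ggml_free(ctx);
    return 0;
}

The trade-off is one extra copy of Q and K per layer, which the TODO in the diff
flags as removable once the CUDA rope kernel accepts non-contiguous views.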