falcon : fix CUDA inference by making K and Q contiguous

ggml-ci
Georgi Gerganov 2023-08-27 15:56:03 +03:00
parent 1591e2e590
commit 7c55447f7f


@@ -2635,18 +2635,20 @@ static struct ggml_cgraph * llm_build_falcon(
 
             const size_t wsize = ggml_type_size(cur->type);
 
-            struct ggml_tensor * tmpq = ggml_view_3d(
+            // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+            //       non-contiguous views is added for the rope operator
+            struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
                 ctx0, cur, n_embd_head, n_head, N,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                0);
+                0));
             offload_func_kq(tmpq);
 
-            struct ggml_tensor * tmpk = ggml_view_3d(
+            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
                 ctx0, cur, n_embd_head, n_head_kv, N,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                wsize * n_embd_head * n_head);
+                wsize * n_embd_head * n_head));
             offload_func_kq(tmpk);
 
             struct ggml_tensor * tmpv = ggml_view_3d(
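
For readers outside the llama.cpp tree, below is a minimal sketch of the pattern this commit applies. The Q and K views into the fused QKV projection are strided (consecutive tokens sit n_head + 2*n_head_kv head-widths apart in the fused buffer), and the CUDA rope kernel at the time assumed contiguous input, so ggml_cont is used to insert a copy node that materializes each view contiguously before rope. The helper function make_contiguous_q is hypothetical and exists only for illustration; the ggml calls themselves (ggml_view_3d, ggml_cont, ggml_type_size, ggml_is_contiguous) are real API, though their exact signatures may differ across ggml versions.

    #include "ggml.h"

    // Hypothetical helper mirroring the tmpq half of the hunk above.
    static struct ggml_tensor * make_contiguous_q(
            struct ggml_context * ctx0,
            struct ggml_tensor  * cur,    // output of the fused QKV projection
            int64_t n_embd_head, int64_t n_head, int64_t n_head_kv, int64_t N) {
        const size_t wsize = ggml_type_size(cur->type);

        // Strided view: Q occupies the first n_head head-slots of each token's
        // fused QKV row, so nb2 skips over the K and V slots as well. The view
        // shares memory with cur and is NOT contiguous.
        struct ggml_tensor * q_view = ggml_view_3d(
            ctx0, cur, n_embd_head, n_head, N,
            wsize * n_embd_head,                             // nb1: stride between heads
            wsize * n_embd_head * (n_head + 2 * n_head_kv),  // nb2: stride between tokens
            0);                                              // Q starts at offset 0

        // ggml_cont adds a copy node that lays the view out contiguously;
        // this is what makes the CUDA rope path work in this commit.
        struct ggml_tensor * q = ggml_cont(ctx0, q_view);
        GGML_ASSERT(ggml_is_contiguous(q));
        return q;
    }

As the TODO in the diff notes, the copies are a workaround rather than a requirement of the graph itself: once the CUDA rope operator learns to handle non-contiguous views, both ggml_cont wrappers can be dropped again.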