ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

commit e04dc51988
parent db0fc2da06
Author: slaren
Date:   2023-09-20 13:00:28 +02:00
Committed by: GitHub
4 changed files with 110 additions and 67 deletions

ggml.c (2 changed lines)

@@ -6343,7 +6343,7 @@ static struct ggml_tensor * ggml_cpy_impl(
     }

     // make a view of the destination
-    struct ggml_tensor * result = ggml_view_tensor(ctx, b);
+    struct ggml_tensor * result = b->op == GGML_OP_NONE ? b : ggml_view_tensor(ctx, b);
     if (strlen(b->name) > 0) {
         ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
     } else {
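
The hunk above changes ggml_cpy_impl so that when the copy destination b is a plain leaf tensor (b->op == GGML_OP_NONE) it is reused directly as the result node, and a separate view node is only created when b is itself the output of another op. Below is a minimal standalone sketch of that decision, not the ggml implementation: the tensor struct, the op enum, and make_view are hypothetical stand-ins for the ggml equivalents.

/* sketch: reuse a leaf destination directly, otherwise wrap it in a view */
#include <stdio.h>

enum op { OP_NONE, OP_ADD };              /* stand-in for GGML_OP_NONE, ... */

struct tensor {
    enum op         op;                   /* op that produced this tensor */
    struct tensor * view_src;             /* non-NULL when this tensor is a view */
};

/* stand-in for ggml_view_tensor: wraps b in a new view node */
static struct tensor make_view(struct tensor * b) {
    struct tensor v = { OP_NONE, b };
    return v;
}

int main(void) {
    struct tensor leaf   = { OP_NONE, NULL };   /* plain leaf tensor */
    struct tensor op_out = { OP_ADD,  NULL };   /* output of another op */

    /* same decision as the patched line in ggml_cpy_impl */
    struct tensor r1 = (leaf.op   == OP_NONE) ? leaf   : make_view(&leaf);
    struct tensor r2 = (op_out.op == OP_NONE) ? op_out : make_view(&op_out);

    printf("leaf reused directly:      %s\n", r1.view_src == NULL ? "yes" : "no");
    printf("op output wrapped in view: %s\n", r2.view_src != NULL ? "yes" : "no");
    return 0;
}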