ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)
* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent db0fc2da06
commit e04dc51988

4 changed files with 110 additions and 67 deletions
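Background for the first bullet: RoPE (rotary position embedding) rotates each consecutive pair of values in a head by an angle that depends on the token position, and this commit adds an f16 variant of that kernel to the CUDA backend. The CPU reference sketch below shows only the rotation itself in plain C; it is illustrative, not the kernel's actual code or signature, and parameter names such as freq_base and freq_scale follow common RoPE conventions rather than the exact kernel arguments. A real f16 kernel would additionally load half-precision values, do the math in float, and store half-precision results.

    #include <math.h>
    #include <stdio.h>

    // Reference rotation that a RoPE kernel has to reproduce: pair (x[i], x[i+1])
    // is rotated by theta = freq_scale * pos * freq_base^(-i/n_dims).
    static void rope_ref_f32(float * x, int n_dims, int pos, float freq_base, float freq_scale) {
        for (int i = 0; i < n_dims; i += 2) {
            const float theta = freq_scale * pos * powf(freq_base, -(float) i / n_dims);
            const float c = cosf(theta);
            const float s = sinf(theta);
            const float x0 = x[i];
            const float x1 = x[i + 1];
            x[i]     = x0 * c - x1 * s;
            x[i + 1] = x0 * s + x1 * c;
        }
    }

    int main(void) {
        float x[4] = { 1.0f, 0.0f, 1.0f, 0.0f };
        rope_ref_f32(x, 4, /*pos=*/3, /*freq_base=*/10000.0f, /*freq_scale=*/1.0f);
        for (int i = 0; i < 4; i++) {
            printf("%f\n", x[i]);
        }
        return 0;
    }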
ggml.c (2 changes)

@@ -6343,7 +6343,7 @@ static struct ggml_tensor * ggml_cpy_impl(
     }
 
     // make a view of the destination
-    struct ggml_tensor * result = ggml_view_tensor(ctx, b);
+    struct ggml_tensor * result = b->op == GGML_OP_NONE ? b : ggml_view_tensor(ctx, b);
     if (strlen(b->name) > 0) {
         ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
     } else {
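The hunk above changes ggml_cpy_impl so that when the destination tensor b is a plain leaf (b->op == GGML_OP_NONE), it is reused directly as the result node instead of always allocating a separate view of it, which avoids a redundant view tensor in the compute graph. The sketch below shows a caller hitting this path; it is a minimal example assuming the public ggml API (ggml_init, ggml_new_tensor_1d, ggml_cpy, ggml_free) and assuming the rest of ggml_cpy_impl, not shown in the hunk, returns result unchanged.

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // Small scratch context; the size is an arbitrary illustrative value.
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // src in f32, dst in f16 -- ggml_cpy also performs the type conversion.
        struct ggml_tensor * src = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * dst = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 8);

        // dst is a fresh leaf (dst->op == GGML_OP_NONE), so with this change the
        // copy node returned here should be dst itself rather than a view of dst.
        struct ggml_tensor * cpy = ggml_cpy(ctx, src, dst);

        printf("cpy == dst: %d\n", cpy == dst);

        ggml_free(ctx);
        return 0;
    }

If dst already had a producing op (for example, if it were itself a view into a larger tensor), the ternary keeps the old behavior and wraps it in a new view node as before.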