cuda: add q8_0->f32 cpy operation (#9571)

llama: enable K-shift for quantized KV cache
The shift will fail on backends or quantization types that do not support the required copy operations.
Ivan 2024-09-24 03:14:24 +03:00 committed by GitHub
parent 0b3bf966f4
commit 116efee0ee
3 changed files with 82 additions and 9 deletions
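
To see why the new copy path matters: RoPE, which implements the K-shift, only operates on float data, so shifting a quantized K cache requires a dequantize -> rope -> requantize round trip in the compute graph. Below is a minimal sketch of that subgraph using the public ggml API; ggml_is_quantized, ggml_cast, ggml_rope_inplace, and ggml_cpy are real ggml functions, but the function name and the n_rot/rope_mode parameters are illustrative and not taken from this diff.

    #include "ggml.h"

    // Hedged sketch (not the literal llama.cpp code from this commit):
    // build the K-shift subgraph for one K-cache view. For a quantized
    // cache, RoPE must run on F32 data, so the graph dequantizes,
    // rotates, then quantizes back -- which is why the backend needs
    // cpy support between the quant type and F32 in both directions.
    static struct ggml_tensor * build_k_shift_sketch(
            struct ggml_context * ctx,
            struct ggml_tensor  * k_view,  // view into the K cache (possibly quantized)
            struct ggml_tensor  * shift,   // I32 tensor of per-position deltas
            int n_rot, int rope_mode) {
        struct ggml_tensor * cur = k_view;
        if (ggml_is_quantized(cur->type)) {
            cur = ggml_cast(ctx, cur, GGML_TYPE_F32);  // dequantize: quant -> F32
        }
        cur = ggml_rope_inplace(ctx, cur, shift, n_rot, rope_mode);
        if (cur != k_view) {
            cur = ggml_cpy(ctx, cur, k_view);          // requantize back into the cache
        }
        return cur;
    }

If the backend cannot copy between the quant type and F32, supports_op rejects the cpy nodes and the shift fails, which is exactly the failure mode the commit message describes.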

@@ -2899,6 +2899,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
                     return true;
                 }
+                if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
                     return true;
                 }
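
The branch added above only advertises support in supports_op; the dequantizing copy itself is implemented by a CUDA kernel elsewhere in this commit (not shown in this hunk). As a rough sketch of what such a kernel looks like, assuming ggml's block_q8_0 layout (a per-block f16 scale d followed by QK8_0 = 32 int8 quants, so each value is d * qs[i]); the code below is illustrative, not the commit's kernel:

    #include <cuda_fp16.h>

    // Illustrative Q8_0 -> F32 dequantizing copy, based on ggml's documented
    // block_q8_0 layout; not the literal kernel added by this commit.
    #define QK8_0 32

    typedef struct {
        half   d;          // per-block scale
        int8_t qs[QK8_0];  // 32 quantized values
    } block_q8_0;

    static __global__ void cpy_q8_0_f32(const block_q8_0 * src, float * dst, int k) {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= k) {
            return;
        }
        const int ib = i / QK8_0;  // block index
        const int iq = i % QK8_0;  // index within the block
        // dequantize: value = scale * quant
        dst[i] = __half2float(src[ib].d) * (float) src[ib].qs[iq];
    }

With one thread per element, a launch would look like cpy_q8_0_f32<<<(k + 255) / 256, 256>>>(src, dst, k). Together with the pre-existing F32 -> Q8_0 copy, this gives the round trip the K-shift needs.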