ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)
* ggml-cuda : add rope f16, restore performance * offload KQ_mask with all models * fix rope shift --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
db0fc2da06
commit
e04dc51988
4 changed files with 110 additions and 67 deletions
|
@ -31,6 +31,7 @@ GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tens
|
|||
|
||||
GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
|
||||
GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_cuda_set_main_device(int main_device);
|
||||
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue