cuBLAS: use host pinned memory and dequantize while copying (#1207)
* cuBLAS: dequantize simultaneously while copying memory
* cuBLAS: use host pinned memory
* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory
* cuBLAS: also pin kv cache
* fix rebase
This commit is contained in:
parent b1ee8f59b4
commit 7fc50c051a
6 changed files with 110 additions and 54 deletions
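The diffs below swap the model, scratch, and KV-cache buffers from llama_buffer to llama_ctx_buffer, a buffer type that is page-locked when built with cuBLAS. A minimal sketch of the idea, assuming cudaMallocHost for the pinned allocation with a plain malloc fallback (the struct and field names here are illustrative, not the commit's exact implementation):

#include <cstdint>
#include <cstdlib>
#ifdef GGML_USE_CUBLAS
#include <cuda_runtime.h>
#endif

// Illustrative context buffer: page-locked when cuBLAS is available,
// plain heap memory otherwise.
struct ctx_buffer {
    uint8_t * addr = NULL;
    size_t    size = 0;
    bool      pinned = false;

    void resize(size_t n) {
        release();
#ifdef GGML_USE_CUBLAS
        if (cudaMallocHost((void **) &addr, n) == cudaSuccess) {
            pinned = true;
        } else {
            addr = (uint8_t *) malloc(n); // fall back to pageable memory
        }
#else
        addr = (uint8_t *) malloc(n);
#endif
        size = n;
    }

    void release() {
#ifdef GGML_USE_CUBLAS
        if (pinned) {
            cudaFreeHost(addr);
            addr = NULL;
        }
#endif
        free(addr); // free(NULL) is a no-op
        addr   = NULL;
        pinned = false;
        size   = 0;
    }

    ~ctx_buffer() { release(); }
};

Pinned (page-locked) pages cannot be swapped out, so cudaMemcpyAsync can DMA directly from them instead of staging through an internal bounce buffer; this both speeds up host-to-device copies and lets them overlap with kernel execution, which is why the compute, scratch, and KV-cache buffers are all switched over.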
@@ -136,7 +136,7 @@ struct llama_kv_cache {
     struct ggml_context * ctx = NULL;
 
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     int n; // number of tokens currently in the cache
@@ -167,7 +167,7 @@ struct llama_model {
     struct llama_kv_cache kv_self;
 
     // the model memory buffer
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     // model memory mapped file
     std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +228,8 @@ struct llama_context {
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-    llama_buffer buf_compute;
-    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    llama_ctx_buffer buf_compute;
+    llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
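The "dequantize while copying" half of the title refers to overlapping transfers and compute on separate CUDA streams: the quantized weights are uploaded and expanded to f32 on one stream while the f32 activations are uploaded on another, and an event orders both before the cuBLAS GEMM. A self-contained sketch of that pattern, with hypothetical names (dequantize_q8, mul_mat_overlapped, and the d_*/h_* pointers) standing in for the commit's actual kernels and buffers:

#include <cublas_v2.h>
#include <cuda_runtime.h>

// Hypothetical dequantize kernel: scales int8 values back to f32,
// one scale per block of block_size values.
__global__ void dequantize_q8(const int8_t * q, const float * scales,
                              float * out, int block_size, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = scales[i / block_size] * (float) q[i];
    }
}

// y = W^T * x, with W quantized on the host; h_* buffers are pinned.
void mul_mat_overlapped(cublasHandle_t handle,
                        const int8_t * h_q, const float * h_scales,
                        const float * h_x,
                        int8_t * d_q, float * d_scales,
                        float * d_w, float * d_x, float * d_y,
                        int m, int n, int k, int block_size) {
    cudaStream_t s_main, s_deq;
    cudaEvent_t  deq_done;
    cudaStreamCreate(&s_main);
    cudaStreamCreate(&s_deq);
    cudaEventCreateWithFlags(&deq_done, cudaEventDisableTiming);

    // side stream: upload quantized weights, dequantize to f32
    const int nw = m * k;
    cudaMemcpyAsync(d_q, h_q, nw * sizeof(int8_t),
                    cudaMemcpyHostToDevice, s_deq);
    cudaMemcpyAsync(d_scales, h_scales, (nw / block_size) * sizeof(float),
                    cudaMemcpyHostToDevice, s_deq);
    dequantize_q8<<<(nw + 255) / 256, 256, 0, s_deq>>>(
        d_q, d_scales, d_w, block_size, nw);
    cudaEventRecord(deq_done, s_deq);

    // main stream: upload activations in parallel with the above
    cudaMemcpyAsync(d_x, h_x, (size_t) k * n * sizeof(float),
                    cudaMemcpyHostToDevice, s_main);

    // the GEMM waits only on the dequantize, not on the whole device
    cudaStreamWaitEvent(s_main, deq_done, 0);
    cublasSetStream(handle, s_main);
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k,
                &alpha, d_w, k, d_x, k, &beta, d_y, m);

    cudaEventDestroy(deq_done);
    cudaStreamDestroy(s_deq);
    cudaStreamDestroy(s_main);
}

Because the host-side buffers are pinned (see the buffer sketch above), both cudaMemcpyAsync calls are genuinely asynchronous, so the weight upload and dequantize on one stream overlap with the activation upload on the other instead of serializing.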