cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)
* cuBLAS: fall back to pageable memory if pinned alloc fails * cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
This commit is contained in:
parent
90b19bd6ee
commit
b925f1f1b0
3 changed files with 52 additions and 9 deletions
|
@ -727,8 +727,7 @@ struct llama_model_loader {
|
|||
LLAMA_ASSERT(offset == lt.size);
|
||||
} else if (lt.split_type == SPLIT_BY_COLUMNS) {
|
||||
// Let's load the data into temporary buffers to ensure the OS performs large loads.
|
||||
std::vector<llama_buffer> tmp_bufs;
|
||||
tmp_bufs.resize(lt.shards.size());
|
||||
std::vector<llama_buffer> tmp_bufs(lt.shards.size());
|
||||
for (size_t i = 0; i < lt.shards.size(); i++) {
|
||||
llama_load_tensor_shard & shard = lt.shards.at(i);
|
||||
llama_file & file = file_loaders.at(shard.file_idx)->file;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue