fixup! GPU weights not in RAM, direct loading with cuFile

commit 24d5ddf67c (parent 1bfe5a9886)
2 changed files with 9 additions and 5 deletions
@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
@@ -679,12 +679,16 @@ struct llama_model_loader {
 
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, false));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -2317,7 +2321,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
         }
     }
 
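Presumably the LoRA path passes /* prefetch */ 0 because the base-model tensors are read back one at a time while the adapter is applied, so preloading the whole mapping would only churn the page cache; the commit merely translates the old boolean false into the new size_t interface.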