llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

This commit is contained in:
slaren 2024-01-08 18:17:18 +01:00
parent 11583c1462
commit 4ed5f621be
2 changed files with 20 additions and 6 deletions

View file

@@ -9750,7 +9750,6 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
}
char * buf_host = (char *)data + offset_split;
//CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
}
}

View file

@@ -2446,6 +2446,18 @@ struct llama_model_loader {
}
}
void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
GGML_ASSERT(mapping);
*first = mapping->size;
*last = 0;
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
const size_t offs = file_offset(ggml_get_name(tensor));
*first = std::min(*first, offs);
*last = std::max(*last, offs + ggml_nbytes(tensor));
}
}
// for backwards compatibility, does not support ggml-backend
void load_data_for(struct ggml_tensor * cur) const {
const size_t offs = file_offset(ggml_get_name(cur));
@@ -3733,15 +3745,19 @@ static bool llm_load_tensors(
ggml_context * ctx = it.second;
ggml_backend_buffer_t buf = nullptr;
// TODO: do not use the whole model mapping for the buffer, only the region containing the tensors
// this is important for metal: if the entire model could be mapped to a metal buffer, then we could use metal for all layers
// only the region containing the tensors in the model is mapped to the backend buffer
// this is important for metal: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
size_t first, last;
ml.get_mapping_range(&first, &last, ctx);
buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
}
#ifdef GGML_USE_METAL
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
const size_t max_size = ggml_get_max_tensor_size(ctx);
buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
size_t first, last;
ml.get_mapping_range(&first, &last, ctx);
buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
}
#endif
else {
@@ -5922,7 +5938,6 @@ static struct ggml_cgraph * llama_build_graph(
alloc_inp_tokens = true;
}
// FIXME: allocating this conditionally will result in issues with the measure allocator
if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
ggml_tallocr_alloc(lctx.alloc, cur);