diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 2be6ece7e..385aa2fd6 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -894,35 +894,31 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
     tensor->backend = GGML_BACKEND_CUDA;
 }
 
-void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, const int num_tensors, const size_t * offsets) {
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
     FILE * fp = fopen(fname, "rb");
 
-    for (int i = 0; i < num_tensors; ++i) {
-        ggml_tensor * tensor = tensors[i];
-        const size_t size = ggml_nbytes(tensor);
-        const size_t offset = offsets[i];
+    const size_t size = ggml_nbytes(tensor);
 
-        void * buf;
-        CUDA_CHECK(cudaMalloc(&buf, size));
-        void * buf_host = malloc(size);
+    void * buf;
+    CUDA_CHECK(cudaMalloc(&buf, size));
+    void * buf_host = malloc(size);
 
 #ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
 #else
-        int ret = fseek(fp, (long) offset, SEEK_SET);
+    int ret = fseek(fp, (long) offset, SEEK_SET);
 #endif
-        GGML_ASSERT(ret == 0); // same
+    GGML_ASSERT(ret == 0); // same
 
-        size_t ret2 = fread(buf_host, size, 1, fp);
-        if (ret2 != 1) {
-            fprintf(stderr, "unexpectedly reached end of file");
-            exit(1);
-        }
-
-        cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
-        cudaDeviceSynchronize();
-
-        tensor->data = buf;
-        free(buf_host);
+    size_t ret2 = fread(buf_host, size, 1, fp);
+    if (ret2 != 1) {
+        fprintf(stderr, "unexpectedly reached end of file");
+        exit(1);
     }
+
+    cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+    cudaDeviceSynchronize();
+
+    tensor->data = buf;
+    free(buf_host);
 }
diff --git a/ggml-cuda.h b/ggml-cuda.h
index ab2c690b0..6a04dde6c 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -16,7 +16,7 @@ void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);
 
 void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
-void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, int num_tensors, const size_t * offsets);
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
 
 #ifdef __cplusplus
 }
diff --git a/llama.cpp b/llama.cpp
index 6a822e6d3..258537a92 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -720,9 +721,6 @@ struct llama_model_loader {
                 lmlock->grow_to(done_size);
             }
         }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
     }
 
     void load_data_for(llama_load_tensor & lt) {
@@ -1104,20 +1102,31 @@ static void llama_model_load_internal(
 
 #ifdef GGML_USE_CUBLAS
     {
-        std::vector<ggml_tensor *> tensors;
-        std::vector<size_t> offsets;
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
+        }
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
             if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
                 continue;
             }
-            tensors.emplace_back(lt.ggml_tensor);
-            LLAMA_ASSERT(lt.shards.size() == 1);
-            offsets.emplace_back(lt.shards.at(0).file_off);
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
         }
-        ggml_cuda_load_data(fname.c_str(), tensors.data(), tensors.size(), offsets.data());
     }
 #endif // GGML_USE_CUBLAS
 
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+
     model.mapping = std::move(ml->mapping);
 
     // loading time will be recalculate after the first eval, so
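
Note on the new `ggml_cuda_load_data`: each call stages one tensor through a host buffer (`cudaMalloc`, `fseek`/`fread`, `cudaMemcpy`). As merged it never `fclose`s `fp` and does not check the `cudaMemcpy` result. A more defensive variant could look like the sketch below; this is an illustration only, not part of the patch, and it assumes the `CUDA_CHECK` and `GGML_ASSERT` macros already defined in ggml-cuda.cu:

```cpp
// Sketch (not part of this patch): same per-tensor load path, with the
// file handle closed and every call checked.
static void ggml_cuda_load_data_checked(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
    FILE * fp = fopen(fname, "rb");
    GGML_ASSERT(fp != NULL);

    const size_t size = ggml_nbytes(tensor);

    void * buf;
    CUDA_CHECK(cudaMalloc(&buf, size));
    void * buf_host = malloc(size);
    GGML_ASSERT(buf_host != NULL);

#ifdef _WIN32
    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
#else
    int ret = fseek(fp, (long) offset, SEEK_SET);
#endif
    GGML_ASSERT(ret == 0);

    size_t nread = fread(buf_host, size, 1, fp);
    GGML_ASSERT(nread == 1); // a short read means a truncated model file

    // a synchronous cudaMemcpy blocks until the copy completes, so no
    // separate cudaDeviceSynchronize() is required here
    CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));

    tensor->data = buf;
    free(buf_host);
    fclose(fp);
}
```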
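The progress reporting in `llama_model_load_internal` pre-seeds `done_size` with the bytes of every CPU-resident tensor, so the fraction passed to `progress_callback` during the VRAM uploads continues from where the mmap phase left off instead of restarting at zero, ending at the explicit `progress_callback(1.0f, ...)` call the patch moves out of `llama_model_loader`. A standalone illustration of that arithmetic, with made-up tensor sizes (not llama.cpp code):

```cpp
#include <cstddef>
#include <cstdio>

// Illustration only: CPU tensors count as already done, GPU tensors
// advance the reported fraction as they are uploaded.
int main() {
    const size_t size[]   = {100, 200, 300, 400};      // bytes per tensor
    const bool   on_gpu[] = {false, true, false, true};
    const int    n = 4;

    size_t done_size = 0;
    size_t data_size = 0;
    for (int i = 0; i < n; ++i) {
        data_size += size[i];
        if (!on_gpu[i]) {
            done_size += size[i];  // loaded via mmap before the CUDA pass
        }
    }

    for (int i = 0; i < n; ++i) {
        if (!on_gpu[i]) {
            continue;
        }
        printf("progress before upload %d: %.2f\n", i, (float) done_size / data_size);
        done_size += size[i];      // ggml_cuda_load_data(...) would run here
    }
    printf("final progress: %.2f\n", (float) done_size / data_size);  // 1.00
    return 0;
}
```

With these sizes the callback would see 0.40 and 0.60 before the two uploads, then 1.00, monotonic across both loading phases.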