From 1bfe5a98867a6057396b6ca4829cedc5c3ae0a10 Mon Sep 17 00:00:00 2001
From: JohannesGaessler
Date: Thu, 18 May 2023 23:57:13 +0200
Subject: [PATCH] fixup! GPU weights not in RAM, direct loading with cuFile

---
 ggml-cuda.cu | 48 ++++++++++++++++++++++++++++++++++++++----------
 ggml-cuda.h  |  4 +---
 llama.cpp    | 24 +-----------------------
 3 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 893c7d4a9..6e6e12aaa 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -383,7 +383,7 @@ struct cuda_buffer {
 static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -402,7 +402,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     return ptr;
 }
 
-void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -908,7 +908,7 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
     tensor->backend = GGML_BACKEND_CUDA;
 }
 
-bool ggml_cuda_load_data_cufile(const char * fname, struct ggml_tensor ** tensors, const int num_tensors, const size_t * offsets) {
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, const int num_tensors, const size_t * offsets) {
     CUfileDescr_t cf_descr;
     memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t));
     const int fd_cf = open(fname, O_RDONLY|O_DIRECT, 0644);
@@ -921,20 +921,48 @@ bool ggml_cuda_load_data_cufile(const char * fname, struct ggml_tensor ** tensor
     if (status.err == CU_FILE_INTERNAL_ERROR) {
         fprintf(stderr, "WARNING: cuFile experienced an internal error while loading weights from \"%s\". Using a workaround (slower). "
             "This happens with weight files on Btrfs partitions. ext4 and NTFS are confirmed to work.\n", fname);
+    } else {
+        for (int i = 0; i < num_tensors; ++i) {
+            ggml_tensor * tensor = tensors[i];
+            const size_t size = ggml_nbytes(tensor);
+            const size_t offset = offsets[i];
+
+            size_t actual_size;
+            void * buf = ggml_cuda_pool_malloc(size, &actual_size);
+            cuFileRead(cf_handle, buf, size, offset, 0);
+            tensor->data = buf;
+        }
+        return;
     }
-    if (status.err != CU_FILE_SUCCESS) {
-        return false;
-    }
+
+    FILE * fp = fopen(fname, "rb");
 
     for (int i = 0; i < num_tensors; ++i) {
         ggml_tensor * tensor = tensors[i];
         const size_t size = ggml_nbytes(tensor);
         const size_t offset = offsets[i];
 
-        size_t actual_size;
-        void * buf = ggml_cuda_pool_malloc(size, &actual_size);
-        cuFileRead(cf_handle, buf, size, offset, 0);
+        void * buf;
+        CUDA_CHECK(cudaMalloc(&buf, size));
+        void * buf_host = malloc(size);
+
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+#else
+        int ret = fseek(fp, (long) offset, SEEK_SET);
+#endif
+        GGML_ASSERT(ret == 0); // same
+
+        size_t ret2 = fread(buf_host, size, 1, fp);
+        if (ret2 != 1) {
+            fprintf(stderr, "unexpectedly reached end of file");
+            exit(1);
+        }
+
+        cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+        cudaDeviceSynchronize();
+
         tensor->data = buf;
+        free(buf_host);
     }
-    return true;
 }
 
diff --git a/ggml-cuda.h b/ggml-cuda.h
index 7485af9f7..ab2c690b0 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -14,11 +14,9 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 // TODO: export these with GGML_API
 void * ggml_cuda_host_malloc(size_t size);
 void ggml_cuda_host_free(void * ptr);
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
-void ggml_cuda_pool_free(void * ptr, size_t size);
 
 void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
-bool ggml_cuda_load_data_cufile(const char * fname, struct ggml_tensor ** tensors, int num_tensors, const size_t * offsets);
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, int num_tensors, const size_t * offsets);
 
 #ifdef __cplusplus
 }
diff --git a/llama.cpp b/llama.cpp
index 644b6a57b..8312ffedf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10,7 +10,6 @@
 #include "ggml.h"
 
 #ifdef GGML_USE_CUBLAS
-#include <cuda_runtime.h>
 #include "ggml-cuda.h"
 #endif
 
@@ -1069,28 +1068,7 @@ static void llama_model_load_internal(
             LLAMA_ASSERT(lt.shards.size() == 1);
             offsets.emplace_back(lt.shards.at(0).file_off);
         }
-        bool cufile_success = ggml_cuda_load_data_cufile(fname.c_str(), tensors.data(), tensors.size(), offsets.data());
-
-        if (!cufile_success) {
-            for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-                if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
-                    continue;
-                }
-                size_t actual_size;
-                void * buf = ggml_cuda_pool_malloc(lt.size, &actual_size);
-                void * buf_host = ggml_cuda_host_malloc(lt.size);
-
-                llama_file & file = ml->file_loaders.at(lt.shards.at(0).file_idx)->file;
-                file.seek(lt.shards.at(0).file_off, SEEK_SET);
-                file.read_raw(buf_host, lt.size);
-
-                cudaMemcpy(buf, buf_host, lt.size, cudaMemcpyHostToDevice);
-                cudaDeviceSynchronize();
-
-                lt.ggml_tensor->data = buf;
-                ggml_cuda_host_free(buf_host);
-            }
-        }
+        ggml_cuda_load_data(fname.c_str(), tensors.data(), tensors.size(), offsets.data());
     }
 #endif // GGML_USE_CUBLAS
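
For reference, below is a minimal standalone sketch (not part of the patch) of the fallback path that ggml_cuda_load_data now takes when cuFile reports CU_FILE_INTERNAL_ERROR: seek to the tensor's byte offset, read its bytes into a host staging buffer with stdio, and upload them to the device with cudaMemcpy. The helper name load_tensor_fallback and its abort-on-error handling are illustrative assumptions, not ggml/llama.cpp API.

// Sketch of the stdio fallback introduced above: read one tensor's bytes from
// the weight file into a pageable host buffer, then copy it to a freshly
// allocated device buffer. load_tensor_fallback is a hypothetical helper.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static void * load_tensor_fallback(const char * fname, size_t offset, size_t size) {
    FILE * fp = std::fopen(fname, "rb");
    if (fp == NULL) {
        std::fprintf(stderr, "failed to open %s\n", fname);
        std::exit(1);
    }

    // Seek to the tensor's byte offset within the weight file.
#ifdef _WIN32
    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
#else
    int ret = std::fseek(fp, (long) offset, SEEK_SET);
#endif
    if (ret != 0) {
        std::fprintf(stderr, "seek to offset %zu failed\n", offset);
        std::exit(1);
    }

    // Stage the tensor bytes in host memory.
    void * buf_host = std::malloc(size);
    if (std::fread(buf_host, size, 1, fp) != 1) {
        std::fprintf(stderr, "unexpectedly reached end of file\n");
        std::exit(1);
    }
    std::fclose(fp);

    // Allocate device memory and upload the staged bytes.
    void * buf = NULL;
    if (cudaMalloc(&buf, size) != cudaSuccess) {
        std::fprintf(stderr, "cudaMalloc of %zu bytes failed\n", size);
        std::exit(1);
    }
    cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();

    std::free(buf_host);
    return buf; // caller stores this pointer in tensor->data
}

Note that the patch's fallback stages through plain malloc, whereas the code it removes from llama.cpp used the pinned ggml_cuda_host_malloc buffer; pinned memory would speed up the host-to-device copy, but the staging buffer here is short-lived, so pageable memory keeps the fallback simple.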