cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)

* cuBLAS: fall back to pageable memory if pinned alloc fails

* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
slaren authored on 2023-05-01 13:32:22 +02:00, committed by GitHub
parent 90b19bd6ee
commit b925f1f1b0
3 changed files with 52 additions and 9 deletions
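
Only the buffer-management hunks are reproduced below; the companion allocator change (one of the three files touched) is not part of this excerpt. For the fallback to work, ggml_cuda_host_malloc has to return NULL instead of aborting when pinned allocation fails or when GGML_CUDA_NO_PINNED is set. A minimal sketch of that contract, assuming CUDA's runtime API (the structure and warning text are illustrative, not the commit's verbatim code):

// Sketch of the allocator side of the fallback (assumed, not verbatim).
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

void * ggml_cuda_host_malloc(size_t size) {
    // opt-out: behave as if pinned memory were unavailable
    if (getenv("GGML_CUDA_NO_PINNED") != NULL) {
        return NULL;
    }

    void * ptr = NULL;
    cudaError_t err = cudaMallocHost(&ptr, size);
    if (err != cudaSuccess) {
        // warn and return NULL so the caller can fall back to pageable memory
        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
                size / 1024.0 / 1024.0, cudaGetErrorString(err));
        return NULL;
    }

    return ptr;
}

void ggml_cuda_host_free(void * ptr) {
    cudaFreeHost(ptr); // must pair with cudaMallocHost, never with delete[]
}

Returning NULL keeps the failure local: llama_ctx_buffer::resize() in the diff below treats a NULL result as "pinned memory unavailable" and silently switches to new[].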

@@ -395,6 +395,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,27 +406,59 @@ struct llama_buffer {
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;
 
+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
-        if (addr) {
-            ggml_cuda_host_free(addr);
-        }
+        free();
         addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
         this->size = size;
     }
 
-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
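
A minimal usage sketch of the new buffer, assuming a cuBLAS build of llama.cpp (the driver code, sizes, and header path are illustrative assumptions, not from the repository):

#include "llama_util.h" // assumed location of llama_ctx_buffer at this commit

int main() {
    llama_ctx_buffer buf;
    buf.resize(64 * 1024 * 1024);   // pinned host memory if available, pageable otherwise
    // ... stage tensor data in buf.addr for cuBLAS offloading ...
    buf.resize(128 * 1024 * 1024);  // free() runs first and picks the matching deallocator
    return 0;                       // destructor calls free() exactly once
}

Tracking is_cuda is what makes the fallback safe: memory from ggml_cuda_host_malloc must be released with ggml_cuda_host_free, while the new[] fallback requires delete[], and mixing the two is undefined behavior. Deleting the copy and move operations closes the remaining hole: two buffers sharing one addr would otherwise double-free it.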