Direct I/O and Transparent HugePages

Adds --direct-io for bypassing the page cache (and using THP on Linux). Up to 3-6x faster uncached loading, fewer pageouts, and no page cache pollution.
2024-05-20 21:55:33 +02:00 · 2024-05-20 21:55:33 +02:00 · 1b17ed7ab6
commit 1b17ed7ab6
parent 917dc8cfa6
10 changed files with 297 additions and 30 deletions
--- a/llama.h
+++ b/llama.h
@@ -260,6 +260,7 @@ extern "C" {
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only;    // only load the vocabulary, no weights
        bool use_mmap;      // use mmap if possible
+        bool use_direct_io; // use direct I/O if possible
        bool use_mlock;     // force system to keep model in RAM
        bool check_tensors; // validate model tensor data
    };
@@ -409,6 +410,7 @@ extern "C" {
    LLAMA_API size_t llama_max_devices(void);

    LLAMA_API bool llama_supports_mmap       (void);
+    LLAMA_API bool llama_supports_direct_io  (void);
    LLAMA_API bool llama_supports_mlock      (void);
    LLAMA_API bool llama_supports_gpu_offload(void);