diff --git a/.gitignore b/.gitignore
index 1aabe82fc..d231f3ff8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,15 +1,5 @@
-/*.o
-/*.a
-/*.sh
-/*.log
-/*.org
-
-/ppl-*.txt
-/qnt-*.txt
-/perf-*.txt
-
-*.bin
-
+*.o
+*.a
 .DS_Store
 .build/
 .cache/
@@ -31,9 +21,8 @@ build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
 
-prompts/
 models/*
-wikitext-2-raw/
+*.bin
 
 /main
 /quantize
@@ -44,7 +33,6 @@ wikitext-2-raw/
 /benchmark-matmult
 /vdot
 /Pipfile
-/libllama.so
 
 build-info.h
 arm_neon.h
@@ -55,4 +43,8 @@ __pycache__
 zig-out/
 zig-cache/
 
+ppl-*.txt
+qnt-*.txt
+perf-*.txt
+
 examples/jeopardy/results.txt
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b46c2adf9..48e3238df 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -182,7 +182,6 @@ if (LLAMA_CUBLAS)
     cmake_minimum_required(VERSION 3.17)
 
     find_package(CUDAToolkit)
-
     if (CUDAToolkit_FOUND)
         message(STATUS "cuBLAS found")
 
@@ -198,23 +197,6 @@ if (LLAMA_CUBLAS)
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
         endif()
 
-        if (CMAKE_VERSION VERSION_LESS 3.25)
-            if (NOT MSVC)
-                if (LLAMA_STATIC)
-                    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -lcufile_static)
-                else()
-                    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -lcufile)
-                endif()
-            else()
-                message(FATAL "TODO: cufile on Windows")
-            endif()
-        else()
-            if (LLAMA_STATIC)
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cufile_static)
-            else()
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cufile)
-            endif()
-        endif()
     else()
         message(WARNING "cuBLAS not found")
     endif()
diff --git a/Makefile b/Makefile
index 93c149edc..f9ec8797a 100644
--- a/Makefile
+++ b/Makefile
@@ -125,7 +125,7 @@ endif
 ifdef LLAMA_CUBLAS
 	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lcufile -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
 	NVCC      = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 6dbbc7950..2be6ece7e 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2,13 +2,11 @@
 #include <cstdint>
 #include <stdint.h>
 #include <stdio.h>
-#include <fcntl.h>
 #include <atomic>
 
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
-#include <cufile.h>
 
 #include "ggml-cuda.h"
 #include "ggml.h"
@@ -34,15 +32,6 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
         }                                                                               \
     } while (0)
 
-#define CUFILE_CHECK(status)                                                                \
-    do {                                                                                    \
-        CUfileError_t status_ = (status);                                                   \
-        if (status_.err != CU_FILE_SUCCESS) {                                               \
-            fprintf(stderr, "cuFile error %d at %s:%d\n", status_.err, __FILE__, __LINE__); \
-            exit(1);                                                                        \
-        }                                                                                   \
-    } while (0)
-
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
 typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
 typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
@@ -383,7 +372,7 @@ struct cuda_buffer {
 static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -402,7 +391,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     return ptr;
 }
 
-void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -442,9 +431,6 @@ void ggml_init_cublas() {
 
         // configure logging to stdout
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
-
-        // initialize cuFile for loading model parameters directly to VRAM
-        CUFILE_CHECK(cuFileDriverOpen());
     }
 }
 
@@ -909,32 +895,6 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
 }
 
 void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, const int num_tensors, const size_t * offsets) {
-    CUfileDescr_t cf_descr;
-    memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t));
-    const int fd_cf = open(fname, O_RDONLY|O_DIRECT, 0644);
-    cf_descr.handle.fd = fd_cf;
-    cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
-
-    CUfileHandle_t cf_handle;
-    CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr);
-
-    if (status.err == CU_FILE_INTERNAL_ERROR) {
-        fprintf(stderr, "WARNING: cuFile experienced an internal error while loading weights from \"%s\". Using a workaround (slower). "
-                "This happens with weight files on Btrfs partitions. ext4 and NTFS are confirmed to work.\n", fname);
-    } else {
-        for (int i = 0; i < num_tensors; ++i) {
-            ggml_tensor * tensor = tensors[i];
-            const size_t size = ggml_nbytes(tensor);
-            const size_t offset = offsets[i];
-
-            size_t actual_size;
-            void * buf = ggml_cuda_pool_malloc(size, &actual_size);
-            cuFileRead(cf_handle, buf, size, offset, 0);
-            tensor->data = buf;
-        }
-        return;
-    }
-
     FILE * fp = fopen(fname, "rb");
 
     for (int i = 0; i < num_tensors; ++i) {
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b46a804c7..ab2c690b0 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -14,8 +14,6 @@ void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 // TODO: export these with GGML_API
 void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
-void   ggml_cuda_pool_free(void * ptr, size_t size);
 
 void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
 void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, int num_tensors, const size_t * offsets);
diff --git a/llama.cpp b/llama.cpp
index 431c7eaf6..6a822e6d3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10,7 +10,6 @@
 
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
-#include <cuda_runtime.h>
 #include "ggml-cuda.h"
 #endif