diff --git a/.gitignore b/.gitignore index 1aabe82fc..d231f3ff8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,15 +1,5 @@ -/*.o -/*.a -/*.sh -/*.log -/*.org - -/ppl-*.txt -/qnt-*.txt -/perf-*.txt - -*.bin - +*.o +*.a .DS_Store .build/ .cache/ @@ -31,9 +21,8 @@ build-no-accel/ build-sanitize-addr/ build-sanitize-thread/ -prompts/ models/* -wikitext-2-raw/ +*.bin /main /quantize @@ -44,7 +33,6 @@ wikitext-2-raw/ /benchmark-matmult /vdot /Pipfile -/libllama.so build-info.h arm_neon.h @@ -55,4 +43,8 @@ __pycache__ zig-out/ zig-cache/ +ppl-*.txt +qnt-*.txt +perf-*.txt + examples/jeopardy/results.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index b46c2adf9..48e3238df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -182,7 +182,6 @@ if (LLAMA_CUBLAS) cmake_minimum_required(VERSION 3.17) find_package(CUDAToolkit) - if (CUDAToolkit_FOUND) message(STATUS "cuBLAS found") @@ -198,23 +197,6 @@ if (LLAMA_CUBLAS) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() - if (CMAKE_VERSION VERSION_LESS 3.25) - if (NOT MSVC) - if (LLAMA_STATIC) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -lcufile_static) - else() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -lcufile) - endif() - else() - message(FATAL "TODO: cufile on Windows") - endif() - else() - if (LLAMA_STATIC) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cufile_static) - else() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cufile) - endif() - endif() else() message(WARNING "cuBLAS not found") endif() diff --git a/Makefile b/Makefile index 93c149edc..f9ec8797a 100644 --- a/Makefile +++ b/Makefile @@ -125,7 +125,7 @@ endif ifdef LLAMA_CUBLAS CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include - LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lcufile -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib + LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib OBJS += ggml-cuda.o NVCC = nvcc NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 6dbbc7950..2be6ece7e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2,13 +2,11 @@ #include #include #include -#include #include #include #include #include -#include #include "ggml-cuda.h" #include "ggml.h" @@ -34,15 +32,6 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); } \ } while (0) -#define CUFILE_CHECK(status) \ - do { \ - CUfileError_t status_ = (status); \ - if (status_.err != CU_FILE_SUCCESS) { \ - fprintf(stderr, "cuFile error %d at %s:%d\n", status_.err, __FILE__, __LINE__); \ - exit(1); \ - } \ - } while (0) - typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1); typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream); @@ -383,7 +372,7 @@ struct cuda_buffer { static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS]; static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; -void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { +static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { scoped_spin_lock lock(g_cuda_pool_lock); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { @@ -402,7 +391,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { return ptr; } -void ggml_cuda_pool_free(void * ptr, size_t size) { +static void ggml_cuda_pool_free(void * ptr, size_t size) { scoped_spin_lock lock(g_cuda_pool_lock); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { @@ -442,9 +431,6 @@ void ggml_init_cublas() { // configure logging to stdout // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); - - // initialize cuFile for loading model parameters directly to VRAM - CUFILE_CHECK(cuFileDriverOpen()); } } @@ -909,32 +895,6 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) { } void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, const int num_tensors, const size_t * offsets) { - CUfileDescr_t cf_descr; - memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t)); - const int fd_cf = open(fname, O_RDONLY|O_DIRECT, 0644); - cf_descr.handle.fd = fd_cf; - cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; - - CUfileHandle_t cf_handle; - CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr); - - if (status.err == CU_FILE_INTERNAL_ERROR) { - fprintf(stderr, "WARNING: cuFile experienced an internal error while loading weights from \"%s\". Using a workaround (slower). " - "This happens with weight files on Btrfs partitions. ext4 and NTFS are confirmed to work.\n", fname); - } else { - for (int i = 0; i < num_tensors; ++i) { - ggml_tensor * tensor = tensors[i]; - const size_t size = ggml_nbytes(tensor); - const size_t offset = offsets[i]; - - size_t actual_size; - void * buf = ggml_cuda_pool_malloc(size, &actual_size); - cuFileRead(cf_handle, buf, size, offset, 0); - tensor->data = buf; - } - return; - } - FILE * fp = fopen(fname, "rb"); for (int i = 0; i < num_tensors; ++i) { diff --git a/ggml-cuda.h b/ggml-cuda.h index b46a804c7..ab2c690b0 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -14,8 +14,6 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens // TODO: export these with GGML_API void * ggml_cuda_host_malloc(size_t size); void ggml_cuda_host_free(void * ptr); -void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size); -void ggml_cuda_pool_free(void * ptr, size_t size); void ggml_cuda_transform_tensor(struct ggml_tensor * tensor); void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, int num_tensors, const size_t * offsets); diff --git a/llama.cpp b/llama.cpp index 431c7eaf6..6a822e6d3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -10,7 +10,6 @@ #include "ggml.h" #ifdef GGML_USE_CUBLAS -#include #include "ggml-cuda.h" #endif