gg rebase fixup

JohannesGaessler 2023-05-20 13:27:21 +02:00
parent 909acb3e3f
commit fee87f6558
6 changed files with 10 additions and 79 deletions

.gitignore

@@ -1,15 +1,5 @@
-/*.o
-/*.a
-/*.sh
-/*.log
-/*.org
-
-/ppl-*.txt
-/qnt-*.txt
-/perf-*.txt
-
-*.bin
-
+*.o
+*.a
 .DS_Store
 .build/
 .cache/
@@ -31,9 +21,8 @@ build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
 
-prompts/
 models/*
-wikitext-2-raw/
+*.bin
 
 /main
 /quantize
@@ -44,7 +33,6 @@ wikitext-2-raw/
 /benchmark-matmult
 /vdot
 /Pipfile
-
 /libllama.so
 build-info.h
 arm_neon.h
@@ -55,4 +43,8 @@ __pycache__
 zig-out/
 zig-cache/
 
+ppl-*.txt
+qnt-*.txt
+perf-*.txt
+
 examples/jeopardy/results.txt

CMakeLists.txt

@@ -182,7 +182,6 @@ if (LLAMA_CUBLAS)
     cmake_minimum_required(VERSION 3.17)
-
 
     find_package(CUDAToolkit)
     if (CUDAToolkit_FOUND)
         message(STATUS "cuBLAS found")
 
@@ -198,23 +197,6 @@ if (LLAMA_CUBLAS)
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
         endif()
 
-        if (CMAKE_VERSION VERSION_LESS 3.25)
-            if (NOT MSVC)
-                if (LLAMA_STATIC)
-                    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -lcufile_static)
-                else()
-                    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -lcufile)
-                endif()
-            else()
-                message(FATAL "TODO: cufile on Windows")
-            endif()
-        else()
-            if (LLAMA_STATIC)
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cufile_static)
-            else()
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cufile)
-            endif()
-        endif()
     else()
         message(WARNING "cuBLAS not found")
     endif()

Makefile

@@ -125,7 +125,7 @@ endif
 ifdef LLAMA_CUBLAS
 	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lcufile -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
 	NVCC      = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
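
Note: with -lcufile dropped from LDFLAGS, the binary depends only on the standard CUDA libraries. A minimal, hypothetical smoke test (not part of this commit) to confirm the trimmed link line still resolves the CUDA runtime:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Build with the trimmed LDFLAGS above (-lcudart etc., no -lcufile);
    // if this links and runs, the CUDA runtime dependency is intact.
    int main() {
        int n_devices = 0;
        const cudaError_t err = cudaGetDeviceCount(&n_devices);
        printf("cudaGetDeviceCount: %s (%d device(s))\n", cudaGetErrorString(err), n_devices);
        return err == cudaSuccess ? 0 : 1;
    }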

ggml-cuda.cu

@@ -2,13 +2,11 @@
 #include <cstdint>
 #include <stdint.h>
 #include <stdio.h>
-#include <fcntl.h>
 #include <atomic>
 
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
-#include <cufile.h>
 
 #include "ggml-cuda.h"
 #include "ggml.h"
@@ -34,15 +32,6 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     } \
 } while (0)
 
-#define CUFILE_CHECK(status) \
-do { \
-    CUfileError_t status_ = (status); \
-    if (status_.err != CU_FILE_SUCCESS) { \
-        fprintf(stderr, "cuFile error %d at %s:%d\n", status_.err, __FILE__, __LINE__); \
-        exit(1); \
-    } \
-} while (0)
-
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
 typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
 typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
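
The deleted CUFILE_CHECK mirrored the CUDA status-check macros that remain in this file. For reference, a sketch of that surviving pattern (exact name and message format assumed from context, not shown in this diff):

    #define CUDA_CHECK(err)                                               \
        do {                                                              \
            cudaError_t err_ = (err);                                     \
            if (err_ != cudaSuccess) {                                    \
                fprintf(stderr, "CUDA error %d at %s:%d: %s\n",           \
                    err_, __FILE__, __LINE__, cudaGetErrorString(err_));  \
                exit(1);                                                  \
            }                                                             \
        } while (0)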
@@ -383,7 +372,7 @@ struct cuda_buffer {
 static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -402,7 +391,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     return ptr;
 }
 
-void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -442,9 +431,6 @@ void ggml_init_cublas() {
 
         // configure logging to stdout
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
-
-        // initialize cuFile for loading model parameters directly to VRAM
-        CUFILE_CHECK(cuFileDriverOpen());
     }
 }
@@ -909,32 +895,6 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
 }
 
 void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, const int num_tensors, const size_t * offsets) {
-    CUfileDescr_t cf_descr;
-    memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t));
-    const int fd_cf = open(fname, O_RDONLY|O_DIRECT, 0644);
-    cf_descr.handle.fd = fd_cf;
-    cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
-
-    CUfileHandle_t cf_handle;
-    CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr);
-
-    if (status.err == CU_FILE_INTERNAL_ERROR) {
-        fprintf(stderr, "WARNING: cuFile experienced an internal error while loading weights from \"%s\". Using a workaround (slower). "
-            "This happens with weight files on Btrfs partitions. ext4 and NTFS are confirmed to work.\n", fname);
-    } else {
-        for (int i = 0; i < num_tensors; ++i) {
-            ggml_tensor * tensor = tensors[i];
-            const size_t size = ggml_nbytes(tensor);
-            const size_t offset = offsets[i];
-
-            size_t actual_size;
-            void * buf = ggml_cuda_pool_malloc(size, &actual_size);
-            cuFileRead(cf_handle, buf, size, offset, 0);
-            tensor->data = buf;
-        }
-        return;
-    }
-
     FILE * fp = fopen(fname, "rb");
 
     for (int i = 0; i < num_tensors; ++i) {
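
With the cuFile path gone, the former Btrfs workaround (fopen plus per-tensor reads) is the only load path left. A rough sketch of that remaining flow, assuming it sits inside ggml-cuda.cu (so the includes above plus <vector> are available) and reusing the helpers shown in this diff; the actual upstream loop may differ:

    // Illustrative only: stdio read into a host staging buffer, then a
    // host-to-device copy into a pooled device buffer, one tensor at a time.
    static void load_tensor_via_host(FILE * fp, ggml_tensor * tensor, size_t offset) {
        const size_t size = ggml_nbytes(tensor);

        std::vector<uint8_t> staging(size);        // host staging buffer
        fseek(fp, (long) offset, SEEK_SET);
        if (fread(staging.data(), 1, size, fp) != size) {
            fprintf(stderr, "failed to read tensor data\n");
            exit(1);
        }

        size_t actual_size;
        void * buf = ggml_cuda_pool_malloc(size, &actual_size);
        CUDA_CHECK(cudaMemcpy(buf, staging.data(), size, cudaMemcpyHostToDevice));
        tensor->data = buf;
    }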

ggml-cuda.h

@@ -14,8 +14,6 @@ void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
 // TODO: export these with GGML_API
 void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
-void   ggml_cuda_pool_free(void * ptr, size_t size);
 
 void   ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
 void   ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, int num_tensors, const size_t * offsets);
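
After this commit the pool functions are file-local, so the header's public surface is host alloc/free plus the transform and load entry points. A hypothetical caller sketch (not the actual llama.cpp loader) using the declared signature:

    #include <cstddef>
    #include <vector>
    #include "ggml.h"
    #include "ggml-cuda.h"

    // Hypothetical helper: load every tensor in 'tensors' (with matching
    // byte offsets into the model file 'fname') into GPU buffers at once.
    static void load_all_to_gpu(const char * fname,
                                std::vector<ggml_tensor *> & tensors,
                                std::vector<size_t> & offsets) {
        ggml_cuda_load_data(fname, tensors.data(), (int) tensors.size(), offsets.data());
    }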

llama.cpp

@@ -10,7 +10,6 @@
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
-#include <cuda_runtime.h>
 #include "ggml-cuda.h"
 #endif
 