gg rebase fixup
This commit is contained in:
parent
909acb3e3f
commit
fee87f6558
6 changed files with 10 additions and 79 deletions
22
.gitignore
vendored
22
.gitignore
vendored
|
@ -1,15 +1,5 @@
|
|||
/*.o
|
||||
/*.a
|
||||
/*.sh
|
||||
/*.log
|
||||
/*.org
|
||||
|
||||
/ppl-*.txt
|
||||
/qnt-*.txt
|
||||
/perf-*.txt
|
||||
|
||||
*.bin
|
||||
|
||||
*.o
|
||||
*.a
|
||||
.DS_Store
|
||||
.build/
|
||||
.cache/
|
||||
|
@ -31,9 +21,8 @@ build-no-accel/
|
|||
build-sanitize-addr/
|
||||
build-sanitize-thread/
|
||||
|
||||
prompts/
|
||||
models/*
|
||||
wikitext-2-raw/
|
||||
*.bin
|
||||
|
||||
/main
|
||||
/quantize
|
||||
|
@ -44,7 +33,6 @@ wikitext-2-raw/
|
|||
/benchmark-matmult
|
||||
/vdot
|
||||
/Pipfile
|
||||
/libllama.so
|
||||
|
||||
build-info.h
|
||||
arm_neon.h
|
||||
|
@ -55,4 +43,8 @@ __pycache__
|
|||
zig-out/
|
||||
zig-cache/
|
||||
|
||||
ppl-*.txt
|
||||
qnt-*.txt
|
||||
perf-*.txt
|
||||
|
||||
examples/jeopardy/results.txt
|
||||
|
|
|
@ -182,7 +182,6 @@ if (LLAMA_CUBLAS)
|
|||
cmake_minimum_required(VERSION 3.17)
|
||||
|
||||
find_package(CUDAToolkit)
|
||||
|
||||
if (CUDAToolkit_FOUND)
|
||||
message(STATUS "cuBLAS found")
|
||||
|
||||
|
@ -198,23 +197,6 @@ if (LLAMA_CUBLAS)
|
|||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
endif()
|
||||
|
||||
if (CMAKE_VERSION VERSION_LESS 3.25)
|
||||
if (NOT MSVC)
|
||||
if (LLAMA_STATIC)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -lcufile_static)
|
||||
else()
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -lcufile)
|
||||
endif()
|
||||
else()
|
||||
message(FATAL "TODO: cufile on Windows")
|
||||
endif()
|
||||
else()
|
||||
if (LLAMA_STATIC)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cufile_static)
|
||||
else()
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cufile)
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
message(WARNING "cuBLAS not found")
|
||||
endif()
|
||||
|
|
2
Makefile
2
Makefile
|
@ -125,7 +125,7 @@ endif
|
|||
ifdef LLAMA_CUBLAS
|
||||
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lcufile -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||
OBJS += ggml-cuda.o
|
||||
NVCC = nvcc
|
||||
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
|
||||
|
|
44
ggml-cuda.cu
44
ggml-cuda.cu
|
@ -2,13 +2,11 @@
|
|||
#include <cstdint>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
#include <atomic>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <cufile.h>
|
||||
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml.h"
|
||||
|
@ -34,15 +32,6 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|||
} \
|
||||
} while (0)
|
||||
|
||||
#define CUFILE_CHECK(status) \
|
||||
do { \
|
||||
CUfileError_t status_ = (status); \
|
||||
if (status_.err != CU_FILE_SUCCESS) { \
|
||||
fprintf(stderr, "cuFile error %d at %s:%d\n", status_.err, __FILE__, __LINE__); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
|
||||
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
|
||||
typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
|
||||
|
@ -383,7 +372,7 @@ struct cuda_buffer {
|
|||
static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
|
||||
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
|
||||
|
||||
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
||||
static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||
|
||||
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
|
||||
|
@ -402,7 +391,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
|||
return ptr;
|
||||
}
|
||||
|
||||
void ggml_cuda_pool_free(void * ptr, size_t size) {
|
||||
static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||
|
||||
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
|
||||
|
@ -442,9 +431,6 @@ void ggml_init_cublas() {
|
|||
|
||||
// configure logging to stdout
|
||||
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
|
||||
|
||||
// initialize cuFile for loading model parameters directly to VRAM
|
||||
CUFILE_CHECK(cuFileDriverOpen());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -909,32 +895,6 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
|
|||
}
|
||||
|
||||
void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, const int num_tensors, const size_t * offsets) {
|
||||
CUfileDescr_t cf_descr;
|
||||
memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t));
|
||||
const int fd_cf = open(fname, O_RDONLY|O_DIRECT, 0644);
|
||||
cf_descr.handle.fd = fd_cf;
|
||||
cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
|
||||
|
||||
CUfileHandle_t cf_handle;
|
||||
CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr);
|
||||
|
||||
if (status.err == CU_FILE_INTERNAL_ERROR) {
|
||||
fprintf(stderr, "WARNING: cuFile experienced an internal error while loading weights from \"%s\". Using a workaround (slower). "
|
||||
"This happens with weight files on Btrfs partitions. ext4 and NTFS are confirmed to work.\n", fname);
|
||||
} else {
|
||||
for (int i = 0; i < num_tensors; ++i) {
|
||||
ggml_tensor * tensor = tensors[i];
|
||||
const size_t size = ggml_nbytes(tensor);
|
||||
const size_t offset = offsets[i];
|
||||
|
||||
size_t actual_size;
|
||||
void * buf = ggml_cuda_pool_malloc(size, &actual_size);
|
||||
cuFileRead(cf_handle, buf, size, offset, 0);
|
||||
tensor->data = buf;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
FILE * fp = fopen(fname, "rb");
|
||||
|
||||
for (int i = 0; i < num_tensors; ++i) {
|
||||
|
|
|
@ -14,8 +14,6 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
|
|||
// TODO: export these with GGML_API
|
||||
void * ggml_cuda_host_malloc(size_t size);
|
||||
void ggml_cuda_host_free(void * ptr);
|
||||
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
|
||||
void ggml_cuda_pool_free(void * ptr, size_t size);
|
||||
|
||||
void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
|
||||
void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, int num_tensors, const size_t * offsets);
|
||||
|
|
|
@ -10,7 +10,6 @@
|
|||
|
||||
#include "ggml.h"
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include <cuda_runtime.h>
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue