Compare commits


21 commits

Author | SHA1 | Message | Date
slaren | d273bfd2c9 | allocator: cleanup, more comments | 2023-07-22 15:05:24 +02:00
slaren | 5141472e2b | llama.cpp: print input/output buffers size | 2023-07-22 13:31:06 +02:00
slaren | e2b9575951 | allocator cleanup | 2023-07-22 13:29:44 +02:00
slaren | 7de7882537 | allocator: fix partial offloading | 2023-07-22 02:34:21 +02:00
slaren | e87840f9fd | allocator: automatic inplace operations | 2023-07-21 16:51:50 +02:00
slaren | 3d679827e7 | improved memory management fixes | 2023-07-21 12:59:26 +02:00
slaren | 56e9ae062c | llama.cpp: partially restore state support, graph export | 2023-07-21 12:39:51 +02:00
slaren | 37d3f6a260 | remove unused code | 2023-07-21 02:33:06 +02:00
slaren | cd6f5dec92 | improved memory management | 2023-07-21 00:44:35 +02:00
slaren | de69f8f20d | initial implementation of delayed graph allocation | 2023-07-20 15:57:48 +02:00
slaren | cb205c0d13 | automatically calculate compute buffer sizes (without graph allocator) | 2023-07-20 02:42:36 +02:00
slaren | 77ac8deaf1 | llama.cpp: remove backend-specific code where possible | 2023-07-20 01:01:51 +02:00
slaren | 295f85654a | allocators wip | 2023-07-19 02:43:44 +02:00
       (renamed ggml_backend functions; changed ggml_buffer and ggml_backend to always be used as pointers; rename ggml_tensor::params -> op_params)
slaren | 1102ff56db | fix double-free with --no-mmap | 2023-07-17 12:00:17 +02:00
slaren | 4e94af3060 | improve layer backend printing with ranges | 2023-07-17 11:53:01 +02:00
slaren | c2beeb8e3a | only allocate as much memory as is required in each backend for the model | 2023-07-17 11:21:32 +02:00
slaren | 9c72e7e916 | rebase to master (except ggml-cuda) | 2023-07-16 15:10:46 +02:00
slaren | 33ab185dd1 | fix NVCC version on Makefile, __halves2half2 -> make_half2 | 2023-07-16 14:56:52 +02:00
slaren | 24cc6f008f | minor fixes | 2023-07-16 14:56:52 +02:00
slaren | 5765d7a587 | restore simple.cpp for now | 2023-07-16 14:56:52 +02:00
slaren | 0d2b66c638 | ggml backend interface wip | 2023-07-16 14:56:46 +02:00
       (refactor ggml-cuda)
15 changed files with 5018 additions and 4806 deletions


@@ -308,13 +308,13 @@ jobs:
path: |
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-cublas:
windows-latest-cmake-cuda:
runs-on: windows-latest
strategy:
matrix:
cuda: ['12.1.0', '11.7.1']
build: ['cublas']
build: ['cuda']
steps:
- name: Clone
@@ -333,7 +333,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON
cmake --build . --config Release
- name: Get commit hash
@@ -395,7 +395,7 @@ jobs:
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-cublas
- windows-latest-cmake-cuda
steps:
- name: Download artifacts


@@ -67,7 +67,7 @@ endif()
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
@@ -239,18 +239,18 @@ if (LLAMA_K_QUANTS)
endif()
endif()
if (LLAMA_CUBLAS)
if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
message(STATUS "CUDA found")
enable_language(CUDA)
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_USE_CUDA)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
@@ -280,7 +280,7 @@ if (LLAMA_CUBLAS)
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(WARNING "cuBLAS not found")
message(WARNING "CUDA not found")
endif()
endif()


@@ -55,6 +55,12 @@ else
CXXFLAGS += -DNDEBUG
endif
ifdef LLAMA_SANITIZE
CFLAGS += -g -fsanitize=$(LLAMA_SANITIZE) -fno-omit-frame-pointer
CXXFLAGS += -g -fsanitize=$(LLAMA_SANITIZE) -fno-omit-frame-pointer
LDFLAGS += -g -fsanitize=$(LLAMA_SANITIZE)
endif
ifdef LLAMA_SERVER_VERBOSE
CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif
@@ -163,13 +169,17 @@ ifdef LLAMA_BLIS
LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
ifdef LLAMA_CUDA
CFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler
NVCCV := $(shell $(NVCC) --version | tail -n 1)
ifdef LLAMA_DEBUG
NVCCFLAGS += -lineinfo
endif # LLAMA_DEBUG
ifdef CUDA_DOCKER_ARCH
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else
@@ -198,10 +208,9 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-cuda-kern.h ggml-cuda-quant.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS
endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
@@ -275,6 +284,9 @@ $(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(CCV))
$(info I CXX: $(CXXV))
ifdef LLAMA_CUDA
$(info I NVCC: $(NVCCV))
endif # LLAMA_CUDA
$(info )
#
@@ -284,6 +296,12 @@ $(info )
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@
# temporary, probably will be added to ggml.c
ggml-backend.o: ggml-backend.c ggml-backend.h ggml.h
$(CC) $(CFLAGS) -c $< -o $@
OBJS += ggml-backend.o
llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@


@@ -327,24 +327,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.n_gpu_layers = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU support\n");
#endif
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
params.main_gpu = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set a main GPU.\n");
#endif
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
std::string arg_next = argv[i];
// split string by , and /
@@ -361,14 +361,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
}
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUDA
} else if (arg == "--low-vram" || arg == "-lv") {
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
params.low_vram = true;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
#endif // GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set lower vram usage.\n");
#endif // GGML_USE_CUDA
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {

ggml-backend.c: new file, 1014 lines (diff suppressed because it is too large)

ggml-backend.h: new file, 162 lines

@@ -0,0 +1,162 @@
#pragma once
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
struct ggml_backend;
// backend buffer
typedef void * ggml_buffer_context_t;
struct ggml_backend_buffer;
struct ggml_backend_buffer_interface {
// allocator functions
void (*free_buffer) (struct ggml_backend_buffer * alloc);
void (*alloc_tensor) (struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor);
void (*free_tensor) (struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor);
void (*reset) (struct ggml_backend_buffer * alloc);
// functions overridden by the backend
size_t (*get_alloc_size)(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor); // pre-allocation callback
void (*init_tensor) (struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor); // post-allocation callback
void (*free_data) (struct ggml_backend_buffer * alloc); // free backend-specific data // TODO: better name
};
struct ggml_backend_buffer {
struct ggml_backend_buffer_interface interface;
ggml_buffer_context_t context;
struct ggml_backend * backend;
void * backend_data;
bool measure;
size_t max_size;
};
// backend buffer helper functions
GGML_API void ggml_backend_buffer_free(struct ggml_backend_buffer * alloc);
static inline void ggml_backend_buffer_tensor_alloc(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.alloc_tensor(alloc, tensor); }
static inline void ggml_backend_buffer_tensor_free(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.free_tensor(alloc, tensor); }
static inline void ggml_backend_buffer_reset(struct ggml_backend_buffer * alloc) { alloc->interface.reset(alloc); }
// default buffer allocator
GGML_API struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment);
// buffer
// buffers have space for the tensor structs in host memory, and tensor data in backend-specific memory
struct ggml_buffer {
// host memory
size_t mem_size;
void * mem_buffer;
// tensor data
struct ggml_backend_buffer * backend_buffer;
};
GGML_API struct ggml_buffer * ggml_buffer_alloc (struct ggml_backend * backend, size_t size, size_t max_tensors);
GGML_API struct ggml_buffer * ggml_buffer_measure_alloc(struct ggml_backend * backend, size_t max_tensors);
// measure buffers only calculate the maximum size of the buffer without allocating it - useful for pre-allocation
GGML_API void ggml_buffer_free(struct ggml_buffer * buffer);
// backend
typedef void * ggml_backend_context_t;
typedef void * ggml_graph_plan_t;
struct ggml_backend_interface {
const char * (*get_name)(struct ggml_backend * backend);
void (*free)(struct ggml_backend * backend);
// buffer allocation
struct ggml_backend_buffer * (*alloc_buffer)(struct ggml_backend * backend, size_t size);
// tensor data access
// these functions can be asynchronous. helper functions are provided for synchronous access that automatically call synchronize
void (*set_tensor_async)(struct ggml_backend * backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(struct ggml_backend * backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*synchronize) (struct ggml_backend * backend);
// (optional) copy tensor between different backends, allow for single-copy transfers
void (*cpy_tensor_from)(struct ggml_backend * backend, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to) (struct ggml_backend * backend, struct ggml_tensor * src, struct ggml_tensor * dst);
// compute graph with a plan
ggml_graph_plan_t (*graph_plan_create) (struct ggml_backend * backend, struct ggml_cgraph * cgraph);
void (*graph_plan_free) (struct ggml_backend * backend, ggml_graph_plan_t plan);
void (*graph_plan_compute)(struct ggml_backend * backend, ggml_graph_plan_t plan);
// compute graph without a plan
void (*graph_compute) (struct ggml_backend * backend, struct ggml_cgraph * cgraph);
// check if a backend supports a given operation
// this could be used to fallback automatically to the CPU backend if a backend doesn't support an operation
// bool (*supports_op)(struct ggml_backend * backend, struct ggml_tensor * op);
};
struct ggml_backend {
struct ggml_backend_interface interface;
ggml_backend_context_t context;
};
// backend helper functions
static inline const char * ggml_backend_name(struct ggml_backend * backend) { return backend->interface.get_name(backend); }
static inline void ggml_backend_free(struct ggml_backend * backend) { backend->interface.free(backend); }
static inline void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { tensor->backend->interface.set_tensor_async(tensor->backend, tensor, data, offset, size); }
static inline void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { tensor->backend->interface.get_tensor_async(tensor->backend, tensor, data, offset, size); }
static inline void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { tensor->backend->interface.set_tensor_async(tensor->backend, tensor, data, offset, size); tensor->backend->interface.synchronize(tensor->backend); }
static inline void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { tensor->backend->interface.get_tensor_async(tensor->backend, tensor, data, offset, size); tensor->backend->interface.synchronize(tensor->backend); }
static inline void ggml_backend_synchronize(struct ggml_backend * backend) { backend->interface.synchronize(backend); }
static inline ggml_graph_plan_t ggml_backend_graph_plan_create(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { return backend->interface.graph_plan_create(backend, cgraph); }
static inline void ggml_backend_graph_plan_free(struct ggml_backend * backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_free(backend, plan); }
static inline void ggml_backend_graph_plan_compute(struct ggml_backend * backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_compute(backend, plan); }
static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface.graph_compute(backend, cgraph); }
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// CPU backend
GGML_API struct ggml_backend * ggml_backend_cpu_init(void);
GGML_API void ggml_backend_cpu_set_n_threads(struct ggml_backend * backend_cpu, int n_threads);
///////////////////////////
// graph splitting
#define GGML_MAX_SPLITS 200
#define GGML_MAX_SPLIT_INPUTS 4
struct ggml_graph_split {
char name[GGML_MAX_NAME];
struct ggml_context * ctx;
struct ggml_tensor * src_inputs[GGML_MAX_SPLIT_INPUTS + 1];
struct ggml_tensor * dst_inputs[GGML_MAX_SPLIT_INPUTS + 1];
struct ggml_cgraph * graph;
};
// TODO: this shouldn't be fixed size, allocate from ggml_context
struct ggml_graph_splits {
int n_splits;
struct ggml_graph_split splits[GGML_MAX_SPLITS];
};
// TODO: allocate in ggml_context
struct ggml_graph_splits ggml_graph_split_init(void);
// this won't be needed once we can allocate graphs from a ggml_context
GGML_API void ggml_graph_splits_free(struct ggml_graph_splits * splits);
// add a split to the graph - single and multiple inputs versions
GGML_API void ggml_graph_splits_add(struct ggml_graph_splits * splits, struct ggml_tensor ** input, struct ggml_context * ctx, const char * fmt, ...);
GGML_API void ggml_graph_splits_add_n(struct ggml_graph_splits * splits, struct ggml_tensor *** inputs, struct ggml_context * ctx, const char * fmt, ...);
// build graphs for all splits
GGML_API void ggml_graph_splits_build_forward(struct ggml_graph_splits * splits, struct ggml_tensor * output);
// compute
GGML_API void ggml_graph_splits_compute(struct ggml_graph_splits * splits);
// graph tensor allocator
GGML_API void ggml_graph_allocate_tensors(struct ggml_cgraph * graph, struct ggml_context * ctx);
GGML_API void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits);
#ifdef __cplusplus
}
#endif
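A usage sketch of the new interface (not part of the header above, and only based on the declarations it exposes; how a ggml_context is backed by a ggml_buffer is defined in the suppressed ggml.c/llama.cpp diffs, so `input`, `output` and `graph` are assumed to come from that step):

struct ggml_backend * backend = ggml_backend_cpu_init();
ggml_backend_cpu_set_n_threads(backend, 4);

// room for the tensor structs in host memory plus tensor data in backend memory
struct ggml_buffer * buf = ggml_buffer_alloc(backend, 16*1024*1024, /*max_tensors=*/128);

// ... build input, output and graph in a ggml_context backed by buf (see llama.cpp changes) ...

ggml_backend_tensor_set(input, input_data, 0, ggml_nbytes(input)); // async set + synchronize
ggml_backend_graph_compute(backend, graph);
ggml_backend_tensor_get(output, output_data, 0, ggml_nbytes(output));

ggml_buffer_free(buf);
ggml_backend_free(backend);

A measure buffer from ggml_buffer_measure_alloc can be used first to compute the required size before doing the real allocation.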

ggml-cuda-kern.h: new file, 468 lines

@@ -0,0 +1,468 @@
// kernels for ggml-cuda
#include <cuda.h>
#include <cuda_fp16.h>
template<typename dst_t>
using to_t_cuda_t = void (*)(const void * x, dst_t * y, int k, cudaStream_t stream);
// support for vector types in generic code
template<typename T> struct vec2_t_impl;
template<> struct vec2_t_impl<half> { typedef half2 type; };
template<> struct vec2_t_impl<float> { typedef float2 type; };
template<typename T> using vec2_t = typename vec2_t_impl<T>::type;
template<typename T> inline __host__ __device__ vec2_t<T> make_vec2_t(const T & x, const T & y);
template<> inline __host__ __device__ vec2_t<half> make_vec2_t(const half & x, const half & y) { return make_half2 (x, y); }
template<> inline __host__ __device__ vec2_t<float> make_vec2_t(const float & x, const float & y) { return make_float2(x, y); }
// the cuda headers define operators for half2, but not for float2
// they are defined here to simplify generic code
inline __host__ __device__ float2 operator+(const float2 & a, const float2 & b) { return make_float2(a.x + b.x, a.y + b.y); }
inline __host__ __device__ float2 operator-(const float2 & a, const float2 & b) { return make_float2(a.x - b.x, a.y - b.y); }
inline __host__ __device__ float2 operator*(const float2 & a, const float2 & b) { return make_float2(a.x * b.x, a.y * b.y); }
inline __host__ __device__ float2 operator/(const float2 & a, const float2 & b) { return make_float2(a.x / b.x, a.y / b.y); }
inline __host__ __device__ float2 & operator+=( float2 & a, const float2 & b) { a.x += b.x; a.y += b.y; return a; }
inline __host__ __device__ float2 & operator-=( float2 & a, const float2 & b) { a.x -= b.x; a.y -= b.y; return a; }
inline __host__ __device__ float2 & operator*=( float2 & a, const float2 & b) { a.x *= b.x; a.y *= b.y; return a; }
inline __host__ __device__ float2 & operator/=( float2 & a, const float2 & b) { a.x /= b.x; a.y /= b.y; return a; }
template<typename dst_t>
using dequantize_kernel_t = void (*)(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v);
__device__ half sqrt(const half x) { return hsqrt(x); }
__device__ half exp(const half x) { return hexp(x); }
__device__ half2 exp(const half2 x) { return h2exp(x); }
__device__ half cos(const half x) { return hcos(x); }
__device__ half sin(const half x) { return hsin(x); }
__device__ half max(const half x, const half y) { return __hmax(x, y); }
__device__ half2 max(const half2 x, const half2 y) { return __hmax2(x, y); }
template<typename T> struct op_max { __device__ T operator()(T a, T b) const { return max(a, b); } };
template<typename T> struct op_sum { __device__ T operator()(T a, T b) const { return a + b; } };
template<template<typename> class op_t, typename T>
static inline __device__ T warp_reduce_all(T val) {
op_t<T> op;
#pragma unroll
for (int mask = warpSize/2; mask > 0; mask /= 2) {
val = op(val, __shfl_xor_sync(0xffffffff, val, mask, 32));
}
return val;
}
template<typename T>
static __device__ T zero_init() { return T(0); }
template<>
__device__ half2 zero_init() { return half2(0.0f, 0.0f); }
template<template<typename> class op_t, typename T>
static __device__ T block_reduce_all(const T val, const T init = zero_init<T>()) {
const int warp_id = threadIdx.x / warpSize; // warp id within the block
const int lane_id = threadIdx.x % warpSize; // lane id within the warp
const int num_warps = blockDim.x / warpSize; // number of warps in the block
__shared__ T lane_result[32]; // max 32 warps per block
// reduce warps
T warp_reduction = warp_reduce_all<op_t>(val);
__syncthreads();
// first thread within a warp writes reduction to shared memory
if (lane_id == 0) {
lane_result[warp_id] = warp_reduction;
}
// wait for all warps to finish writing their reductions
__syncthreads();
// reduce the results of all warps
T block_reduction = init;
if (lane_id < num_warps) {
block_reduction = lane_result[lane_id];
}
block_reduction = warp_reduce_all<op_t>(block_reduction);
return block_reduction;
}
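// Illustrative sketch (not part of the diff): how warp_reduce_all/block_reduce_all compose
// into a full kernel, e.g. a per-row sum with one block per row. Assumes blockDim.x is a
// multiple of warpSize, as block_reduce_all requires.
template<typename T>
static __global__ void k_row_sum_example(const T * x, T * out, const int ncols) {
    const int row = blockIdx.x;
    T acc = zero_init<T>();
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        acc += x[row*ncols + col];
    }
    // every thread in the block ends up holding the row total
    acc = block_reduce_all<op_sum>(acc);
    if (threadIdx.x == 0) {
        out[row] = acc;
    }
}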
template<typename dst_t>
static __device__ void convert_fp16(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v) {
const half * x = (const half *) vx;
v.x = (dst_t)(x[ib + iqs + 0]);
v.y = (dst_t)(x[ib + iqs + 1]);
}
template<typename dst_t>
static __device__ void convert_fp32(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v) {
const float * x = (const float *) vx;
v.x = (dst_t)(x[ib + iqs + 0]);
v.y = (dst_t)(x[ib + iqs + 1]);
}
template<typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_mul_mat_p021(const src0_t * vx, const src1_t * y, dst_t * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
const src0_t * x = vx;
// const int col_x = blockDim.x*blockIdx.x + threadIdx.x;
// const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
const int nrows_y = ncols_x;
const int nrows_dst = nrows_x;
const int row_dst = row_x;
dst_t tmp = 0;
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
const int col_x = col_x0 + threadIdx.x;
if (col_x >= ncols_x) {
break;
}
// x is transposed and permuted
const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
const dst_t xi = (dst_t)(x[ix]);
const int row_y = col_x;
// y is not transposed but permuted
const int iy = channel*nrows_y + row_y;
tmp += xi * y[iy];
}
// dst is not transposed and not permuted
const int idst = channel*nrows_dst + row_dst;
// sum up partial sums and write back result
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (threadIdx.x == 0) {
dst[idst] = tmp;
}
}
template<typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_mul_mat_vec_nc(
const src0_t * vx, const src1_t * y, dst_t * dst, const int ncols_x, const int nrows_x,
const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
const src0_t * x = vx;
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
const int nrows_y = ncols_x;
const int nrows_dst = nrows_x;
const int row_dst = row_x;
const int idst = channel*nrows_dst + row_dst;
dst_t tmp = 0;
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
const int col_x = col_x0 + threadIdx.x;
if (col_x >= ncols_x) {
break;
}
const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
const dst_t xi = (dst_t)(x[ix]);
const int row_y = col_x;
const int iy = channel*nrows_y + row_y;
tmp += xi * y[iy];
}
// sum up partial sums and write back result
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (threadIdx.x == 0) {
dst[idst] = tmp;
}
}
template <typename src_t, typename dst_t>
static __global__ void k_cpy(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) {
return;
}
const int i02 = i / (ne00*ne01);
const int i01 = (i - i02*ne01*ne00) / ne00;
const int i00 = i - i02*ne01*ne00 - i01*ne00;
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
const int i12 = i / (ne10*ne11);
const int i11 = (i - i12*ne10*ne11) / ne10;
const int i10 = i - i12*ne10*ne11 - i11*ne10;
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
*(dst_t *)(cdst + dst_offset) = *(const src_t *)(cx + x_offset);
}
template<typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_add(const src0_t * x, const src1_t * y, dst_t * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = (dst_t)x[i] + (dst_t)y[i];
}
template<typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_mul(const src0_t * x, const src1_t * y, dst_t * dst, const int kx, const int ky) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= kx) {
return;
}
dst[i] = (dst_t)x[i] * (dst_t)y[i%ky];
}
template<typename src0_t, typename dst_t>
static __global__ void k_silu(const src0_t * x, dst_t * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = x[i] / (src0_t(1) + exp(-x[i]));
}
// TODO: unstable with f16 compute, using f32 compute for now
template<typename src0_t, typename dst_t>
static __global__ void k_rms_norm(const src0_t * x, dst_t * dst, const int ncols) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
const float eps = 1e-6;
float tmp = 0; // partial sum for thread in warp
for (int col = tid; col < ncols; col += WARP_SIZE) {
const float xi = x[row*ncols + col];
tmp += xi * xi;
}
// sum up partial sums
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
const float mean = tmp / (float)ncols;
const float scale = 1.0f / sqrtf(mean + eps);
for (int col = tid; col < ncols; col += WARP_SIZE) {
dst[row*ncols + col] = scale * (float)x[row*ncols + col];
}
}
template<typename src0_t, typename dst_t>
static __global__ void k_rope(const src0_t * x, dst_t * dst, const int ncols, const float p, const float theta_scale) {
const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
if (col >= ncols) {
return;
}
const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int i = row*ncols + col;
const dst_t theta = p * powf(theta_scale, col/2);
const dst_t sin_theta = sin(theta);
const dst_t cos_theta = cos(theta);
const dst_t x0 = x[i + 0];
const dst_t x1 = x[i + 1];
dst[i + 0] = (dst_t)x0*cos_theta - (dst_t)x1*sin_theta;
dst[i + 1] = (dst_t)x0*sin_theta + (dst_t)x1*cos_theta;
}
template<typename src0_t, typename dst_t>
static __global__ void k_diag_mask_inf(const src0_t * x, dst_t * dst, const int ncols, const int rows_per_channel, const int n_past) {
const int col = blockDim.x*blockIdx.x + threadIdx.x;
const int row = blockDim.y*blockIdx.y + threadIdx.y;
if (col >= ncols) {
return;
}
const int i = row*ncols + col;
//dst[i] = col > (n_past + row % rows_per_channel) ? (dst_t)-INFINITY : (dst_t)x[i];
dst[i] = (dst_t)x[i] - (dst_t)((col > n_past + row % rows_per_channel) * INT_MAX); // equivalent within rounding error but slightly faster on GPU
}
// TODO: numerically stable version - low prio since the softmax is computed in the fused attention kernel
// check: https://arxiv.org/pdf/2001.04438.pdf
template<typename src0_t, typename dst_t>
static __global__ void k_soft_max_orig(const src0_t * x, dst_t * dst, const int ncols) {
const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int block_size = blockDim.x;
const int tid = threadIdx.x;
float tmp = 0;
for (int block_start = 0; block_start < ncols; block_start += block_size) {
const int col = block_start + tid;
if (col >= ncols) {
break;
}
const int i = row*ncols + col;
const float val = expf(x[i]);
tmp += val;
dst[i] = val;
}
// sum up partial sums
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
for (int block_start = 0; block_start < ncols; block_start += block_size) {
const int col = block_start + tid;
if (col >= ncols) {
break;
}
const int i = row*ncols + col;
dst[i] /= tmp;
}
}
template<typename src_t, typename dst_t, int pack_size, int block_size>
static __global__ void k_soft_max(const src_t * x, dst_t * dst, const int64_t nrows, const int64_t ncols) {
//assert(ncols % pack_size == 0);
const int tid = threadIdx.x;
const int num_packs = ncols / pack_size;
for (int row = blockIdx.x; row < nrows; row += gridDim.x) {
src_t th_max = -INFINITY;
// row max thread
#pragma unroll
for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
// load pack
src_t pack[pack_size];
#pragma unroll
for (int i = 0; i < pack_size; i++) {
pack[i] = x[row * ncols + pack_id * pack_size + i];
}
// reduce max pack
#pragma unroll
for (int i = 0; i < pack_size; ++i) {
th_max = max(th_max, pack[i]);
}
}
// reduce max row warp threads
src_t row_max = block_reduce_all<op_max>(th_max, (src_t)-INFINITY);
// row exp sum thread
src_t th_sum = 0;
#pragma unroll
for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
// load pack
src_t pack[pack_size];
#pragma unroll
for (int i = 0; i < pack_size; i++) {
pack[i] = x[row * ncols + pack_id * pack_size + i];
}
// reduce pack
#pragma unroll
for (int i = 0; i < pack_size; ++i) {
th_sum += exp(pack[i] - row_max);
}
}
// reduce row exp sum all threads
src_t row_sum = block_reduce_all<op_sum>(th_sum);
// store (row - row_max) / row exp sum
#pragma unroll
for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
// load pack
src_t pack[pack_size];
#pragma unroll
for (int i = 0; i < pack_size; i++) {
pack[i] = x[row * ncols + pack_id * pack_size + i];
}
// reduce pack
#pragma unroll
for (int i = 0; i < pack_size; ++i) {
pack[i] = exp(pack[i] - row_max) / row_sum;
}
// store pack
#pragma unroll
for (int i = 0; i < pack_size; i++) {
dst[row * ncols + pack_id * pack_size + i] = pack[i];
}
}
}
}
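// For reference (not part of the diff): k_soft_max above is the numerically stable per-row
// softmax, softmax(x)_i = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j), computed in
// three passes (row max, exp-sum, normalize), unlike k_soft_max_orig. A scalar sketch of one
// row (needs <math.h> for fmaxf/expf/INFINITY):
static void soft_max_row_ref(const float * x, float * dst, const int ncols) {
    float row_max = -INFINITY;
    for (int i = 0; i < ncols; i++) row_max = fmaxf(row_max, x[i]);              // pass 1: row max
    float row_sum = 0.0f;
    for (int i = 0; i < ncols; i++) row_sum += expf(x[i] - row_max);             // pass 2: exp sum
    for (int i = 0; i < ncols; i++) dst[i] = expf(x[i] - row_max) / row_sum;     // pass 3: normalize
}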
template<typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_scale(const src0_t * x, dst_t * dst, const src1_t * scale, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = (dst_t)(*scale) * (dst_t)x[i];
}
template<typename dst_t, int qk, int qr, dequantize_kernel_t<dst_t> dequantize_kernel>
static __global__ void k_get_rows(const void * x, const int * y, dst_t * dst, const int ncols) {
const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
const int row = blockDim.y*blockIdx.y + threadIdx.y;
if (col >= ncols) {
return;
}
const int r = y[row];
// copy x[r*ncols + col] to dst[row*ncols + col]
const int xi = r*ncols + col;
const int di = row*ncols + col;
const int ib = xi/qk; // block index
const int iqs = (xi%qk)/qr; // quant index
const int iybs = di - di%qk; // y block start index
const int y_offset = qr == 1 ? 1 : qk/2;
// dequantize
vec2_t<dst_t> v;
dequantize_kernel(x, ib, iqs, v);
dst[iybs + iqs + 0] = v.x;
dst[iybs + iqs + y_offset] = v.y;
}

ggml-cuda-quant.h: new file, 920 lines

@@ -0,0 +1,920 @@
// quants kernels for ggml-cuda
// QK = number of values after dequantization
// QR = QK / number of values before dequantization
// QI = number of 32 bit integers before dequantization
#define QK4_0 32
#define QR4_0 2
#define QI4_0 4
typedef struct {
half d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
#define QK4_1 32
#define QR4_1 2
#define QI4_1 4
typedef struct {
half d; // delta
half m; // min
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK5_0 32
#define QR5_0 2
#define QI5_0 4
typedef struct {
half d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
#define QK5_1 32
#define QR5_1 2
#define QI5_1 4
typedef struct {
half d; // delta
half m; // min
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
#define QK8_0 32
#define QR8_0 1
#define QI8_0 8
typedef struct {
half d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
#define QK8_1 32
#define QR8_1 1
#define QI8_1 8
typedef struct {
half d; // delta
half s; // unquantized sum
int8_t qs[QK8_0]; // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
//================================= k-quants
#define QK_K 256
typedef struct {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
typedef struct {
uint8_t hmask[QK_K/8];
uint8_t qs[QK_K/4]; // nibbles / quants
uint8_t scales[3*QK_K/64];
half d;
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
typedef struct {
half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
typedef struct {
half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
typedef struct {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales
half d; // delta
} block_q6_K;
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
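// Sanity check on the layouts above (not part of the diff): the effective bits per weight
// follow directly from the block sizes, e.g. block_q4_0 is 2 bytes (half d) + 16 bytes of
// nibbles for 32 weights = 18 bytes -> 4.5 bpw, and block_q4_K is 4 + 12 + 128 = 144 bytes
// for 256 weights -> 4.5 bpw.
static inline float bits_per_weight(size_t block_size_bytes, int weights_per_block) {
    return 8.0f * block_size_bytes / weights_per_block;
}
// bits_per_weight(sizeof(block_q4_0), QK4_0) == 4.5f
// bits_per_weight(sizeof(block_q8_0), QK8_0) == 8.5f    (2 + 32 bytes per 32 weights)
// bits_per_weight(sizeof(block_q6_K), QK_K)  == 6.5625f (2 + 13*256/16 bytes per 256 weights)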
template<typename src1_t, typename dst_t>
using dot_kernel_k_t = void (*)(const void * vx, const int ib, const int iqs, const src1_t * y, dst_t & v);
template<typename dst_t>
using vec_dot_q_cuda_t = dst_t (*)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
// TODO: f16
template<typename src_t>
static __global__ void quantize_q8_1(const src_t * x, void * vy, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
block_q8_1 * y = (block_q8_1 *) vy;
const int ib = i / QK8_0; // block index
const int iqs = i % QK8_0; // quant index
const float xi = x[i];
float amax = fabsf(xi);
float sum = xi;
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
}
const float d = amax / 127;
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
y[ib].qs[iqs] = q;
if (iqs > 0) {
return;
}
y[ib].d = d;
y[ib].s = sum;
}
template<typename dst_t>
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
const block_q4_0 * x = (const block_q4_0 *) vx;
const dst_t d = x[ib].d;
const uint8_t vui = x[ib].qs[iqs];
v.x = vui & 0xF;
v.y = vui >> 4;
const vec2_t<dst_t> off2 = make_vec2_t<dst_t>(8, 8);
const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
v = (v - off2) * d2;
}
template<typename dst_t>
static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
const block_q4_1 * x = (const block_q4_1 *) vx;
const dst_t d = x[ib].d;
const dst_t m = x[ib].m;
const uint8_t vui = x[ib].qs[iqs];
v.x = vui & 0xF;
v.y = vui >> 4;
const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
const vec2_t<dst_t> m2 = make_vec2_t<dst_t>(m, m);
v = v * d2 + m2;
}
template<typename dst_t>
static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
const block_q5_0 * x = (const block_q5_0 *) vx;
const dst_t d = x[ib].d;
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));
const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
const vec2_t<dst_t> off2 = make_vec2_t<dst_t>(16, 16);
const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
v = (v - off2) * d2;
}
template<typename dst_t>
static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
const block_q5_1 * x = (const block_q5_1 *) vx;
const dst_t d = x[ib].d;
const dst_t m = x[ib].m;
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));
const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
const vec2_t<dst_t> m2 = make_vec2_t<dst_t>(m, m);
v = v * d2 + m2;
}
template<typename dst_t>
static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
const block_q8_0 * x = (const block_q8_0 *) vx;
const dst_t d = x[ib].d;
v.x = x[ib].qs[iqs + 0];
v.y = x[ib].qs[iqs + 1];
const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
v = v * d2;
}
//================================== k-quants
static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
const int i = blockIdx.x;
const int tid = threadIdx.x;
const int n = tid/32;
const int l = tid - 32*n;
const int is = 8*n + l/16;
const block_q2_K * x = (const block_q2_K *) vx;
const uint8_t q = x[i].qs[32*n + l];
float * y = yy + i*QK_K + 128*n;
float dall = x[i].d;
float dmin = x[i].dmin;
y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
}
static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
const block_q2_K * x = (const block_q2_K *) vx;
// if n is 0, we want to do the lower 128, else the upper 128,
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
// y[l+16], y[l+48], y[l+80], y[l+112]
int n = iqs/128; // 0 or 1
int r = iqs - 128*n; // 0...120 in steps of 8
int l = r/8; // 0...15 in steps of 1
const float * y = yy + 128*n + l;
const uint8_t * q = x[ib].qs + 32*n + l;
const uint8_t * s = x[ib].scales + 8*n;
const float dall = x[ib].d;
const float dmin = x[ib].dmin;
float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
+ y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
+ y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
+ y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
+ y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
+ y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
+ y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
+ y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
result = sum;
}
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
int r = threadIdx.x/4;
int i = blockIdx.x;
int tid = r/2;
int is0 = r%2;
int l0 = 16*is0 + 4*(threadIdx.x%4);
int n = tid / 4;
int j = tid - 4*n;
const block_q3_K * x = (const block_q3_K *) vx;
uint8_t m = 1 << (4*n + j);
int is = 8*n + 2*j + is0;
int shift = 2*j;
int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
(x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
float d_all = x[i].d;
float dl = d_all * (us - 32);
float * y = yy + i*QK_K + 128*n + 32*j;
const uint8_t * q = x[i].qs + 32*n;
const uint8_t * hm = x[i].hmask;
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
}
static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
const block_q3_K * x = (const block_q3_K *) vx;
const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;
uint32_t aux[3];
uint32_t utmp[4];
// if n is 0, we want to do the lower 128, else the upper 128,
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
// y[l+16], y[l+48], y[l+80], y[l+112]
int n = iqs/128; // 0 or 1
int r = iqs - 128*n; // 0...120 in steps of 8
int l = r/8; // 0...15 in steps of 1
const float * y = yy + 128*n + l;
const uint8_t * q = x[ib].qs + 32*n + l;
const uint8_t * hm = x[ib].hmask + l;
const int8_t * s = (const int8_t *)utmp + 8*n;
memcpy(aux, x[ib].scales, 12);
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
const float dall = x[ib].d;
const uint8_t m = 1 << (4*n);
float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
+ y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
+ y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
+ y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
+ y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
+ y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
+ y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
+ y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
result = sum * dall;
}
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
if (j < 4) {
d = q[j] & 63; m = q[j + 4] & 63;
} else {
d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
}
}
static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
const block_q4_K * x = (const block_q4_K *) vx;
const int i = blockIdx.x;
//// assume 64 threads - this is very slightly better than the one below
//const int tid = threadIdx.x;
//const int il = tid/16;
//const int ir = tid%16;
//const int is = 2*il;
//const int n = 2;
// assume 32 threads
const int tid = threadIdx.x;
const int il = tid/8;
const int ir = tid%8;
const int is = 2*il;
const int n = 4;
float * y = yy + i*QK_K + 64*il + n*ir;
const float dall = x[i].d;
const float dmin = x[i].dmin;
const uint8_t * q = x[i].qs + 32*il + n*ir;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[i].scales, sc, m);
const float d1 = dall * sc; const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[i].scales, sc, m);
const float d2 = dall * sc; const float m2 = dmin * m;
for (int l = 0; l < n; ++l) {
y[l + 0] = d1 * (q[l] & 0xF) - m1;
y[l +32] = d2 * (q[l] >> 4) - m2;
}
}
static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
const block_q4_K * x = (const block_q4_K *) vx;
// iqs is in 0...248 in steps of 8 =>
const int j = iqs / 64; // j is in 0...3
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
const int is = 2*j; // is is in 0...6 in steps of 2
const float * y = yy + 64*j + ir;
const uint8_t * q = x[ib].qs + 32*j + ir;
const float dall = x[ib].d;
const float dmin = x[ib].dmin;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[ib].scales, sc, m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[ib].scales, sc, m);
const float d2 = dall * sc;
const float m2 = dmin * m;
float sum = 0;
for (int k = 0; k < 4; ++k) {
sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
}
result = sum;
}
static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
const block_q5_K * x = (const block_q5_K *) vx;
const int i = blockIdx.x;
// assume 64 threads - this is very slightly better than the one below
const int tid = threadIdx.x;
const int il = tid/16; // il is in 0...3
const int ir = tid%16; // ir is in 0...15
const int is = 2*il; // is is in 0...6
float * y = yy + i*QK_K + 64*il + 2*ir;
const float dall = x[i].d;
const float dmin = x[i].dmin;
const uint8_t * ql = x[i].qs + 32*il + 2*ir;
const uint8_t * qh = x[i].qh + 2*ir;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[i].scales, sc, m);
const float d1 = dall * sc; const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[i].scales, sc, m);
const float d2 = dall * sc; const float m2 = dmin * m;
uint8_t hm = 1 << (2*il);
y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
hm <<= 1;
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
}
static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
const block_q5_K * x = (const block_q5_K *) vx;
// iqs is in 0...248 in steps of 8 =>
const int j = iqs / 64; // j is in 0...3
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
const int is = 2*j; // is is in 0...6 in steps of 2
const float * y = yy + 64*j + ir;
const uint8_t * ql = x[ib].qs + 32*j + ir;
const uint8_t * qh = x[ib].qh + ir;
const float dall = x[ib].d;
const float dmin = x[ib].dmin;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[ib].scales, sc, m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[ib].scales, sc, m);
const float d2 = dall * sc;
const float m2 = dmin * m;
uint8_t hm = 1 << is;
float sum = 0;
for (int k = 0; k < 4; ++k) {
sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
}
hm <<= 1;
for (int k = 0; k < 4; ++k) {
sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
}
result = sum;
}
template<typename dst_t>
static __global__ void dequantize_block_q6_K(const void * vx, dst_t * yy) {
const block_q6_K * x = (const block_q6_K *) vx;
const int i = blockIdx.x;
// assume 64 threads - this is very slightly better than the one below
const int tid = threadIdx.x;
const int ip = tid/32; // ip is 0 or 1
const int il = tid - 32*ip; // 0...32
const int is = 8*ip + il/16;
// TODO: fp16 compute
dst_t * y = yy + i*QK_K + 128*ip + il;
const float d = x[i].d;
const uint8_t * ql = x[i].ql + 64*ip + il;
const uint8_t qh = x[i].qh[32*ip + il];
const int8_t * sc = x[i].scales + is;
y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
}
template<typename src1_t, typename dst_t>
static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const src1_t * yy, dst_t * dst, const int ncols, int nrows) {
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const block_q6_K * x = (const block_q6_K *)vx + ib0;
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...15 or 0...7
#if K_QUANTS_PER_ITERATION == 1
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
const int is = 0;
#else
const int l0 = 4 * in; // 0, 4, 8, ..., 28
const int is = in / 4;
#endif
const int ql_offset = 64*im + l0;
const int qh_offset = 32*im + l0;
const int s_offset = 8*im + is;
const int y_offset = 128*im + l0;
dst_t tmp = 0; // partial sum for thread in warp
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
const src1_t * y = yy + i * QK_K + y_offset;
const uint8_t * ql = x[i].ql + ql_offset;
const uint8_t * qh = x[i].qh + qh_offset;
const int8_t * s = x[i].scales + s_offset;
const dst_t d = x[i].d;
#if K_QUANTS_PER_ITERATION == 1
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+ y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+ y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+ y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
tmp += sum;
#else
dst_t sum = 0;
for (int l = 0; l < 4; ++l) {
sum += (dst_t)y[l+ 0] * (dst_t)s[0] * d * (dst_t)((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+ (dst_t)y[l+32] * (dst_t)s[2] * d * (dst_t)((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+ (dst_t)y[l+64] * (dst_t)s[4] * d * (dst_t)((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+ (dst_t)y[l+96] * (dst_t)s[6] * d * (dst_t)((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
}
tmp += sum;
#endif
}
// sum up partial sums and write back result
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp;
}
}
template <typename dst_t, int qk, int qr, dequantize_kernel_t<dst_t> dequantize_kernel>
static __global__ void dequantize_block(const void * vx, dst_t * y, const int k) {
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
if (i >= k) {
return;
}
const int ib = i/qk; // block index
const int iqs = (i%qk)/qr; // quant index
const int iybs = i - i%qk; // y block start index
const int y_offset = qr == 1 ? 1 : qk/2;
// dequantize
vec2_t<dst_t> v;
dequantize_kernel(vx, ib, iqs, v);
y[iybs + iqs + 0] = v.x;
y[iybs + iqs + y_offset] = v.y;
}
template<typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
int vi;
memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
// subtract 8 from each quantized value
const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
// SIMD dot product of quantized values
int sumi = __dp4a(vi0, ui0, 0);
sumi = __dp4a(vi1, ui1, sumi);
return sumi*d;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}
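// Scalar sketch (not part of the diff) of what the __vsub4/__dp4a path above computes for one
// iqs step: 8 q4_0 nibbles (with the implicit -8 offset) times the matching 8 q8_1 int8
// values, scaled by the product of the two block deltas.
static __host__ __device__ float vec_dot_q4_0_q8_1_ref(const block_q4_0 * x, const block_q8_1 * y, const int iqs) {
    int sumi = 0;
    for (int k = 0; k < (int) sizeof(int); ++k) {
        const uint8_t vui = x->qs[sizeof(int)*iqs + k];
        sumi += ((vui & 0xF) - 8) * y->qs[sizeof(int)*iqs + k];            // lower nibbles vs ui0
        sumi += ((vui >>  4) - 8) * y->qs[sizeof(int)*(iqs + QI4_0) + k];  // upper nibbles vs ui1
    }
    return sumi * __half2float(x->d) * __half2float(y->d);
}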
template<typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
const float m = bq4_1->m;
const float s = bq8_1->s;
const int vi0 = (vi >> 0) & 0x0F0F0F0F;
const int vi1 = (vi >> 4) & 0x0F0F0F0F;
// SIMD dot product of quantized values
int sumi = __dp4a(vi0, ui0, 0);
sumi = __dp4a(vi1, ui1, sumi);
return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}
template<typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
int qs;
memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
return sumi*d;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}
template<typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
const float m = bq5_1->m;
const float s = bq8_1->s;
int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}
template<typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
int vi;
memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
// SIMD dot product of quantized values
int sumi = __dp4a(vi, ui, 0);
return sumi*d;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}
template <typename dst_t, int qk, int qi, typename block_q_t, vec_dot_q_cuda_t<dst_t> vec_dot_q_cuda>
static __global__ void mul_mat_vec_q(const void * vx, const void * vy, dst_t * dst, const int ncols, const int nrows) {
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row >= nrows) {
return;
}
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = WARP_SIZE / qi;
// partial sum for each thread
float tmp = 0.0f;
const block_q_t * x = (const block_q_t *) vx;
const block_q8_1 * y = (const block_q8_1 *) vy;
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
const int iby = i + threadIdx.x / qi; // y block index
const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
tmp += (float)vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
}
// sum up partial sums and write back result
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (threadIdx.x == 0) {
dst[row] = (dst_t)tmp;
}
}
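
The reduction loop above is a warp-level butterfly: each __shfl_xor_sync round adds the partial sum of the lane whose index differs in exactly one bit, so after five rounds every lane of the warp holds the full sum. A plain C++ emulation (illustrative only, not part of the diff):

#include <cstdio>

int main() {
    float lane[32];
    for (int i = 0; i < 32; ++i) {
        lane[i] = (float) i;                      // stand-ins for the per-lane partial sums (tmp)
    }
    for (int mask = 16; mask > 0; mask >>= 1) {   // same schedule as the kernel's unrolled loop
        float next[32];
        for (int i = 0; i < 32; ++i) {
            next[i] = lane[i] + lane[i ^ mask];   // lane i reads lane i^mask, like __shfl_xor_sync
        }
        for (int i = 0; i < 32; ++i) {
            lane[i] = next[i];
        }
    }
    printf("%.0f\n", lane[0]);                    // 0+1+...+31 = 496, and every lane agrees
    return 0;
}
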
template <typename src1_t, typename dst_t, int qk, int qr, dequantize_kernel_t<dst_t> dequantize_kernel>
static __global__ void dequantize_mul_mat_vec(const void * vx, const src1_t * y, dst_t * dst, const int ncols, const int nrows) {
// qk = quantized weights per x block
// qr = number of quantized weights per data value in x block
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row >= nrows) {
return;
}
const int tid = threadIdx.x;
const int iter_stride = 2*GGML_CUDA_DMMV_X;
const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
const int y_offset = qr == 1 ? 1 : qk/2;
vec2_t<dst_t> tmp2 = make_vec2_t<dst_t>(0, 0); // partial sum for thread in warp
for (int i = 0; i < ncols; i += iter_stride) {
const int col = i + vals_per_iter*tid;
const int ib = (row*ncols + col)/qk; // x block index
const int iqs = (col%qk)/qr; // x quant index
const int iybs = col - col%qk; // y block start index
// processing >2 values per i iter is faster for fast GPUs
#pragma unroll
for (int j = 0; j < vals_per_iter; j += 2) {
// process 2 vals per j iter
// for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
// dequantize
vec2_t<dst_t> xc;
dequantize_kernel(vx, ib, iqs + j/qr, xc);
// matrix multiplication
vec2_t<dst_t> yc = make_vec2_t<dst_t>(
y[iybs + iqs + j/qr + 0],
y[iybs + iqs + j/qr + y_offset]);
tmp2 += xc * yc;
}
}
// sum up partial sums and write back result
// TODO: reducing as half2 may be faster, but requires special handling for float2
dst_t tmp = tmp2.x + tmp2.y;
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp;
}
}
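
A small worked example (plain C++, illustrative only) of the index arithmetic in dequantize_mul_mat_vec above, for q4_0-style parameters qk = 32, qr = 2:

#include <cstdio>

int main() {
    const int qk = 32, qr = 2, ncols = 4096;
    const int row = 3, col = 100;

    const int ib   = (row*ncols + col)/qk;   // x block index:          (3*4096 + 100)/32 = 387
    const int iqs  = (col % qk)/qr;          // x quant index in block: (100 % 32)/2      = 2
    const int iybs = col - col % qk;         // y block start index:    100 - 4           = 96
    printf("ib=%d iqs=%d iybs=%d\n", ib, iqs, iybs);
    return 0;
}
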
template <typename src1_t, typename dst_t, int n_thread, dot_kernel_k_t<src1_t, dst_t> dot_kernel>
static __global__ void dequantize_mul_mat_vec_k(const void * vx, const src1_t * y, dst_t * dst, const int ncols) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
const int iter_stride = QK_K;
const int vals_per_iter = iter_stride / n_thread;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
dst_t tmp = 0; // partial sum for thread in warp
for (int i = 0; i < ncols; i += iter_stride) {
const int col = i + vals_per_iter*tid;
const int ib = ib0 + col/QK_K; // x block index
const int iqs = col%QK_K; // x quant index
const int iybs = col - col%QK_K; // y block start index
dst_t v;
dot_kernel(vx, ib, iqs, y + iybs, v);
tmp += v;
}
// sum up partial sums and write back result
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp;
}
}

File diff suppressed because it is too large

ggml-cuda.h

@ -6,30 +6,15 @@
extern "C" {
#endif
#define GGML_CUDA_MAX_DEVICES 16
GGML_API void * ggml_cuda_host_malloc(size_t size);
GGML_API void ggml_cuda_host_free(void * ptr);
GGML_API void ggml_cuda_host_register(void * ptr, size_t size);
GGML_API void ggml_cuda_host_unregister(void * ptr);
void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split);
// backend API
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
GGML_API struct ggml_backend * ggml_backend_cuda_init();
// TODO: export these with GGML_API
void * ggml_cuda_host_malloc(size_t size);
void ggml_cuda_host_free(void * ptr);
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
void ggml_cuda_free_data(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
void ggml_cuda_set_main_device(int main_device);
void ggml_cuda_set_scratch_size(size_t scratch_size);
void ggml_cuda_free_scratch(void);
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#ifdef __cplusplus
}
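
A minimal usage sketch for the pinned-memory helpers declared above (plain C++; only the prototypes shown in this header are assumed, presumably thin wrappers over cudaMallocHost/cudaHostRegister, and the sizes are illustrative):

#include <cstddef>
#include <cstdlib>
#include <cstring>

extern "C" void * ggml_cuda_host_malloc(size_t size);
extern "C" void   ggml_cuda_host_free(void * ptr);
extern "C" void   ggml_cuda_host_register(void * ptr, size_t size);
extern "C" void   ggml_cuda_host_unregister(void * ptr);

static void pinned_memory_example(void) {
    const size_t n = 16u * 1024 * 1024;

    // option 1: allocate pinned host memory directly (may return NULL on failure)
    void * staging = ggml_cuda_host_malloc(n);
    if (staging) {
        memset(staging, 0, n);
        ggml_cuda_host_free(staging);
    }

    // option 2: pin an existing allocation for the duration of host<->device transfers
    void * buf = malloc(n);
    if (buf) {
        ggml_cuda_host_register(buf, n);
        // ... use buf as a transfer buffer ...
        ggml_cuda_host_unregister(buf);
        free(buf);
    }
}
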

ggml.c (558 changed lines)

File diff suppressed because it is too large

ggml.h (80 changed lines)

@ -199,6 +199,7 @@
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6
#define GGML_MAX_NAME 48
#define GGML_MAX_OP_PARAMS 32
#define GGML_DEFAULT_N_THREADS 4
@ -285,12 +286,6 @@ extern "C" {
GGML_TYPE_COUNT,
};
enum ggml_backend {
GGML_BACKEND_CPU = 0,
GGML_BACKEND_GPU = 10,
GGML_BACKEND_GPU_SPLIT = 20,
};
// model file types
enum ggml_ftype {
GGML_FTYPE_UNKNOWN = -1,
@ -405,8 +400,9 @@ extern "C" {
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
enum ggml_backend backend;
struct ggml_backend * backend;
enum ggml_type type;
int n_dims;
int64_t ne[GGML_MAX_DIMS]; // number of elements
@ -418,11 +414,18 @@ extern "C" {
// compute data
enum ggml_op op;
// op params - allocated as int32_t for alignment
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
bool is_param;
struct ggml_tensor * grad;
struct ggml_tensor * src[GGML_MAX_SRC];
bool visited; // used to build graphs
int n_children; // used by the allocator
int n_views;
// performance
int perf_runs;
int64_t perf_cycles;
@ -430,11 +433,11 @@ extern "C" {
void * data;
char name[GGML_MAX_NAME];
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[8];
char name[GGML_MAX_NAME];
char padding[12];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -459,6 +462,7 @@ extern "C" {
struct ggml_cgraph {
int n_nodes;
int n_leafs;
bool closed;
struct ggml_tensor * nodes[GGML_MAX_NODES];
struct ggml_tensor * grads[GGML_MAX_NODES];
@ -470,23 +474,21 @@ extern "C" {
int64_t perf_time_us;
};
// scratch buffer
struct ggml_scratch {
size_t offs;
size_t size;
void * data;
enum ggml_alloc_mode {
GGML_ALLOC_NONE, // do not allocate tensors
GGML_ALLOC_IMMEDIATE, // allocate tensors immediately
GGML_ALLOC_COMPUTE_SEQ, // delay allocation until graph build time, allocate tensors for sequential graph computation
//GGML_ALLOC_COMPUTE_PAR, // allocate tensors for parallel graph computation
};
// context parameters
struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
struct ggml_buffer * buffer;
enum ggml_alloc_mode alloc_mode; // tensor allocation mode
enum ggml_type compute_type; // type of intermediate results
};
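
A hedged sketch of how a context might be set up with the reworked parameters above (the function that creates a ggml_buffer is outside this excerpt, so the buffer argument below is a placeholder handed in by the caller):

#include "ggml.h"

static struct ggml_context * make_compute_ctx(struct ggml_buffer * buffer) {
    struct ggml_init_params params = ggml_init_params_default(); // declared further down in this header
    params.buffer       = buffer;                  // backend buffer holding the tensor data
    params.alloc_mode   = GGML_ALLOC_COMPUTE_SEQ;  // defer allocation to graph build time
    params.compute_type = GGML_TYPE_F32;           // type of intermediate results
    return ggml_init(params);
}
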
// compute types
// task types
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
enum ggml_task_type {
@ -547,19 +549,20 @@ extern "C" {
GGML_API size_t ggml_tensor_overhead(void);
// main
GGML_API struct ggml_init_params ggml_init_params_default(void);
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
GGML_API void ggml_free(struct ggml_context * ctx);
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
GGML_API void ggml_free(struct ggml_context * ctx);
GGML_API void ggml_set_alloc_mode(struct ggml_context * ctx, enum ggml_alloc_mode mode);
// TODO: update for ggml_buffer
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
GGML_API struct ggml_buffer * ggml_get_buffer(const struct ggml_context * ctx);
GGML_API struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
@ -1121,6 +1124,17 @@ extern "C" {
int mode,
int n_ctx);
// custom RoPE
GGML_API struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float freq_base,
float freq_scale,
int n_ctx);
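
For reference, a hedged call-site sketch for the new ggml_rope_custom declared above; the values are illustrative (freq_base 10000 is the conventional RoPE theta, and freq_scale 0.5 corresponds to squeezing a 2x longer context into the same positions via linear scaling):

#include "ggml.h"

static struct ggml_tensor * rope_linear_scaled(
        struct ggml_context * ctx, struct ggml_tensor * cur,
        int n_past, int n_rot, int n_ctx) {
    return ggml_rope_custom(ctx, cur, n_past, n_rot, /*mode=*/0,
                            /*freq_base=*/10000.0f, /*freq_scale=*/0.5f, n_ctx);
}
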
// custom RoPE, in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx,
@ -1347,6 +1361,8 @@ extern "C" {
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
GGML_API void ggml_graph_close (struct ggml_cgraph * cgraph);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
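
The comment above implies the following calling pattern (a hedged sketch; the matching ggml_graph_compute() declaration is outside this excerpt, so its exact signature is assumed):

#include "ggml.h"
#include <cstdint>
#include <vector>

static void compute_graph(struct ggml_cgraph * gf, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);

    std::vector<uint8_t> work;
    if (plan.work_size > 0) {          // the caller owns the work buffer
        work.resize(plan.work_size);
        plan.work_data = work.data();
    }

    ggml_graph_compute(gf, &plan);     // assumed signature: (cgraph, cplan)
}
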
@ -1561,9 +1577,8 @@ extern "C" {
GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void);
GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cublas (void);
GGML_API int ggml_cpu_has_cuda (void);
GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_vsx (void);
@ -1594,3 +1609,6 @@ extern "C" {
#ifdef __cplusplus
}
#endif
#include "ggml-backend.h"

llama-util.h

@ -203,6 +203,17 @@ struct llama_mmap {
}
}
void discard(void * addr, size_t len) {
// align to the page size
int page_size = sysconf(_SC_PAGESIZE);
addr = (void *) (((uintptr_t) addr) & ~(page_size - 1));
len = (len + page_size - 1) & ~(page_size - 1);
if (madvise(addr, len, MADV_DONTNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_DONTNEED) failed: %s\n",
strerror(errno));
}
}
~llama_mmap() {
munmap(addr, size);
}
@ -247,6 +258,10 @@ struct llama_mmap {
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
}
void discard(void * addr, size_t len) {
VirtualAlloc(addr, len, MEM_RESET, PAGE_NOACCESS);
}
~llama_mmap() {
if (!UnmapViewOfFile(addr)) {
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
@ -262,6 +277,13 @@ struct llama_mmap {
throw std::runtime_error(std::string("mmap not supported"));
}
void discard(void * addr, size_t len) {
(void) addr;
(void) len;
throw std::runtime_error(std::string("mmap not supported"));
}
#endif
};
@ -419,28 +441,13 @@ struct llama_buffer {
llama_buffer() = default;
void resize(size_t len) {
#ifdef GGML_USE_METAL
free(addr);
int result = posix_memalign((void **) &addr, getpagesize(), len);
if (result == 0) {
memset(addr, 0, len);
}
else {
addr = NULL;
}
#else
delete[] addr;
addr = new uint8_t[len];
#endif
size = len;
}
~llama_buffer() {
#ifdef GGML_USE_METAL
free(addr);
#else
delete[] addr;
#endif
addr = NULL;
}
@ -451,54 +458,4 @@ struct llama_buffer {
llama_buffer& operator=(llama_buffer&&) = delete;
};
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
bool is_cuda;
size_t size = 0;
llama_ctx_buffer() = default;
void resize(size_t size) {
free();
addr = (uint8_t *) ggml_cuda_host_malloc(size);
if (addr) {
is_cuda = true;
}
else {
// fall back to pageable memory
addr = new uint8_t[size];
is_cuda = false;
}
this->size = size;
}
void free() {
if (addr) {
if (is_cuda) {
ggml_cuda_host_free(addr);
}
else {
delete[] addr;
}
}
addr = NULL;
}
~llama_ctx_buffer() {
free();
}
// disable copy and move
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
};
#else
typedef llama_buffer llama_ctx_buffer;
#endif
#endif

llama.cpp (1634 changed lines)

File diff suppressed because it is too large

llama.h

@ -2,12 +2,7 @@
#define LLAMA_H
#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
@ -48,7 +43,7 @@
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
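
A hedged example of how the macro above is typically consumed by client code (n_gpu_layers is the pre-existing llama_context_params field for layer offloading; the value is illustrative):

#include "llama.h"

static struct llama_context_params make_params(void) {
    struct llama_context_params params = llama_context_default_params();
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    params.n_gpu_layers = 32;   // offload up to 32 layers when GPU support was compiled in
#endif
    return params;
}
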