diff --git a/Makefile b/Makefile index 32598edfe..f01bf0c83 100644 --- a/Makefile +++ b/Makefile @@ -253,11 +253,6 @@ ifdef LLAMA_CUDA_KQUANTS_ITER else NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif -ifdef LLAMA_CUDA_MMQ_Y - NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) -else - NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64 -endif # LLAMA_CUDA_MMQ_Y #ifdef LLAMA_CUDA_CUBLAS # NVCCFLAGS += -DGGML_CUDA_CUBLAS #endif # LLAMA_CUDA_CUBLAS diff --git a/README.md b/README.md index 2ece294b7..6900b1152 100644 --- a/README.md +++ b/README.md @@ -406,7 +406,6 @@ Building the program with BLAS support may lead to some performance improvements ---> | Option | Legal values | Default | Description | |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. | | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | diff --git a/ggml-alloc.c b/ggml-alloc.c index 5e1be61ff..4121f3dba 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -394,6 +394,14 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) if (parent == NULL) { break; } + + // if the node's data is external, then we cannot re-use it + if ((char *) parent->data < (char *) alloc->data || + (char *) parent->data >= ((char *) alloc->data + alloc->size)) { + AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); + continue; + } + struct hash_node * p_hn = hash_get(ht, parent); if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { if (ggml_is_view(parent)) { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9d42efb0d..6390b1158 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -14,6 +14,7 @@ #include "ggml.h" #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define CC_TURING 700 #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -262,10 +263,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_QUANTIZE_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 -#ifndef GGML_CUDA_MMQ_Y -#define GGML_CUDA_MMQ_Y 64 -#endif // GGML_CUDA_MMQ_Y - // dmmv = dequantize_mul_mat_vec #ifndef GGML_CUDA_DMMV_X #define GGML_CUDA_DMMV_X 32 @@ -285,6 +282,20 @@ struct ggml_tensor_extra_gpu { cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs }; +static int g_device_count = -1; +static int g_main_device = 0; +static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; +static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; +static bool g_mul_mat_q = false; + +static void * g_scratch_buffer = nullptr; +static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default +static size_t g_scratch_offset = 0; + +static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; + +static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr }; + static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -1549,8 +1560,8 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp #else const float2 dm8f = __half22float2(dm8); const float2 ds8f = __half22float2(ds8); - const float d8d8 = dm8.x * ds8.x; - const float m8s8 = dm8.y * ds8.y; + const float d8d8 = dm8f.x * ds8f.x; + const float m8s8 = dm8f.y * ds8f.y; #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it @@ -1884,21 +1895,21 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q4_0( +template static __device__ __forceinline__ void load_tiles_q4_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -1910,7 +1921,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -1920,39 +1931,30 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; } -// const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; -// const int kbxd = k % blocks_per_tile_x_row; + const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = k % blocks_per_tile_x_row; -// #pragma unroll -// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) { -// FIXME out-of-bounds -// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; -// if (i >= GGML_CUDA_MMQ_Y) { -// return; -// } + if (need_check) { + i = min(i, i_max); + } -// const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; -// x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d; -// } + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } } static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_0_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const float * x_dmf = (float *) x_dm; @@ -1960,13 +1962,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; } return vec_dot_q4_0_q8_1_impl (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], - y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q4_1_q8_1( @@ -1987,21 +1989,21 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; *x_ql = tile_x_qs; *x_dm = tile_x_dm; } -template static __device__ __forceinline__ void load_tiles_q4_1( +template static __device__ __forceinline__ void load_tiles_q4_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2011,7 +2013,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_1 * bx0 = (block_q4_1 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2027,7 +2029,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; if (need_check) { @@ -2044,27 +2046,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_1_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); int u[2*VDR_Q4_1_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; } return vec_dot_q4_1_q8_1_impl (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], - y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_0_q8_1( @@ -2087,21 +2081,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; *x_ql = tile_x_ql; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q5_0( +template static __device__ __forceinline__ void load_tiles_q5_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2111,7 +2105,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_0 * bx0 = (block_q5_0 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2147,7 +2141,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; if (need_check) { @@ -2164,14 +2158,6 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q5_0_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; const float * x_dmf = (const float *) x_dm; @@ -2181,12 +2167,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; } return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_1_q8_1( @@ -2209,21 +2195,21 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; } -template static __device__ __forceinline__ void load_tiles_q5_1( +template static __device__ __forceinline__ void load_tiles_q5_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2233,7 +2219,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_1 * bx0 = (block_q5_1 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2266,7 +2252,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; if (need_check) { @@ -2283,14 +2269,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q5_1_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; @@ -2298,12 +2276,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; } return vec_dot_q8_1_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q8_0_q8_1( @@ -2323,21 +2301,21 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, bq8_1->ds.x); } -static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q8_0( +template static __device__ __forceinline__ void load_tiles_q8_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2348,7 +2326,7 @@ template static __device__ __forceinline__ void load_tiles_q8_ const block_q8_0 * bx0 = (block_q8_0 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2358,41 +2336,29 @@ template static __device__ __forceinline__ void load_tiles_q8_ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); - x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d; } -// const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; -// const int kbxd = k % blocks_per_tile_x_row; + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = k % blocks_per_tile_x_row; -// #pragma unroll -// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) { -// FIXME out-of-bounds -// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; -// #if GGML_CUDA_MMQ_Y < 64 -// if (i >= GGML_CUDA_MMQ_Y) { -// return; -// } -// #endif // GGML_CUDA_MMQ_Y < 64 + if (need_check) { + i = min(i, i_max); + } -// const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; -// x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d; -// } + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } } static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q8_0_Q8_1_MMQ == 0); - const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; @@ -2424,23 +2390,23 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); } -static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q2_K( +template static __device__ __forceinline__ void load_tiles_q2_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2450,7 +2416,7 @@ template static __device__ __forceinline__ void load_tiles_q2_ const block_q2_K * bx0 = (block_q2_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2466,8 +2432,8 @@ template static __device__ __forceinline__ void load_tiles_q2_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) { - int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2479,7 +2445,7 @@ template static __device__ __forceinline__ void load_tiles_q2_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); if (need_check) { @@ -2496,14 +2462,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q2_K_Q8_1_MMQ == 0); - const int kbx = k / QI2_K; const int ky = (k % QI2_K) * QR2_K; const float * y_df = (const float *) y_ds; @@ -2520,7 +2478,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; - const int index_y = j * (QR2_K*WARP_SIZE) + QR2_K*k; + const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); } @@ -2551,12 +2509,12 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); } -static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; + __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -2564,12 +2522,12 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q3_K( +template static __device__ __forceinline__ void load_tiles_q3_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2579,7 +2537,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ const block_q3_K * bx0 = (block_q3_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2596,8 +2554,8 @@ template static __device__ __forceinline__ void load_tiles_q3_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) { - int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2609,7 +2567,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); if (need_check) { @@ -2623,7 +2581,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); if (need_check) { @@ -2652,14 +2610,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q3_K_Q8_1_MMQ == 0); - const int kbx = k / QI3_K; const int ky = (k % QI3_K) * QR3_K; const float * x_dmf = (const float *) x_dm; @@ -2681,7 +2631,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( v[l] = __vsubss4(vll, vlh); } - const int index_y = j * (QR3_K*WARP_SIZE) + k*QR3_K; + const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); } @@ -2778,23 +2728,23 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( #endif } -static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q4_K( +template static __device__ __forceinline__ void load_tiles_q4_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2804,7 +2754,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_K * bx0 = (block_q4_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2820,8 +2770,8 @@ template static __device__ __forceinline__ void load_tiles_q4_ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) { - int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2833,8 +2783,8 @@ template static __device__ __forceinline__ void load_tiles_q4_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2858,14 +2808,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_K_Q8_1_MMQ == 0); - int v[QR4_K*VDR_Q4_K_Q8_1_MMQ]; #pragma unroll @@ -2876,7 +2818,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); - const int index_y = j * (QR4_K*WARP_SIZE) + QR4_K*k; + const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); } @@ -2969,23 +2911,23 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( #endif } -static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q5_K( +template static __device__ __forceinline__ void load_tiles_q5_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2995,7 +2937,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_K * bx0 = (block_q5_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -3024,8 +2966,8 @@ template static __device__ __forceinline__ void load_tiles_q5_ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) { - int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -3037,8 +2979,8 @@ template static __device__ __forceinline__ void load_tiles_q5_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; if (need_check) { i = min(i, i_max); @@ -3062,18 +3004,10 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q5_K_Q8_1_MMQ == 0); - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); - const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; - const int index_y = j * (QR5_K*WARP_SIZE) + QR5_K*k; + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); } @@ -3103,23 +3037,23 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); } -static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q6_K( +template static __device__ __forceinline__ void load_tiles_q6_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -3129,7 +3063,7 @@ template static __device__ __forceinline__ void load_tiles_q6_ const block_q6_K * bx0 = (block_q6_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -3159,8 +3093,8 @@ template static __device__ __forceinline__ void load_tiles_q6_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) { - int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { + int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -3172,8 +3106,8 @@ template static __device__ __forceinline__ void load_tiles_q6_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; if (need_check) { i = min(i, i_max); @@ -3189,25 +3123,17 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q6_K_Q8_1_MMQ == 0); - const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); - const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; - const int index_y = j * (QR6_K*WARP_SIZE) + QR6_K*k; + const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; + const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); } -template static __global__ void mul_mat_q( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, @@ -3222,14 +3148,11 @@ static __global__ void mul_mat_q( const int & ncols_dst = ncols_y; - const int tid_x = threadIdx.x; - const int tid_y = threadIdx.y; - - const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y; + const int row_dst_0 = blockIdx.x*mmq_y; const int & row_x_0 = row_dst_0; - const int row_dst = row_dst_0 + tid_x; + const int row_dst = row_dst_0 + threadIdx.x; - const int col_dst_0 = blockIdx.y*WARP_SIZE; + const int col_dst_0 = blockIdx.y*mmq_x; const int & col_y_0 = col_dst_0; int * tile_x_ql = nullptr; @@ -3239,64 +3162,65 @@ static __global__ void mul_mat_q( allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1; + __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; + __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; - __shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)]; - __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col]; - - float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f}; + float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f}; for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, - tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x); + threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x); +#pragma unroll for (int ir = 0; ir < qr; ++ir) { - const int kqs = ir*WARP_SIZE + tid_x; + const int kqs = ir*WARP_SIZE + threadIdx.x; const int kbxd = kqs / QI8_1; - for (int i = 0; i < WARP_SIZE; i += 8) { - const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses +#pragma unroll + for (int i = 0; i < mmq_x; i += nwarps) { + const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; - tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1); + const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE; + tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); } - } - for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) { - const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE; - const int kby = tid_x % blocks_per_tile_y_col; - const int col_y_eff = min(col_y_0 + ids, ncols_y-1); - - // if the sum is not needed it's faster to transform the scale to f32 ahead of time - const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds; - half2 * dsi_dst = &tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby]; - if (need_sum) { - *dsi_dst = *dsi_src; - } else { - float * dfi_dst = (float *) dsi_dst; - *dfi_dst = (*dsi_src).x; - } - } - - __syncthreads(); - -#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal #pragma unroll -#endif // __CUDA_ARCH__ >= 700 - for (int k = 0; k < WARP_SIZE; k += vdr) { -#pragma unroll - for (int j = 0; j < WARP_SIZE; j += 8) { -#pragma unroll - for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) { - sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, - tid_x + i, tid_y + j, k); + for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { + const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; + const int kby = threadIdx.x % (WARP_SIZE/QI8_1); + const int col_y_eff = min(col_y_0 + ids, ncols_y-1); + + // if the sum is not needed it's faster to transform the scale to f32 ahead of time + const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds; + half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; + if (need_sum) { + *dsi_dst = *dsi_src; + } else { + float * dfi_dst = (float *) dsi_dst; + *dfi_dst = (*dsi_src).x; } } - } - __syncthreads(); + __syncthreads(); + +// #pragma unroll // unrolling this loop causes too much register pressure + for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + sum[i/WARP_SIZE][j/nwarps] += vec_dot( + tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, + threadIdx.x + i, threadIdx.y + j, k); + } + } + } + + __syncthreads(); + } } @@ -3304,15 +3228,15 @@ static __global__ void mul_mat_q( return; } - for (int j = 0; j < WARP_SIZE; j += 8) { - const int col_dst = col_dst_0 + j + tid_y; + for (int j = 0; j < mmq_x; j += nwarps) { + const int col_dst = col_dst_0 + j + threadIdx.y; if (col_dst >= ncols_dst) { return; } - for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) { - dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8]; + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps]; } } } @@ -4014,17 +3938,52 @@ static void ggml_mul_mat_q4_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (compute_capability >= CC_TURING) { + const int mmq_x = 64; + const int mmq_y = 128; + const int nwarps = 4; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } else { - mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const int mmq_x = 64; + const int mmq_y = 64; + const int nwarps = 4; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } } @@ -4032,17 +3991,53 @@ static void ggml_mul_mat_q4_1_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (compute_capability >= CC_TURING) { + const int mmq_x = 64; + const int mmq_y = 128; + const int nwarps = 4; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } else { - mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const int mmq_x = 64; + const int mmq_y = 64; + const int nwarps = 8; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } + } } @@ -4050,17 +4045,52 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (compute_capability >= CC_TURING) { + const int mmq_x = 128; + const int mmq_y = 64; + const int nwarps = 4; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } else { - mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const int mmq_x = 64; + const int mmq_y = 64; + const int nwarps = 8; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } } @@ -4068,17 +4098,52 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (compute_capability >= CC_TURING) { + const int mmq_x = 128; + const int mmq_y = 64; + const int nwarps = 8; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } else { - mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const int mmq_x = 64; + const int mmq_y = 64; + const int nwarps = 8; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } } @@ -4086,17 +4151,52 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (compute_capability >= CC_TURING) { + const int mmq_x = 128; + const int mmq_y = 64; + const int nwarps = 4; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } else { - mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const int mmq_x = 64; + const int mmq_y = 64; + const int nwarps = 8; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } } @@ -4104,17 +4204,52 @@ static void ggml_mul_mat_q2_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (compute_capability >= CC_TURING) { + const int mmq_x = 64; + const int mmq_y = 128; + const int nwarps = 4; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } else { - mul_mat_q, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const int mmq_x = 64; + const int mmq_y = 64; + const int nwarps = 8; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } } @@ -4122,17 +4257,52 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (compute_capability >= CC_TURING) { + const int mmq_x = 128; + const int mmq_y = 128; + const int nwarps = 4; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } else { - mul_mat_q, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const int mmq_x = 64; + const int mmq_y = 64; + const int nwarps = 8; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } } @@ -4140,17 +4310,52 @@ static void ggml_mul_mat_q4_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (compute_capability >= CC_TURING) { + const int mmq_x = 64; + const int mmq_y = 128; + const int nwarps = 4; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } else { - mul_mat_q, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const int mmq_x = 32; + const int mmq_y = 64; + const int nwarps = 8; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } } @@ -4158,17 +4363,52 @@ static void ggml_mul_mat_q5_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (compute_capability >= CC_TURING) { + const int mmq_x = 64; + const int mmq_y = 128; + const int nwarps = 4; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } else { - mul_mat_q, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const int mmq_x = 64; + const int mmq_y = 64; + const int nwarps = 8; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } } @@ -4176,17 +4416,52 @@ static void ggml_mul_mat_q6_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (compute_capability >= CC_TURING) { + const int mmq_x = 64; + const int mmq_y = 64; + const int nwarps = 4; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } else { - mul_mat_q, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const int mmq_x = 32; + const int mmq_y = 64; + const int nwarps = 8; + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } } } @@ -4361,20 +4636,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) { } -static void * g_scratch_buffer = nullptr; -static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default -static size_t g_scratch_offset = 0; - -static int g_device_count = -1; -static int g_main_device = 0; -static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; -static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; -static bool g_mul_mat_q = false; - -static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; - -static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr }; - void ggml_init_cublas() { static bool initialized = false; @@ -4730,6 +4991,37 @@ inline void ggml_cuda_op_mul_mat_q( (void) i1; } +static int64_t get_row_rounding(ggml_type type) { + int max_compute_capability = INT_MIN; + for (int id = 0; id < g_device_count; ++id) { + if (max_compute_capability < g_compute_capabilities[id] + && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + max_compute_capability = g_compute_capabilities[id]; + } + } + + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return max_compute_capability >= CC_TURING ? 128 : 64; + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 64; + case GGML_TYPE_F16: + return 1; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + return max_compute_capability >= CC_TURING ? 128 : 64; + case GGML_TYPE_Q6_K: + return 64; + default: + GGML_ASSERT(false); + } +} + inline void ggml_cuda_op_mul_mat_vec( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, @@ -5130,14 +5422,16 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm int64_t row_low, row_high; if (split) { + const int64_t rounding = get_row_rounding(src0->type); + row_low = id == 0 ? 0 : nrows0*g_tensor_split[id]; - row_low -= row_low % GGML_CUDA_MMQ_Y; + row_low -= row_low % rounding; if (id == g_device_count - 1) { row_high = nrows0; } else { row_high = nrows0*g_tensor_split[id + 1]; - row_high -= row_high % GGML_CUDA_MMQ_Y; + row_high -= row_high % rounding; } } else { row_low = 0; @@ -5616,14 +5910,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { row_low = 0; row_high = nrows; } else if (backend == GGML_BACKEND_GPU_SPLIT) { + const int64_t rounding = get_row_rounding(tensor->type); + row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; - row_low -= row_low % GGML_CUDA_MMQ_Y; + row_low -= row_low % rounding; if (id == g_device_count - 1) { row_high = nrows; } else { row_high = nrows*g_tensor_split[id + 1]; - row_high -= row_high % GGML_CUDA_MMQ_Y; + row_high -= row_high % rounding; } } else { GGML_ASSERT(false); diff --git a/llama.cpp b/llama.cpp index fbe04958b..a27226cf6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -56,6 +56,13 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static void llama_log_internal(llama_log_level level, const char* format, ...); +static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data); +#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__) +#define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__) +#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) + + #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL) #include "ggml-alloc.h" #define LLAMA_USE_ALLOCATOR @@ -438,6 +445,14 @@ struct llama_context { } }; +struct llama_state { + // We save the log callback globally + llama_log_callback log_callback = llama_log_callback_default; + void * log_callback_user_data = nullptr; +}; +// global state +static llama_state g_state; + template static T checked_mul(T a, T b) { T ret = a * b; @@ -504,7 +519,7 @@ struct llama_file_loader { llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map) : file(fname, "rb") { - fprintf(stderr, "llama.cpp: loading model from %s\n", fname); + LLAMA_LOG_INFO("llama.cpp: loading model from %s\n", fname); read_magic(); read_hparams(); read_vocab(); @@ -619,7 +634,7 @@ struct llama_file_saver { llama_file_loader * any_file_loader; llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype) : file(fname, "wb"), any_file_loader(any_file_loader) { - fprintf(stderr, "llama.cpp: saving model to %s\n", fname); + LLAMA_LOG_INFO("llama.cpp: saving model to %s\n", fname); write_magic(); write_hparams(new_ftype); write_vocab(); @@ -640,7 +655,7 @@ struct llama_file_saver { } void write_vocab() { if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) { - fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); + LLAMA_LOG_WARN("llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); } uint32_t n_vocab = any_file_loader->hparams.n_vocab; for (uint32_t i = 0; i < n_vocab; i++) { @@ -831,7 +846,7 @@ struct llama_model_loader { uint8_t byte = lt.data[i]; sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash } - fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum, + LLAMA_LOG_INFO("%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum, llama_format_tensor_shape(lt.ne).c_str(), lt.size); } @@ -864,7 +879,7 @@ static bool kv_cache_init( cache.ctx = ggml_init(params); if (!cache.ctx) { - fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__); return false; } @@ -1076,7 +1091,7 @@ static void llama_model_load_internal( LLAMA_ASSERT(hparams.n_head % n_gqa == 0); hparams.n_head_kv = hparams.n_head / n_gqa; if (model.type == e_model::MODEL_65B && n_gqa == 8) { - fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa); + LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa); model.type = e_model::MODEL_70B; hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model } @@ -1092,22 +1107,22 @@ static void llama_model_load_internal( //const uint32_t n_ff = 28672; { - fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); - fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); - fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); - fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); - fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); - fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); - fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim - fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa()); - fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps); - fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); - fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); - fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); - fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); - fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); + LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(file_version)); + LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx); + LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_LOG_INFO("%s: n_mult = %u\n", __func__, hparams.n_mult); + LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); + LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); + LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); + LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim + LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); + LLAMA_LOG_INFO("%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps); + LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, n_ff); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); + LLAMA_LOG_INFO("%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); + LLAMA_LOG_INFO("%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { @@ -1135,7 +1150,7 @@ static void llama_model_load_internal( size_t ctx_size; size_t mmapped_size; ml->calc_sizes(&ctx_size, &mmapped_size); - fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); // create the ggml context { @@ -1160,13 +1175,13 @@ static void llama_model_load_internal( (void) main_gpu; (void) mul_mat_q; #if defined(GGML_USE_CUBLAS) - fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__); ggml_cuda_set_main_device(main_gpu); ggml_cuda_set_mul_mat_q(mul_mat_q); #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT #elif defined(GGML_USE_CLBLAST) - fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__); + LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU #else @@ -1271,14 +1286,14 @@ static void llama_model_load_internal( const size_t mem_required_state = scale*hparams.kv_size(); - fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, + LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); (void) vram_scratch; (void) n_batch; #ifdef GGML_USE_CUBLAS if (low_vram) { - fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); + LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); ggml_cuda_set_scratch_size(0); // disable scratch } else { const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type); @@ -1286,7 +1301,7 @@ static void llama_model_load_internal( vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context); ggml_cuda_set_scratch_size(vram_scratch); if (n_gpu_layers > 0) { - fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n", + LLAMA_LOG_INFO("%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n", __func__, vram_scratch_base / kB, vram_scratch_per_context, (vram_scratch + MB - 1) / MB); // round up } @@ -1296,9 +1311,9 @@ static void llama_model_load_internal( #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); - fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); + LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); if (n_gpu_layers > (int) hparams.n_layer) { - fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__); + LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); } size_t vram_kv_cache = 0; @@ -1307,17 +1322,17 @@ static void llama_model_load_internal( const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3; if (n_gpu_layers > (int) hparams.n_layer + 1) { if (low_vram) { - fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); + LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); } else { - fprintf(stderr, "%s: offloading v cache to GPU\n", __func__); + LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); vram_kv_cache += hparams.kv_size() / 2; } } if (n_gpu_layers > (int) hparams.n_layer + 2) { if (low_vram) { - fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__); + LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__); } else { - fprintf(stderr, "%s: offloading k cache to GPU\n", __func__); + LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); vram_kv_cache += hparams.kv_size() / 2; } } @@ -1326,9 +1341,9 @@ static void llama_model_load_internal( const int max_offloadable_layers = hparams.n_layer + 1; #endif // GGML_USE_CUBLAS - fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n", + LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - fprintf(stderr, "%s: total VRAM used: %zu MB\n", + LLAMA_LOG_INFO("%s: total VRAM used: %zu MB\n", __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up #else (void) n_gpu_layers; @@ -1387,7 +1402,7 @@ static bool llama_model_load( use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { - fprintf(stderr, "error loading model: %s\n", err.what()); + LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); return false; } } @@ -1751,7 +1766,7 @@ static struct ggml_cgraph * llama_build_graph( } #if 0 - printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__, + LLAMA_LOG_INFO("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__, ggml_used_mem(ctx0)/1024.0/1024.0, lctx.get_buf_max_mem(0)/1024.0/1024.0, lctx.get_buf_max_mem(1)/1024.0/1024.0, @@ -1814,7 +1829,7 @@ static bool llama_eval_internal( ggml_allocr_alloc_graph(lctx.alloc, gf); #endif - // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance @@ -2002,7 +2017,7 @@ struct llama_tokenizer { left_sym.n += right_sym.n; right_sym.n = 0; - //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size); + //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size); // remove the right sym from the chain left_sym.next = right_sym.next; @@ -3010,7 +3025,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s tensor.data = read_data.addr; model_loader->load_data_for(tensor); - printf("[%4zu/%4zu] %36s - %16s, type = %6s, ", + LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ", ++idx, model_loader->tensors_map.tensors.size(), tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), ggml_type_name(tensor.type)); @@ -3032,7 +3047,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type = tensor.type; new_data = tensor.data; new_size = tensor.size; - printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0); } else { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS @@ -3067,17 +3082,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s int nx = tensor.ne.at(0); int ny = tensor.ne.at(1); if (nx % QK_K != 0 || ny % QK_K != 0) { - fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); + LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); convert_incompatible_tensor = true; } } if (convert_incompatible_tensor) { if (tensor.name == "output.weight") { new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. - fprintf(stderr, "F16 will be used for this tensor instead.\n"); + LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n"); } else if (tensor.name == "tok_embeddings.weight") { new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. - fprintf(stderr, "Q4_0 will be used for this tensor instead.\n"); + LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); } else { throw std::runtime_error("Unsupported tensor size encountered\n"); } @@ -3097,7 +3112,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.addr; } - printf("quantizing to %s .. ", ggml_type_name(new_type)); + LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type)); fflush(stdout); work.resize(nelements * 4); // upper bound on size @@ -3147,7 +3162,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } - printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); int64_t tot_count = 0; for (size_t i = 0; i < hist_cur.size(); i++) { hist_all[i] += hist_cur[i]; @@ -3156,18 +3171,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (tot_count > 0) { for (size_t i = 0; i < hist_cur.size(); i++) { - printf("%5.3f ", hist_cur[i] / float(nelements)); + LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements)); } } - printf("\n"); + LLAMA_LOG_INFO("\n"); } total_size_org += tensor.size; total_size_new += new_size; file_saver.write_tensor(tensor, new_type, new_data, new_size); } - printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); + LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); { int64_t sum_all = 0; @@ -3176,11 +3191,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } if (sum_all > 0) { - printf("%s: hist: ", __func__); + LLAMA_LOG_INFO("%s: hist: ", __func__); for (size_t i = 0; i < hist_all.size(); i++) { - printf("%5.3f ", hist_all[i] / float(sum_all)); + LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all)); } - printf("\n"); + LLAMA_LOG_INFO("\n"); } } } @@ -3204,8 +3219,8 @@ struct llama_model * llama_load_model_from_file( params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; - fprintf(stderr, "%s: failed to load model\n", __func__); return nullptr; } @@ -3238,10 +3253,9 @@ struct llama_context * llama_new_context_with_model( unsigned percentage = (unsigned) (100 * progress); while (percentage > *cur_percentage_p) { *cur_percentage_p = percentage; - fprintf(stderr, "."); - fflush(stderr); + LLAMA_LOG_INFO("."); if (percentage >= 100) { - fprintf(stderr, "\n"); + LLAMA_LOG_INFO("\n"); } } }; @@ -3255,14 +3269,14 @@ struct llama_context * llama_new_context_with_model( // reserve memory for context buffers if (!params.vocab_only) { if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { - fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); + LLAMA_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; } { const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); - fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } const auto & hparams = ctx->model.hparams; @@ -3296,14 +3310,14 @@ struct llama_context * llama_new_context_with_model( // measure memory requirements for the graph size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; - fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); // debug - for comparison with scratch buffer //size_t prev_req = // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) + // MEM_REQ_SCRATCH1().at(ctx->model.type) + // MEM_REQ_EVAL().at(ctx->model.type); - //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); + //LLAMA_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); // recreate allocator with exact memory requirements ggml_allocr_free(ctx->alloc); @@ -3339,13 +3353,13 @@ struct llama_context * llama_new_context_with_model( const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); -#define LLAMA_METAL_CHECK_BUF(result) \ - if (!(result)) { \ - fprintf(stderr, "%s: failed to add buffer\n", __func__); \ - llama_free(ctx); \ - return NULL; \ +#define LLAMA_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ + llama_free(ctx); \ + return NULL; \ } LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); @@ -3399,19 +3413,19 @@ int llama_model_quantize( llama_model_quantize_internal(fname_inp, fname_out, params); return 0; } catch (const std::exception & err) { - fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what()); + LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what()); return 1; } } int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { - fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); + LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); const int64_t t_start_lora_us = ggml_time_us(); auto fin = std::ifstream(path_lora, std::ios::binary); if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora); + LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora); return 1; } @@ -3420,14 +3434,14 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const uint32_t magic; fin.read((char *) &magic, sizeof(magic)); if (magic != LLAMA_FILE_MAGIC_GGLA) { - fprintf(stderr, "%s: bad file magic\n", __func__); + LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); return 1; } uint32_t format_version; fin.read((char *) &format_version, sizeof(format_version)); if (format_version != 1) { - fprintf(stderr, "%s: unsupported file version\n", __func__ ); + LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); return 1; } } @@ -3438,7 +3452,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const fin.read((char *) &lora_alpha, sizeof(lora_alpha)); float scaling = (float)lora_alpha / (float)lora_r; - fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); + LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); // create a temporary ggml context to store the lora tensors @@ -3464,7 +3478,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const ggml_context * base_ctx = NULL; llama_buffer base_buf; if (path_base_model) { - fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model); + LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true)); size_t ctx_size; @@ -3521,17 +3535,17 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const const std::string lora_suffix = ".lora"; size_t pos = name.rfind(lora_suffix); if (pos == std::string::npos) { - fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); + LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); return 1; } std::string lora_type = name.substr(pos + lora_suffix.length()); std::string base_name = name; base_name.erase(pos); - // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); + // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); if (model_tensors.find(base_name) == model_tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); + LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); return 1; } @@ -3542,7 +3556,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const case 1: wtype = GGML_TYPE_F16; break; default: { - fprintf(stderr, "%s: invalid tensor data type '%d'\n", + LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", __func__, ftype); return false; } @@ -3552,7 +3566,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); } else { - fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims); + LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); return 1; } ggml_set_name(lora_tensor, "lora_tensor"); @@ -3590,7 +3604,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const if (model_loader) { // load from base model if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) { - fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); + LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } size_t idx = model_loader->tensors_map.name_to_idx[base_name]; @@ -3606,8 +3620,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const if (ggml_is_quantized(base_t->type)) { if (!warned) { - fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " - "use a f16 or f32 base model with --lora-base\n", __func__); + LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); warned = true; } } @@ -3621,8 +3635,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const ggml_set_name(loraB, "loraB"); if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { - fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" - " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" + " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); return 1; } @@ -3667,7 +3681,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const n_tensors++; if (n_tensors % 4 == 0) { - fprintf(stderr, "."); + LLAMA_LOG_INFO("."); } } } @@ -3679,7 +3693,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const } const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; - fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); + LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); return 0; } @@ -3688,7 +3702,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor try { return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads); } catch (const std::exception & err) { - fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; } } @@ -3697,7 +3711,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha try { return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads); } catch (const std::exception & err) { - fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; } } @@ -3979,7 +3993,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c const uint32_t version = file.read_u32(); if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { - fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); + LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); return false; } @@ -3987,7 +4001,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c file.read_raw(&session_hparams, sizeof(llama_hparams)); if (session_hparams != ctx->model.hparams) { - fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__); + LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__); return false; } } @@ -3997,7 +4011,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c const uint32_t n_token_count = file.read_u32(); if (n_token_count > n_token_capacity) { - fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); return false; } @@ -4011,7 +4025,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c const size_t n_state_size_max = llama_get_state_size(ctx); if (n_state_size_cur > n_state_size_max) { - fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); + LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); return false; } @@ -4028,7 +4042,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi try { return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { - fprintf(stderr, "error loading session file: %s\n", err.what()); + LLAMA_LOG_ERROR("error loading session file: %s\n", err.what()); return false; } } @@ -4060,7 +4074,7 @@ int llama_eval( int n_threads, int pp_threads) { if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, pp_threads, nullptr)) { - fprintf(stderr, "%s: failed to eval\n", __func__); + LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; } @@ -4083,7 +4097,7 @@ int llama_eval_embd( int n_threads, int pp_threads) { if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, pp_threads, nullptr)) { - fprintf(stderr, "%s: failed to eval\n", __func__); + LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; } @@ -4104,7 +4118,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) { const std::vector tmp(n_batch, llama_token_bos()); if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, 1, fname)) { - fprintf(stderr, "%s: failed to eval\n", __func__); + LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; } @@ -4120,7 +4134,7 @@ int llama_tokenize_with_model( auto res = llama_tokenize(model->vocab, text, add_bos); if (n_max_tokens < (int) res.size()) { - fprintf(stderr, "%s: too many tokens\n", __func__); + LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); return -((int) res.size()); } @@ -4237,15 +4251,15 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) { void llama_print_timings(struct llama_context * ctx) { const llama_timings timings = llama_get_timings(ctx); - fprintf(stderr, "\n"); - fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms); - fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + LLAMA_LOG_INFO("\n"); + LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms); + LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample); - fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); - fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); - fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); + LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); } void llama_reset_timings(struct llama_context * ctx) { @@ -4281,3 +4295,44 @@ const char * llama_print_system_info(void) { const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { return ctx->model.tensors_by_name; } + + +void llama_log_set(llama_log_callback log_callback, void * user_data) { + g_state.log_callback = log_callback ? log_callback : llama_log_callback_default; + g_state.log_callback_user_data = user_data; +} + +#if defined(_MSC_VER) && !defined(vsnprintf) +#define vsnprintf _vsnprintf +#endif + +static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) { + va_list args_copy; + va_copy(args_copy, args); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + g_state.log_callback(level, buffer, g_state.log_callback_user_data); + } else { + char* buffer2 = new char[len+1]; + vsnprintf(buffer2, len+1, format, args_copy); + buffer2[len] = 0; + g_state.log_callback(level, buffer2, g_state.log_callback_user_data); + delete[] buffer2; + } + va_end(args_copy); +} + +static void llama_log_internal(llama_log_level level, const char * format, ...) { + va_list args; + va_start(args, format); + llama_log_internal_v(level, format, args); + va_end(args); +} + +static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} diff --git a/llama.h b/llama.h index c57e84654..99a0d372e 100644 --- a/llama.h +++ b/llama.h @@ -86,7 +86,20 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); - struct llama_context_params { + enum llama_log_level { + LLAMA_LOG_LEVEL_ERROR = 2, + LLAMA_LOG_LEVEL_WARN = 3, + LLAMA_LOG_LEVEL_INFO = 4 + }; + + // Signature for logging events + // Note that text includes the new line character at the end for most events. + // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it + // if it exists. + // It might not exist for progress report where '.' is output repeatedly. + typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data); + + struct llama_context_params { uint32_t seed; // RNG seed, -1 for random int32_t n_ctx; // text context int32_t n_batch; // prompt processing batch size @@ -195,6 +208,10 @@ extern "C" { int32_t n_eval; }; + // Set callback for all future logging events. + // If this is not called, or NULL is supplied, everything is output on stderr. + LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data); + LLAMA_API int llama_max_devices(); LLAMA_API struct llama_context_params llama_context_default_params();