CUDA: faster q2_K, q3_K MMQ + int8 tensor cores
parent 172c825684
commit d962a56baa
7 changed files with 438 additions and 339 deletions
@@ -188,13 +188,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
+        info.devices[id].nsm   = prop.multiProcessorCount;
+        info.devices[id].smpb  = prop.sharedMemPerBlock;
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        info.devices[id].smpbo = prop.sharedMemPerBlock;
         info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
+        info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-        info.devices[id].smpb = prop.sharedMemPerBlock;
-        info.devices[id].nsm  = prop.multiProcessorCount;
     }
 
     for (int id = 0; id < info.device_count; ++id) {
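
Note on the new smpbo field: it records cudaDeviceProp::sharedMemPerBlockOptin, the larger dynamic shared-memory limit a kernel can opt in to on recent NVIDIA GPUs, while smpb stays the default per-block limit that the FIXME comments further down still check against. A minimal sketch of how such an opt-in launch looks; the kernel name and sizes are illustrative and not part of this commit:

    // Hypothetical sketch, not from this diff: requesting more dynamic shared
    // memory than sharedMemPerBlock by opting in up to sharedMemPerBlockOptin.
    static __global__ void smem_demo_kernel(float * dst) {
        extern __shared__ float buf[];
        buf[threadIdx.x] = (float) threadIdx.x;
        __syncthreads();
        dst[threadIdx.x] = buf[threadIdx.x];
    }

    static void smem_demo_launch(float * dst, size_t shmem, cudaStream_t stream) {
        // Without this attribute, launches asking for more than smpb bytes fail;
        // with it, up to smpbo (= sharedMemPerBlockOptin) bytes may be requested.
        cudaFuncSetAttribute(smem_demo_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, (int) shmem);
        smem_demo_kernel<<<1, 256, shmem, stream>>>(dst);
    }
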
@@ -73,6 +73,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
     const dim3 block_nums(1, nrows, 1);
     const size_t shared_mem = ncols_pad * sizeof(int);
 
+    // FIXME: this limit could be raised by ~2-4x on Ampere or newer
     GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
 
     if (order == GGML_SORT_ORDER_ASC) {
@@ -661,6 +661,7 @@ struct ggml_cuda_device_info {
        int     cc;                 // compute capability
        int     nsm;                // number of streaming multiprocessors
        size_t  smpb;               // max. shared memory per block
+       size_t  smpbo;              // max. shared memory per block (with opt-in)
        bool    vmm;                // virtual memory support
        size_t  vmm_granularity;    // granularity of virtual memory
        size_t  total_vram;
File diff suppressed because it is too large
@@ -1,4 +1,5 @@
 #include "quantize.cuh"
+#include <cmath>
 #include <cstdint>
 
 static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
@@ -37,7 +38,7 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
-template <bool need_sum>
+template <int need_sum>
 static __global__ void quantize_mmq_q8_1(
     const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
 
@@ -60,24 +61,48 @@ static __global__ void quantize_mmq_q8_1(
 
     amax = warp_reduce_max(amax);
 
-    float sum;
-    if (need_sum) {
-        sum = warp_reduce_sum(xi);
-    }
-
     const float d = amax / 127;
     const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
 
     y[ib].qs[iqs] = q;
 
-    if (iqs % QK8_1 != 0) {
-        return;
-    }
-
-    if (need_sum) {
-        y[ib].ds[iqs/QK8_1] = make_half2(d, sum);
-    } else {
-        ((float *) y[ib].ds)[iqs/QK8_1] = d;
+    static_assert(need_sum >= 0 && need_sum <= 2, "Invalid need_sum value.");
+    if (need_sum == 0) {
+        if (iqs % QK8_1 != 0) {
+            return;
+        }
+
+        ((float *) y[ib].ds)[iqs/QK8_1] = d;
+    } else if (need_sum == 1) {
+        const float sum = warp_reduce_sum(xi);
+
+        if (iqs % QK8_1 != 0) {
+            return;
+        }
+
+        y[ib].ds[iqs/QK8_1] = make_half2(d, sum);
+    } else {
+        float sum = xi;
+
+        // Calculate sum per 16 values:
+#pragma unroll
+        for (int mask = 8; mask > 0; mask >>= 1) {
+            sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
+        }
+
+        if (iqs % (QK8_1/2) != 0) {
+            return;
+        }
+
+        int8_t * si = (int8_t *) &y[ib].ds[iqs/QK8_1].y;
+        const int tmp = roundf(amax == 0.0f ? 0.0f : -8*sum/amax);
+        si[(iqs % QK8_1)/(QK8_1/2)] = min(tmp, 127);
+
+        if (iqs % QK8_1 != 0) {
+            return;
+        }
+
+        reinterpret_cast<half&>(y[ib].ds[iqs/QK8_1].x) = d;
     }
 }
 
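
The new need_sum == 2 layout packs the block scale and two per-16 partial sums into the single half2 ds field: ds.x keeps d = amax/127 as before, while ds.y is reinterpreted as two int8 values si ≈ round(-8*Σx/amax), clamped at 127. Below is a rough host-side reference of that packing for one 32-value block; the function and parameter names are illustrative and not part of the diff:

    // Rough host-side reference (illustrative names, not part of this commit) for
    // the need_sum == 2 packing of one 32-value q8_1 block: 32 int8 quants plus
    // a scale d and two int8 partial sums, one per 16 values.
    #include <cmath>
    #include <cstdint>

    static void pack_q8_1_ds16_ref(const float * x, int8_t qs[32], float * d, int8_t s16[2]) {
        float amax = 0.0f;
        for (int j = 0; j < 32; ++j) {
            amax = fmaxf(amax, fabsf(x[j]));
        }
        *d = amax / 127.0f;                          // same scale as the kernel
        for (int j = 0; j < 32; ++j) {
            qs[j] = amax == 0.0f ? 0 : (int8_t) roundf(x[j] / *d);
        }
        for (int k = 0; k < 2; ++k) {
            float sum = 0.0f;
            for (int j = 16*k; j < 16*(k + 1); ++j) {
                sum += x[j];
            }
            // mirrors si[...] = min(roundf(-8*sum/amax), 127) in quantize_mmq_q8_1;
            // a consumer recovers sum ~= -(127/8) * d * s16[k]
            const int tmp = amax == 0.0f ? 0 : (int) roundf(-8.0f*sum/amax);
            s16[k] = (int8_t) (tmp < 127 ? tmp : 127);
        }
    }
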
@@ -104,9 +129,14 @@ void quantize_mmq_q8_1_cuda(
     const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
     const dim3 num_blocks(block_num_x, kx1, channels);
     const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
-    if (mmq_need_sum(type_x)) {
-        quantize_mmq_q8_1<true><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
+    const int need_sum = mmq_need_sum(type_x);
+    if (need_sum == 0) {
+        quantize_mmq_q8_1<0><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
+    } else if (need_sum == 1) {
+        quantize_mmq_q8_1<1><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
+    } else if (need_sum == 2) {
+        quantize_mmq_q8_1<2><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
     } else {
-        quantize_mmq_q8_1<false><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
+        GGML_ASSERT(false);
     }
 }
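
Turning need_sum into an int template parameter means the layout check inside quantize_mmq_q8_1 is a compile-time branch: each instantiation contains only the code for its own layout, and the runtime if/else chain here merely selects which instantiation to launch. A tiny illustration of the pattern, with placeholder names not taken from this commit:

    // Hypothetical sketch: a branch on a template int is resolved at compile time,
    // so demo_kernel<0>, <1> and <2> each contain only one of the paths below.
    template <int mode>
    static __global__ void demo_kernel(int * out) {
        if (mode == 0) {
            out[threadIdx.x] = 0;
        } else if (mode == 1) {
            out[threadIdx.x] = 1;
        } else {
            out[threadIdx.x] = 2;
        }
    }

    static void demo_launch(int * out, int mode, cudaStream_t stream) {
        switch (mode) { // runtime value -> compile-time instantiation
            case 0:  demo_kernel<0><<<1, 32, 0, stream>>>(out); break;
            case 1:  demo_kernel<1><<<1, 32, 0, stream>>>(out); break;
            default: demo_kernel<2><<<1, 32, 0, stream>>>(out); break;
        }
    }
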
@@ -130,6 +130,7 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons
     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
+    // FIXME: this limit could be raised by ~2-4x on Ampere or newer
     if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
         switch (ncols_x) {
             case 32:
@@ -265,36 +265,32 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
 // contiguous u/y values
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const half2 & dm2, const float & d8) {
+    const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const half2 & ds8) {
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi_d = 0;
-    int sumi_m = 0;
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d8 = __low2float(ds8);
+    const int8_t * s8i = (const int8_t *) &ds8.y;
 
 #pragma unroll
     for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
-        int sumi_d_sc = 0;
-
-        const int sc = scales[i0 / (QI8_1/2)];
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m << 8;
-        m |= m << 16;
+        const float2 dm2f = __half22float2(dm2[i0/(QI8_1/2)]);
+        int sumi_d = 0;
 
+        const int vi0 = v[i0/(QI8_1/2)];
 #pragma unroll
         for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
-            sumi_m    = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
+            const int vi = (vi0 >> (2*(i % (QI8_1/2)))) & 0x03030303;
+            sumi_d = __dp4a(vi, u[i], sumi_d); // SIMD dot product
         }
 
-        sumi_d += sumi_d_sc * (sc & 0xF);
+        sumf_d += dm2f.x * sumi_d;
+        sumf_m += dm2f.y * s8i[i0/(QI8_1/2)];
     }
 
-    const float2 dm2f = __half22float2(dm2);
-
-    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
+    return d8*(sumf_d + (127.0f/8.0f)*sumf_m);
 #else
     NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
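
Where the 127.0f/8.0f factor comes from, assuming the packing shown in quantize_mmq_q8_1 above: per group of 16 values the q8_1 block stores

    si ~= round(-8 * sum(y) / amax)      with d8 = amax / 127
    =>  d8 * (127/8) * si ~= -sum(y)

so each sumf_m += dm2f.y * s8i[...] term, after the final d8*(127.0f/8.0f) scaling, contributes approximately -dm2f.y * sum(y) for that group: the subtraction of the q2_K minimum times the activation sum that the old code obtained with a second __dp4a on a broadcast m, now read directly from the precomputed per-16 sums carried in ds8.
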
@@ -352,8 +348,10 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
     for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
         int sumi_sc = 0;
 
+#pragma unroll
         for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
+            const int vi = __vsubss4((v[i/2] >> (4*(i%2))) & 0x0F0F0F0F, 0x04040404);
+            sumi_sc = __dp4a(vi, u[i], sumi_sc); // SIMD dot product
         }
 
         sumi += sumi_sc * scales[i0 / (QI8_1/2)];
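
In the q3_K inner loop the values in v are now kept packed at 4 bits each, so each int in v carries eight values instead of four and is expanded on the fly, presumably halving the space the tile needs: the shift/mask pulls one nibble into each byte and __vsubss4 subtracts the q3_K offset of 4 per byte with signed saturation. A scalar-style helper showing the same unpacking, with an illustrative name not taken from the diff:

    // Illustrative device helper (not from the diff) equivalent to the packed
    // unpack above: returns 4 q3_K values, one per byte, offset already applied.
    static __device__ __forceinline__ int unpack_q3_vals(const int * v_packed, int i) {
        const int nibbles = (v_packed[i/2] >> (4*(i % 2))) & 0x0F0F0F0F; // one 4-bit value per byte
        return __vsubss4(nibbles, 0x04040404);                           // subtract the q3_K offset of 4
    }
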