iq3_s_mult: ifdef'd slow / fast versions
This commit is contained in:
parent
726aed307a
commit
b6402fa757
2 changed files with 71 additions and 91 deletions
49
ggml-cuda.cu
49
ggml-cuda.cu
|
@ -2370,10 +2370,13 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
|||
|
||||
}
|
||||
|
||||
//#define IQ3S_MULTIPLIER 190842953LL
|
||||
|
||||
//#define IQ3S_MULTIPLIER 5718026
|
||||
#define IQ3S_MULTIPLIER 898886
|
||||
#ifdef IQ3S_SLOW_MULT
|
||||
// Better (lower PPL), but requires more bit twiddling, so slower
|
||||
#define IQ3S_MULTIPLIER 190842953LL
|
||||
#else
|
||||
//#define IQ3S_MULTIPLIER 898886
|
||||
#define IQ3S_MULTIPLIER 842866
|
||||
#endif
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
@ -2391,17 +2394,18 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
|||
const int8_t * grid = (const int8_t *)aux32;
|
||||
const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
|
||||
const uint8_t signs = x[i].signs[4*ib + il];
|
||||
//aux32[0] = ((qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
//aux32[1] = ((qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
#ifdef IQ3S_SLOW_MULT
|
||||
aux32[0] = ((qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
aux32[1] = ((qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
#else
|
||||
aux32[0] = (((qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
|
||||
aux32[1] = (((qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
|
||||
#endif
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
//aux32[0] = (((__vmaxs4(__vsub4(aux32[0], 0x01010101), 0x00000000) >> 1) & 0x07070707) << 1) | 0x01010101;
|
||||
//aux32[1] = (((__vmaxs4(__vsub4(aux32[1], 0x01010101), 0x00000000) >> 1) & 0x07070707) << 1) | 0x01010101;
|
||||
//aux32[0] = ((__vsub4(aux32[0], 0x01010101) >> 1) << 1) | 0x01010101;
|
||||
//aux32[1] = ((__vsub4(aux32[1], 0x01010101) >> 1) << 1) | 0x01010101;
|
||||
aux32[0] = ((aux32[0] >> 1) << 1) | 0x01010101;
|
||||
aux32[1] = ((aux32[1] >> 1) << 1) | 0x01010101;
|
||||
#ifdef IQ3S_SLOW_MULT
|
||||
aux32[0] = (((__vmaxs4(__vsub4(aux32[0], 0x01010101), 0x00000000) >> 1) & 0x07070707) << 1) | 0x01010101;
|
||||
aux32[1] = (((__vmaxs4(__vsub4(aux32[1], 0x01010101), 0x00000000) >> 1) & 0x07070707) << 1) | 0x01010101;
|
||||
#endif
|
||||
uint32_t signs0 = __vcmpeq4(((signs & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
|
||||
uint32_t signs1 = __vcmpeq4(((signs >> 4) * 0x01010101) & 0x08040201, 0x08040201);
|
||||
aux32[0] = __vsub4(aux32[0] ^ signs0, signs0);
|
||||
|
@ -2410,9 +2414,15 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
|||
y[j] = d * grid[j];
|
||||
}
|
||||
#else
|
||||
#ifdef IQ3S_SLOW_MULT
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = d * (2*((grid[j]-1)/2) + 1) * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
#else
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
assert(false);
|
||||
|
@ -5227,16 +5237,15 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
|
|||
const int8_t * q8 = bq8_1[ib32].qs;
|
||||
int sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
//aux32[0] = ((qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
//aux32[1] = ((qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
//aux32[0] = (((__vmaxs4(__vsub4(aux32[0], 0x01010101), 0) >> 1) & 0x07070707) << 1) | 0x01010101;
|
||||
//aux32[1] = (((__vmaxs4(__vsub4(aux32[1], 0x01010101), 0) >> 1) & 0x07070707) << 1) | 0x01010101;
|
||||
#ifdef IQ3S_SLOW_MULT
|
||||
aux32[0] = ((qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
aux32[1] = ((qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
|
||||
//aux32[0] = ((__vsub4(aux32[0], 0x01010101) >> 1) << 1) | 0x01010101;
|
||||
//aux32[1] = ((__vsub4(aux32[1], 0x01010101) >> 1) << 1) | 0x01010101;
|
||||
aux32[0] = ((aux32[0] >> 1) << 1) | 0x01010101;
|
||||
aux32[1] = ((aux32[1] >> 1) << 1) | 0x01010101;
|
||||
aux32[0] = (((__vmaxs4(__vsub4(aux32[0], 0x01010101), 0) >> 1) & 0x07070707) << 1) | 0x01010101;
|
||||
aux32[1] = (((__vmaxs4(__vsub4(aux32[1], 0x01010101), 0) >> 1) & 0x07070707) << 1) | 0x01010101;
|
||||
#else
|
||||
aux32[0] = (((qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
|
||||
aux32[1] = (((qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
|
||||
#endif
|
||||
uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
|
||||
uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
|
||||
const int grid_l = __vsub4(aux32[0] ^ signs0, signs0);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue