Introduce Q4_0 and Q8_0 quantizations with BF16 delta values
parent 4134999e01
commit e26fd70dce
8 changed files with 1077 additions and 11 deletions
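For orientation, here is a small, hypothetical round trip through the new Q4_0_B16 entry points added in this diff. The include paths and the visibility of block_q4_0/QK4_0 outside ggml-quants.c are assumptions; the quantize/dequantize functions and the bf16 accessors are exactly the ones introduced below, and the only layout difference from plain Q4_0 is that the per-block delta holds bfloat16 bits.

// Illustration only: not part of the commit.
#include <stdio.h>
#include "ggml.h"        // ggml_make_bf16, ggml_bf16_to_fp32 (added below)
#include "ggml-quants.h" // quantize_row_q4_0_b16, dequantize_row_q4_0_b16 (header name assumed)

int main(void) {
    float src[QK4_0 * 2], out[QK4_0 * 2];       // two blocks of 32 values
    for (int i = 0; i < QK4_0 * 2; ++i) src[i] = 0.01f * (i - 32);

    block_q4_0 q[2];                             // same block layout as Q4_0,
    quantize_row_q4_0_b16(src, q, QK4_0 * 2);    // but .d now carries bf16 bits
    dequantize_row_q4_0_b16(q, out, QK4_0 * 2);

    printf("block 0 delta (fp32 from bf16) = %f\n",
           ggml_bf16_to_fp32(ggml_make_bf16(q[0].d)));
    printf("x[0] = %f -> %f after round trip\n", src[0], out[0]);
    return 0;
}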
@@ -346,6 +346,7 @@ extern "C" {
// google brain half-precision bfloat16
typedef struct { uint16_t bits; } ggml_bf16_t;
GGML_API ggml_bf16_t ggml_make_bf16(uint16_t val);
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);

@@ -432,9 +433,14 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
<<<<<<< HEAD
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
=======
GGML_FTYPE_MOSTLY_Q4_0_B16 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q8_0_B16 = 26, // except 1d tensors
>>>>>>> ed837022 (Introduce Q4_0 and Q8_0 quantizations with BF16 delta values)
};

// available tensor operations:

@@ -20,11 +20,13 @@
#if defined(_MSC_VER)

#define m512bh(p) p
#define m128bh(p) p
#define m512i(p) p

#else

#define m512bh(p) (__m512bh)(p)
#define m128bh(p) (__m128bh)(p)
#define m512i(p) (__m512i)(p)

#endif

@@ -699,6 +699,48 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
quantize_row_q4_0_ref(x, y, k);
}

// reference implementation for deterministic creation of model files
void quantize_row_q4_0_b16_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
static const int qk = QK4_0;

assert(k % qk == 0);

const int nb = k / qk;

for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
float max = 0.0f;

for (int j = 0; j < qk; j++) {
const float v = x[i*qk + j];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
}

const float d = max / -8;
const float id = d ? 1.0f/d : 0.0f;

y[i].d = (GGML_FP32_TO_BF16(d)).bits;

for (int j = 0; j < qk/2; ++j) {
const float x0 = x[i*qk + 0 + j]*id;
const float x1 = x[i*qk + qk/2 + j]*id;

const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));

y[i].qs[j] = xi0;
y[i].qs[j] |= xi1 << 4;
}
}
}

void quantize_row_q4_0_b16(const float * restrict x, void * restrict y, int64_t k) {
quantize_row_q4_0_b16_reference(x, y, k);
}

void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
const int qk = QK4_1;

@@ -1532,6 +1574,27 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int6
}
}

void dequantize_row_q4_0_b16(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
static const int qk = QK4_0;

assert(k % qk == 0);

const int nb = k / qk;

for (int i = 0; i < nb; i++) {
const float d = GGML_BF16_TO_FP32(ggml_make_bf16(x[i].d));

for (int j = 0; j < qk/2; ++j) {
const int x0 = (x[i].qs[j] & 0x0F) - 8;
const int x1 = (x[i].qs[j] >> 4) - 8;

y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
}
}
}

void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
static const int qk = QK4_1;

@@ -1622,6 +1685,24 @@ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int6
}
}

void dequantize_row_q8_0_b16(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
static const int qk = QK8_0;

assert(k % qk == 0);

const int nb = k / qk;

for (int i = 0; i < nb; i++) {
const float d = GGML_BF16_TO_FP32(ggml_make_bf16(x[i].d));

for (int j = 0; j < qk; ++j) {
y[i*qk + j] = x[i].qs[j]*d;
}
}
}

//
// 2-6 bit quantization in super-blocks
//

@@ -3132,6 +3213,34 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
}
}

static void quantize_row_q4_0_b16_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
static_assert(QK4_0 == 32, "QK4_0 must be 32");

if (!quant_weights) {
quantize_row_q4_0_b16_reference(x, y, n_per_row);
return;
}

float weight[QK4_0];
int8_t L[QK4_0];

float sum_x2 = 0;
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
float sigma2 = sum_x2/n_per_row;

const int64_t nb = n_per_row/QK4_0;
for (int ib = 0; ib < nb; ++ib) {
const float * xb = x + QK4_0 * ib;
const float * qw = quant_weights + QK4_0 * ib;
for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight);
y[ib].d = (GGML_FP32_TO_BF16(d)).bits;
for (int j = 0; j < 16; ++j) {
y[ib].qs[j] = L[j] | (L[j+16] << 4);
}
}
}

size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);

@@ -3147,6 +3256,21 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nr
return nrow * row_size;
}

size_t quantize_q4_0_b16(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
quantize_row_q4_0_b16_reference(src, dst, (int64_t)nrow*n_per_row);
return nrow * ggml_row_size(GGML_TYPE_Q4_0_B16, n_per_row);
}
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0_B16, n_per_row);
char * qrow = (char *)dst;
for (int64_t row = 0; row < nrow; ++row) {
quantize_row_q4_0_b16_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
src += n_per_row;
qrow += row_size;
}
return nrow * row_size;
}

static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
static_assert(QK4_1 == 32, "QK4_1 must be 32");

@@ -3306,6 +3430,13 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
return nrow * row_size;
}

size_t quantize_q8_0_b16(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
(void)quant_weights; // not used
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0_B16, n_per_row);
quantize_row_q8_0_b16_reference(src, dst, (int64_t)nrow*n_per_row);
return nrow * row_size;
}

// ====================== "True" 2-bit (de)-quantization

void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {

@@ -4208,6 +4339,278 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
*s = sumf;
}

void ggml_vec_dot_q4_0_b16_q8_0_b16(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;

assert(n % qk == 0);
assert(nrc == 1);

UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);

const block_q4_0 * restrict x = vx;
const block_q8_0 * restrict y = vy;

// Initialize accumulator with zeros
#if defined(__AVX512BF16__)
__m256 acc = _mm256_setzero_ps();
__m128 zerovec = _mm_setzero_ps();
const __m256i off = _mm256_set1_epi8( 8 );
int nbmod = nb - (nb % 4);
for (int i = 0; i < nbmod; i+=4) {
// Compute combined scale for set of four blocks
uint64_t x_delta = ((uint64_t)x[i+3].d << 48) | ((uint64_t)x[i+2].d << 32) | ((uint64_t)x[i+1].d << 16) | (x[i].d);
uint64_t y_delta = ((uint64_t)y[i+3].d << 48) | ((uint64_t)y[i+2].d << 32) | ((uint64_t)y[i+1].d << 16) | (y[i].d);

__m128bh xd = m128bh(_mm_cvtepu16_epi32(_mm_set_epi64x(0, x_delta)));
__m128bh yd = m128bh(_mm_cvtepu16_epi32(_mm_set_epi64x(0, y_delta)));

__m256 d = _mm256_castps128_ps256(_mm_dpbf16_ps(zerovec, xd, yd));
d = _mm256_permute2f128_ps(d, d, 0);

__m256i qx0 = bytes_from_nibbles_32(x[i].qs);
qx0 = _mm256_sub_epi8( qx0, off );
__m256i qy0 = _mm256_loadu_si256((const __m256i *)y[i].qs);

const __m256 q0 = mul_sum_i8_pairs_float(qx0, qy0);

__m256i qx1 = bytes_from_nibbles_32(x[i + 1].qs);
qx1 = _mm256_sub_epi8( qx1, off );
__m256i qy1 = _mm256_loadu_si256((const __m256i *)y[i + 1].qs);

const __m256 q1 = mul_sum_i8_pairs_float(qx1, qy1);

__m256i qx2 = bytes_from_nibbles_32(x[i + 2].qs);
qx2 = _mm256_sub_epi8( qx2, off );
__m256i qy2 = _mm256_loadu_si256((const __m256i *)y[i + 2].qs);

const __m256 q2 = mul_sum_i8_pairs_float(qx2, qy2);

__m256i qx3 = bytes_from_nibbles_32(x[i + 3].qs);
qx3 = _mm256_sub_epi8( qx3, off );
__m256i qy3 = _mm256_loadu_si256((const __m256i *)y[i + 3].qs);

const __m256 q3 = mul_sum_i8_pairs_float(qx3, qy3);

// Multiply q with scale and accumulate
acc = _mm256_fmadd_ps( _mm256_shuffle_ps(d, d, 0), q0, acc );
acc = _mm256_fmadd_ps( _mm256_shuffle_ps(d, d, 85), q1, acc );
acc = _mm256_fmadd_ps( _mm256_shuffle_ps(d, d, 170), q2, acc );
acc = _mm256_fmadd_ps( _mm256_shuffle_ps(d, d, 255), q3, acc );
}
for (int i = nbmod; i < nb; i++) {
// Compute combined scale for the block
const __m256 d = _mm256_set1_ps( GGML_BF16_TO_FP32(ggml_make_bf16(x[i].d)) * GGML_BF16_TO_FP32(ggml_make_bf16(y[i].d)) );
__m256i qx = bytes_from_nibbles_32(x[i].qs);
qx = _mm256_sub_epi8( qx, off );
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

const __m256 q = mul_sum_i8_pairs_float(qx, qy);

// Multiply q with scale and accumulate
acc = _mm256_fmadd_ps( d, q, acc );
}
*s = hsum_float_8(acc);
#elif defined(__AVX2__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
// Main loop
for (int i = 0; i < nb; ++i) {
/* Compute combined scale for the block */
const __m256 d = _mm256_set1_ps( GGML_BF16_TO_FP32(ggml_make_bf16(x[i].d)) * GGML_BF16_TO_FP32(ggml_make_bf16(y[i].d)) );

__m256i qx = bytes_from_nibbles_32(x[i].qs);

// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
const __m256i off = _mm256_set1_epi8( 8 );
qx = _mm256_sub_epi8( qx, off );

__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

const __m256 q = mul_sum_i8_pairs_float(qx, qy);

/* Multiply q with scale and accumulate */
acc = _mm256_fmadd_ps( d, q, acc );
}
*s = hsum_float_8(acc);
#elif defined(__AVX__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();

// Main loop
for (int i = 0; i < nb; ++i) {
// Compute combined scale for the block
const __m256 d = _mm256_set1_ps( GGML_BF16_TO_FP32(ggml_make_bf16(x[i].d)) * GGML_BF16_TO_FP32(ggml_make_bf16(y[i].d)));

const __m128i lowMask = _mm_set1_epi8(0xF);
const __m128i off = _mm_set1_epi8(8);

const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);

__m128i bx_0 = _mm_and_si128(lowMask, tmp);
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
bx_0 = _mm_sub_epi8(bx_0, off);
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);

bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
bx_0 = _mm_sub_epi8(bx_0, off);
const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);

// Convert int32_t to float
__m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));

// Apply the scale, and accumulate
acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
}

*s = hsum_float_8(acc);
#elif defined(__SSSE3__)
// set constants
const __m128i lowMask = _mm_set1_epi8(0xF);
const __m128i off = _mm_set1_epi8(8);

// Initialize accumulator with zeros
__m128 acc_0 = _mm_setzero_ps();
__m128 acc_1 = _mm_setzero_ps();
__m128 acc_2 = _mm_setzero_ps();
__m128 acc_3 = _mm_setzero_ps();

// First round without accumulation
{
_mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0);
_mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);

// Compute combined scale for the block 0 and 1
const __m128 d_0_1 = _mm_set1_ps( GGML_BF16_TO_FP32(ggml_make_bf16(x[0].d)) * GGML_BF16_TO_FP32(ggml_make_bf16(y[0].d)));

const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);

__m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
bx_0 = _mm_sub_epi8(bx_0, off);
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);

__m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
__m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16));
bx_1 = _mm_sub_epi8(bx_1, off);
const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);

_mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0);
_mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);

// Compute combined scale for the block 2 and 3
const __m128 d_2_3 = _mm_set1_ps( GGML_BF16_TO_FP32(ggml_make_bf16(x[1].d)) * GGML_BF16_TO_FP32(ggml_make_bf16(y[1].d)) );

const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);

__m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
__m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs);
bx_2 = _mm_sub_epi8(bx_2, off);
const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);

__m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
__m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16));
bx_3 = _mm_sub_epi8(bx_3, off);
const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);

// Convert int32_t to float
__m128 p0 = _mm_cvtepi32_ps(i32_0);
__m128 p1 = _mm_cvtepi32_ps(i32_1);
__m128 p2 = _mm_cvtepi32_ps(i32_2);
__m128 p3 = _mm_cvtepi32_ps(i32_3);

// Apply the scale
acc_0 = _mm_mul_ps( d_0_1, p0 );
acc_1 = _mm_mul_ps( d_0_1, p1 );
acc_2 = _mm_mul_ps( d_2_3, p2 );
acc_3 = _mm_mul_ps( d_2_3, p3 );
}

assert(nb % 2 == 0); // TODO: handle odd nb

// Main loop
for (int i = 2; i < nb; i+=2) {
_mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
_mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);

// Compute combined scale for the block 0 and 1
const __m128 d_0_1 = _mm_set1_ps( GGML_BF16_TO_FP32(ggml_make_bf16(x[i].d)) * GGML_BF16_TO_FP32(ggml_make_bf16(y[i].d)) );

const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);

__m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
bx_0 = _mm_sub_epi8(bx_0, off);
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);

__m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
__m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
bx_1 = _mm_sub_epi8(bx_1, off);
const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);

_mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
_mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);

// Compute combined scale for the block 2 and 3
const __m128 d_2_3 = _mm_set1_ps( GGML_BF16_TO_FP32(ggml_make_bf16(x[i + 1].d)) * GGML_BF16_TO_FP32(ggml_make_bf16(y[i + 1].d)) );

const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);

__m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
__m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs);
bx_2 = _mm_sub_epi8(bx_2, off);
const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);

__m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
__m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16));
bx_3 = _mm_sub_epi8(bx_3, off);
const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);

// Convert int32_t to float
__m128 p0 = _mm_cvtepi32_ps(i32_0);
__m128 p1 = _mm_cvtepi32_ps(i32_1);
__m128 p2 = _mm_cvtepi32_ps(i32_2);
__m128 p3 = _mm_cvtepi32_ps(i32_3);

// Apply the scale
__m128 p0_d = _mm_mul_ps( d_0_1, p0 );
__m128 p1_d = _mm_mul_ps( d_0_1, p1 );
__m128 p2_d = _mm_mul_ps( d_2_3, p2 );
__m128 p3_d = _mm_mul_ps( d_2_3, p3 );

// Accumulate
acc_0 = _mm_add_ps(p0_d, acc_0);
acc_1 = _mm_add_ps(p1_d, acc_1);
acc_2 = _mm_add_ps(p2_d, acc_2);
acc_3 = _mm_add_ps(p3_d, acc_3);
}

*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#else
// scalar
float sumf = 0.0;

for (int i = 0; i < nb; i++) {
int sumi = 0;

for (int j = 0; j < qk/2; ++j) {
const int v0 = (x[i].qs[j] & 0x0F) - 8;
const int v1 = (x[i].qs[j] >> 4) - 8;

sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
}

sumf += sumi*GGML_BF16_TO_FP32(ggml_make_bf16(x[i].d))*GGML_BF16_TO_FP32(ggml_make_bf16(y[i].d));
}

*s = sumf;
#endif
}

void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_1;
const int nb = n / qk;

@@ -5470,6 +5873,115 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
*s = sumf;
}

void ggml_vec_dot_q8_0_b16_q8_0_b16(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;

assert(n % qk == 0);
assert(nrc == 1);

UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);

const block_q8_0 * restrict x = vx;
const block_q8_0 * restrict y = vy;

#if defined(__AVX512BF16__)
__m256 acc = _mm256_setzero_ps();
__m128 zerovec = _mm_setzero_ps();
int nbmod = nb - (nb % 4);
for (int i = 0; i < nbmod; i+=4) {
// Compute combined scale for set of four blocks
uint64_t x_delta = ((uint64_t)x[i+3].d << 48) | ((uint64_t)x[i+2].d << 32) | ((uint64_t)x[i+1].d << 16) | (x[i].d);
uint64_t y_delta = ((uint64_t)y[i+3].d << 48) | ((uint64_t)y[i+2].d << 32) | ((uint64_t)y[i+1].d << 16) | (y[i].d);

__m128bh xd = m128bh(_mm_cvtepu16_epi32(_mm_set_epi64x(0, x_delta)));
__m128bh yd = m128bh(_mm_cvtepu16_epi32(_mm_set_epi64x(0, y_delta)));

__m256 d = _mm256_castps128_ps256(_mm_dpbf16_ps(zerovec, xd, yd));
d = _mm256_permute2f128_ps(d, d, 0);

__m256i qx0 = _mm256_loadu_si256((const __m256i *)x[i].qs);
__m256i qy0 = _mm256_loadu_si256((const __m256i *)y[i].qs);

const __m256 q0 = mul_sum_i8_pairs_float(qx0, qy0);

__m256i qx1 = _mm256_loadu_si256((const __m256i *)x[i + 1].qs);
__m256i qy1 = _mm256_loadu_si256((const __m256i *)y[i + 1].qs);

const __m256 q1 = mul_sum_i8_pairs_float(qx1, qy1);

__m256i qx2 = _mm256_loadu_si256((const __m256i *)x[i + 2].qs);
__m256i qy2 = _mm256_loadu_si256((const __m256i *)y[i + 2].qs);

const __m256 q2 = mul_sum_i8_pairs_float(qx2, qy2);

__m256i qx3 = _mm256_loadu_si256((const __m256i *)x[i + 3].qs);
__m256i qy3 = _mm256_loadu_si256((const __m256i *)y[i + 3].qs);

const __m256 q3 = mul_sum_i8_pairs_float(qx3, qy3);

// Multiply q with scale and accumulate
acc = _mm256_fmadd_ps( _mm256_shuffle_ps(d, d, 0), q0, acc );
acc = _mm256_fmadd_ps( _mm256_shuffle_ps(d, d, 85), q1, acc );
acc = _mm256_fmadd_ps( _mm256_shuffle_ps(d, d, 170), q2, acc );
acc = _mm256_fmadd_ps( _mm256_shuffle_ps(d, d, 255), q3, acc );
}
for (int i = nbmod; i < nb; i++) {
// Compute combined scale for the block
const __m256 d = _mm256_set1_ps( GGML_BF16_TO_FP32(ggml_make_bf16(x[i].d)) * GGML_BF16_TO_FP32(ggml_make_bf16(y[i].d)) );
__m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

const __m256 q = mul_sum_i8_pairs_float(qx, qy);

// Multiply q with scale and accumulate
acc = _mm256_fmadd_ps( d, q, acc );
}

*s = hsum_float_8(acc);

#elif defined(__AVX2__)
// Main loop
__m256 acc = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
// Compute combined scale for the block
const __m256 d = _mm256_set1_ps( GGML_BF16_TO_FP32(ggml_make_bf16(x[i].d)) * GGML_BF16_TO_FP32(ggml_make_bf16(y[i].d)) );

__m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

const __m256 q = mul_sum_i8_pairs_float(qx, qy);

// Multiply q with scale and accumulate
acc = _mm256_fmadd_ps( d, q, acc );
}

*s = hsum_float_8(acc);
#else
// scalar
float sumf = 0.0;

for (int i = 0; i < nb; i++) {
int sumi = 0;

for (int j = 0; j < qk; j++) {
sumi += x[i].qs[j]*y[i].qs[j];
}

sumf += sumi*(GGML_BF16_TO_FP32(ggml_make_bf16(x[i].d))*GGML_BF16_TO_FP32(ggml_make_bf16(y[i].d)));
}

*s = sumf;
#endif
}

#if QK_K == 256
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);

@@ -14602,6 +15114,28 @@ static bool validate_fp16(ggml_fp16_t f, size_t i) {
return true;
}

static bool isinf_bf16(ggml_half f) {
return (f & 0x7fff) == 0x7f80;
}

static bool isnan_bf16(ggml_half f) {
return (f & 0x7fff) > 0x7f80;
}

static bool validate_bf16(ggml_half f, size_t i) {
if (isinf_bf16(f)) {
fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
return false;
}

if (isnan_bf16(f)) {
fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
return false;
}

return true;
}

#define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \
const type * q = (const type *) (data); \
for (size_t i = 0; i < (nb); ++i) { \

@@ -14618,6 +15152,7 @@ static bool validate_fp16(ggml_fp16_t f, size_t i) {
} \
}

<<<<<<< HEAD
#define VALIDATE_ROW_DATA_DVEC_F16_IMPL(type, data, nb, nr) \
const type * q = (const type *) (data); \
for (size_t i = 0; i < (nb); ++i) { \

@@ -14625,6 +15160,13 @@ static bool validate_fp16(ggml_fp16_t f, size_t i) {
if (!validate_fp16(q[i].d[j], i)) { \
return false; \
} \
=======
#define VALIDATE_ROW_DATA_D_B16_IMPL(type, data, nb) \
const type * q = (const type *) (data); \
for (size_t i = 0; i < (nb); ++i) { \
if (!validate_bf16((q[i].d), i)) { \
return false; \
>>>>>>> ed837022 (Introduce Q4_0 and Q8_0 quantizations with BF16 delta values)
} \
}

@@ -14755,6 +15297,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb);
} break;
case GGML_TYPE_Q4_0_B16:
{
VALIDATE_ROW_DATA_D_B16_IMPL(block_q4_0, data, nb);
} break;
case GGML_TYPE_Q4_1:
{
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_1, data, nb, d, m);

@@ -14771,6 +15317,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_q8_0, data, nb);
} break;
case GGML_TYPE_Q8_0_B16:
{
VALIDATE_ROW_DATA_D_B16_IMPL(block_q8_0, data, nb);
} break;
case GGML_TYPE_Q2_K:
{
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);

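Relative to plain Q4_0/Q8_0, the only layout change above is that each block's delta is stored as bfloat16 bits rather than fp16. As the "it just left shifts" comment later in this diff notes, the bf16<->fp32 mapping is trivial; a minimal standalone sketch (truncation shown for the float-to-bf16 direction, whereas the GGML_FP32_TO_BF16 macro used above may additionally round):

#include <stdint.h>
#include <string.h>

// bf16 bits -> fp32: the 16 bits are the high half of the 32-bit float pattern.
static float bf16_bits_to_fp32(uint16_t bits) {
    uint32_t u = (uint32_t)bits << 16;
    float f;
    memcpy(&f, &u, sizeof(f));
    return f;
}

// fp32 -> bf16 bits: keep the top 16 bits (plain truncation; production code may round to nearest).
static uint16_t fp32_to_bf16_bits(float f) {
    uint32_t u;
    memcpy(&u, &f, sizeof(u));
    return (uint16_t)(u >> 16);
}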
@@ -33,10 +33,12 @@ void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGM
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);

void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_0_b16(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0_b16(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

@@ -54,10 +56,12 @@ void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,

// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_0_b16(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_0_b16(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

@@ -79,10 +83,12 @@ void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_

// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_0_b16_q8_0_b16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_b16_q8_0_b16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

@@ -117,10 +123,12 @@ size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_b16(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q8_0_b16(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(enum ggml_type type);

@@ -428,6 +428,12 @@ float ggml_bf16_to_fp32(ggml_bf16_t x) {
return GGML_BF16_TO_FP32(x); // it just left shifts
}

ggml_bf16_t ggml_make_bf16(uint16_t x) {
ggml_bf16_t bf16_value;
bf16_value.bits = x;
return bf16_value;
}

ggml_bf16_t ggml_fp32_to_bf16(float x) {
#define ggml_fp32_to_bf16 do_not_use__ggml_fp32_to_bf16__in_ggml
return GGML_FP32_TO_BF16(x);

@@ -3304,6 +3310,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
enum ggml_type wtype = GGML_TYPE_COUNT;

switch (ftype) {
<<<<<<< HEAD
case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
case GGML_FTYPE_MOSTLY_BF16: wtype = GGML_TYPE_BF16; break;

@@ -3331,6 +3338,34 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
=======
case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
case GGML_FTYPE_MOSTLY_BF16: wtype = GGML_TYPE_BF16; break;
case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_0_B16: wtype = GGML_TYPE_Q4_0_B16; break;
case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
case GGML_FTYPE_MOSTLY_Q8_0_B16: wtype = GGML_TYPE_Q8_0_B16; break;
case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break;
case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
>>>>>>> ed837022 (Introduce Q4_0 and Q8_0 quantizations with BF16 delta values)
}

GGML_ASSERT(wtype != GGML_TYPE_COUNT);

@@ -9561,10 +9596,12 @@ static void ggml_compute_forward_add(
}
} break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_0_B16:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_0_B16:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:

@@ -9938,10 +9975,12 @@ static void ggml_compute_forward_add1(
}
} break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_0_B16:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_0_B16:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:

@@ -10066,10 +10105,12 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_0_B16:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_0_B16:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:

@@ -12909,10 +12950,12 @@ static void ggml_compute_forward_out_prod(

switch (src0->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_0_B16:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_0_B16:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:

@@ -13096,10 +13139,12 @@ static void ggml_compute_forward_set(
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_0_B16:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_0_B16:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:

@@ -13358,10 +13403,12 @@ static void ggml_compute_forward_get_rows(

switch (src0->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_0_B16:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_0_B16:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:

@@ -13947,10 +13994,12 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_0_B16:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_0_B16:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:

@@ -20645,10 +20694,12 @@ size_t ggml_quantize_chunk(

switch (type) {
case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_B16: result = quantize_q4_0_b16(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_0_B16: result = quantize_q8_0_b16(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;

@@ -72,6 +72,10 @@ inline float unhalf(ggml_fp16_t d) {
return GGML_FP16_TO_FP32(d);
}

inline float bf16_unhalf(ggml_bf16_t d) {
return GGML_BF16_TO_FP32(d);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED ARITHMETIC OPERATIONS

@@ -813,6 +817,382 @@ class tinyBLAS_Q0_AVX {
};
#endif // __AVX__

#if defined(__AVX2__) || defined(__AVX512F__)
template <typename TA, typename TB, typename TC>
class tinyBLAS_Q0_B16_AVX {
public:
tinyBLAS_Q0_B16_AVX(int64_t k,
const TA *A, int64_t lda,
const TB *B, int64_t ldb,
TC *C, int64_t ldc,
int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
}

void matmul(int64_t m, int64_t n, int task) {
if (task == GGML_TASK_TYPE_COMPUTE)
mnpack(0, m, 0, n);
}

private:
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t mc, nc, mp, np;
switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) {
#if VECTOR_REGISTERS == 32
case 0x44:
mc = 4;
nc = 4;
#if defined(__AVX512BF16__)
gemm4xN<4>(m0, m, n0, n);
#else
gemm<4, 4>(m0, m, n0, n);
#endif
break;
case 0x43:
mc = 4;
nc = 3;
#if defined(__AVX512BF16__)
gemm4xN<3>(m0, m, n0, n);
#else
gemm<4, 3>(m0, m, n0, n);
#endif
break;
case 0x34:
mc = 3;
nc = 4;
#if defined(__AVX512BF16__)
gemmMx4<3>(m0, m, n0, n);
#else
gemm<3, 4>(m0, m, n0, n);
#endif
break;
case 0x33:
mc = 3;
nc = 3;
gemm<3, 3>(m0, m, n0, n);
break;
case 0x42:
mc = 4;
nc = 2;
#if defined(__AVX512BF16__)
gemm4xN<2>(m0, m, n0, n);
#else
gemm<4, 2>(m0, m, n0, n);
#endif
break;
case 0x24:
mc = 2;
nc = 4;
#if defined(__AVX512BF16__)
gemmMx4<2>(m0, m, n0, n);
#else
gemm<2, 4>(m0, m, n0, n);
#endif
break;
#else
case 0x44:
case 0x43:
case 0x42:
mc = 4;
nc = 2;
#if defined(__AVX512BF16__)
gemm4xN<2>(m0, m, n0, n);
#else
gemm<4, 2>(m0, m, n0, n);
#endif
break;
case 0x34:
case 0x24:
mc = 2;
nc = 4;
#if defined(__AVX512BF16__)
gemmMx4<2>(m0, m, n0, n);
#else
gemm<2, 4>(m0, m, n0, n);
#endif
break;
case 0x33:
#endif
case 0x32:
mc = 3;
nc = 2;
gemm<3, 2>(m0, m, n0, n);
break;
case 0x23:
mc = 2;
nc = 3;
gemm<2, 3>(m0, m, n0, n);
break;
case 0x41:
mc = 4;
nc = 1;
#if defined(__AVX512BF16__)
gemm4xN<1>(m0, m, n0, n);
#else
gemm<4, 1>(m0, m, n0, n);
#endif
break;
case 0x22:
mc = 2;
nc = 2;
gemm<2, 2>(m0, m, n0, n);
break;
case 0x14:
mc = 1;
nc = 4;
#if defined(__AVX512BF16__)
gemmMx4<1>(m0, m, n0, n);
#else
gemm<1, 4>(m0, m, n0, n);
#endif
break;
case 0x31:
mc = 3;
nc = 1;
gemm<3, 1>(m0, m, n0, n);
break;
case 0x13:
mc = 1;
nc = 3;
gemm<1, 3>(m0, m, n0, n);
break;
case 0x21:
mc = 2;
nc = 1;
gemm<2, 1>(m0, m, n0, n);
break;
case 0x12:
mc = 1;
nc = 2;
gemm<1, 2>(m0, m, n0, n);
break;
case 0x11:
mc = 1;
nc = 1;
gemm<1, 1>(m0, m, n0, n);
break;
default:
return;
}
mp = m0 + (m - m0) / mc * mc;
np = n0 + (n - n0) / nc * nc;
mnpack(mp, m, n0, np);
mnpack(m0, m, np, n);
}

#if defined(__AVX512BF16__)
template <int RN>
NOINLINE void gemm4xN(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / 4;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * 4;
int64_t jj = n0 + job % xtiles * RN;
__m256 Cv[RN][4] = {};
__m128 zerovec = _mm_setzero_ps();
for (int64_t l = 0; l < k; ++l) {
uint64_t a_delta = ((uint64_t)A[lda * (ii + 3) + l].d << 48) | ((uint64_t)A[lda * (ii + 2) + l].d << 32) | ((uint64_t)A[lda * (ii + 1) + l].d << 16) | (A[lda * (ii + 0) + l].d);
__m128bh da = m128bh(_mm_cvtepu16_epi32(_mm_set_epi64x(0, a_delta)));
__m256i avec0 = load(A + lda * (ii + 0) + l);
__m256i avec1 = load(A + lda * (ii + 1) + l);
__m256i avec2 = load(A + lda * (ii + 2) + l);
__m256i avec3 = load(A + lda * (ii + 3) + l);
for (int64_t j = 0; j < RN; ++j) {
__m128bh db = m128bh(_mm_set1_epi16(B[ldb * (jj + j) + l].d));
__m256 dvec = _mm256_castps128_ps256(_mm_dpbf16_ps(zerovec, da, db));
dvec = _mm256_permute2f128_ps(dvec, dvec, 0);
Cv[j][0] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
updot(_mm256_sign_epi8(avec0, avec0),
_mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec0)),
Cv[j][0]);
Cv[j][1] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
updot(_mm256_sign_epi8(avec1, avec1),
_mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec1)),
Cv[j][1]);
Cv[j][2] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
updot(_mm256_sign_epi8(avec2, avec2),
_mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec2)),
Cv[j][2]);
Cv[j][3] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
updot(_mm256_sign_epi8(avec3, avec3),
_mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec3)),
Cv[j][3]);
}
}

for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < 4; ++i)
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
}
}

template <int RM>
NOINLINE void gemmMx4(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / 4;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * 4;
__m256 Cv[4][RM] = {};
__m128 zerovec = _mm_setzero_ps();
for (int64_t l = 0; l < k; ++l) {
uint64_t b_delta = ((uint64_t)B[ldb * (jj + 3) + l].d << 48) | ((uint64_t)B[ldb * (jj + 2) + l].d << 32) | ((uint64_t)B[ldb * (jj + 1) + l].d << 16) | (B[ldb * (jj + 0) + l].d);
__m128bh db = m128bh(_mm_cvtepu16_epi32(_mm_set_epi64x(0, b_delta)));
__m256i bvec0 = load(B + ldb * (jj + 0) + l);
__m256i bvec1 = load(B + ldb * (jj + 1) + l);
__m256i bvec2 = load(B + ldb * (jj + 2) + l);
__m256i bvec3 = load(B + ldb * (jj + 3) + l);
for (int64_t i = 0; i < RM; ++i) {
__m128bh da = m128bh(_mm_set1_epi16((A[lda * (ii + i) + l].d)));
__m256 dvec = _mm256_castps128_ps256(_mm_dpbf16_ps(zerovec, da, db));
dvec = _mm256_permute2f128_ps(dvec, dvec, 0);
Cv[0][i] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
load(A + lda * (ii + i) + l)),
_mm256_sign_epi8(bvec0, load(A + lda * (ii + i) + l))),
Cv[0][i]);
Cv[1][i] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
load(A + lda * (ii + i) + l)),
_mm256_sign_epi8(bvec1, load(A + lda * (ii + i) + l))),
Cv[1][i]);
Cv[2][i] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
load(A + lda * (ii + i) + l)),
_mm256_sign_epi8(bvec2, load(A + lda * (ii + i) + l))),
Cv[2][i]);
Cv[3][i] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
load(A + lda * (ii + i) + l)),
_mm256_sign_epi8(bvec3, load(A + lda * (ii + i) + l))),
Cv[3][i]);
}
}
for (int64_t j = 0; j < 4; ++j)
for (int64_t i = 0; i < RM; ++i)
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
}
}
#endif

template <int RM, int RN>
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * RN;
__m256 Cv[RN][RM] = {};
for (int64_t l = 0; l < k; ++l)
for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < RM; ++i) {
#if defined(__AVX2__)
__m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
load(A + lda * (ii + i) + l)),
_mm256_sign_epi8(load(B + ldb * (jj + j) + l),
load(A + lda * (ii + i) + l)));
#else
__m128i ali0 = load0(A + lda * (ii + i) + l);
__m128i ali1 = load1(A + lda * (ii + i) + l);
__m128i blj0 = load0(B + ldb * (jj + j) + l);
__m128i blj1 = load1(B + ldb * (jj + j) + l);

__m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
__m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
__m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
__m128i sepBA1 = _mm_sign_epi8(blj1, ali1);

// updot
const __m128i oneFill = _mm_set1_epi16(1);
__m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
__m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
__m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
#endif

Cv[j][i] = madd(_mm256_set1_ps(bf16_unhalf(ggml_make_bf16(A[lda * (ii + i) + l].d)) *
bf16_unhalf(ggml_make_bf16(B[ldb * (jj + j) + l].d))),
udTmp,
Cv[j][i]);
}
for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < RM; ++i)
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
}
}

inline __m256i load(const block_q8_0 *b) {
return _mm256_loadu_si256((const __m256i *)b->qs);
}

inline __m128i load0(const block_q8_0 *b) {
return _mm_loadu_si128((const __m128i *)b->qs);
}

inline __m128i load1(const block_q8_0 *b) {
return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
}

inline __m256i load(const block_q4_0 *b) {
return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
}

inline __m128i load0(const block_q4_0 *b) {
const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
}

inline __m128i load1(const block_q4_0 *b) {
const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
}

inline __m256 updot(__m256i u, __m256i s) {
__m256i res;
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
#else
res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
#endif
return _mm256_cvtepi32_ps(res);
}

static inline __m256i denibble(const uint8_t *p) {
__m128i x = _mm_loadu_si128((const __m128i *)p);
return _mm256_and_si256(_mm256_set1_epi8(15),
_mm256_insertf128_si256(_mm256_castsi128_si256(x),
_mm_srli_epi16(x, 4), 1));
}

const TA *const A;
const TB *const B;
TC *const C;
const int64_t k;
const int64_t lda;
const int64_t ldb;
const int64_t ldc;
const int ith;
const int nth;
};
#endif // __AVX2__

} // namespace

/**

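Both the vec_dot kernels earlier in this diff and the GEMM class above combine block scales with the same AVX512-BF16 trick: four bf16 deltas are packed into a 64-bit word, zero-extended into the low halves of 32-bit lanes, and _mm_dpbf16_ps then produces the four products d_x*d_y as fp32 in one instruction. A stripped-down sketch of just that step follows; it assumes <immintrin.h>, a compiler targeting AVX512-BF16, and uses the same vector cast that the m128bh() helper defined earlier in this diff performs.

#include <immintrin.h>
#include <stdint.h>

// Given the bf16 delta bits of four x blocks and four y blocks, return the
// four combined scales d_x[i] * d_y[i] in the lanes of a __m128.
static inline __m128 combine_four_bf16_deltas(const uint16_t xd[4], const uint16_t yd[4]) {
    uint64_t xbits = ((uint64_t)xd[3] << 48) | ((uint64_t)xd[2] << 32) | ((uint64_t)xd[1] << 16) | xd[0];
    uint64_t ybits = ((uint64_t)yd[3] << 48) | ((uint64_t)yd[2] << 32) | ((uint64_t)yd[1] << 16) | yd[0];
    // Zero-extend each bf16 value into its own 32-bit lane; the upper bf16 of
    // every lane pair is zero, so the pairwise dot product collapses to one
    // product per lane.
    __m128bh xv = (__m128bh)_mm_cvtepu16_epi32(_mm_set_epi64x(0, xbits));
    __m128bh yv = (__m128bh)_mm_cvtepu16_epi32(_mm_set_epi64x(0, ybits));
    return _mm_dpbf16_ps(_mm_setzero_ps(), xv, yv);
}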
@@ -1006,6 +1386,38 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
#endif
}

case GGML_TYPE_Q8_0_B16: {
if (Btype != GGML_TYPE_Q8_0_B16)
return false;
#if defined(__AVX2__) || defined(__AVX512F__)
tinyBLAS_Q0_B16_AVX<block_q8_0, block_q8_0, float> tb{
k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n, task);
return true;
#else
return false;
#endif
}

case GGML_TYPE_Q4_0_B16: {
if (Btype != GGML_TYPE_Q8_0_B16)
return false;
#if defined(__AVX2__) || defined(__AVX512F__)
tinyBLAS_Q0_B16_AVX<block_q4_0, block_q8_0, float> tb{
k, (const block_q4_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n, task);
return true;
#else
return false;
#endif
}

default:
return false;
}

@@ -1254,6 +1254,7 @@ class GGUFValueType(IntEnum):
# Items here are (block size, type size)
QK_K = 256
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
<<<<<<< HEAD
GGMLQuantizationType.F32: (1, 4),
GGMLQuantizationType.F16: (1, 2),
GGMLQuantizationType.Q4_0: (32, 2 + 16),

@@ -1286,6 +1287,39 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
=======
GGMLQuantizationType.F32: (1, 4),
GGMLQuantizationType.F16: (1, 2),
GGMLQuantizationType.Q4_0: (32, 2 + 16),
GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
GGMLQuantizationType.Q8_0: (32, 2 + 32),
GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12),
GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12),
GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8),
GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4),
GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32),
GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8),
GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16),
GGMLQuantizationType.IQ4_NL: (32, 2 + 16),
GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16),
GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64),
GGMLQuantizationType.I8: (1, 1),
GGMLQuantizationType.I16: (1, 2),
GGMLQuantizationType.I32: (1, 4),
GGMLQuantizationType.I64: (1, 8),
GGMLQuantizationType.F64: (1, 8),
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
GGMLQuantizationType.BF16: (1, 2),
GGMLQuantizationType.Q4_0_B16: (32, 2 + 16),
GGMLQuantizationType.Q8_0_B16: (32, 2 + 32),
>>>>>>> 2f13a1e6 (Introduce Q4_0 and Q8_0 quantizations with BF16 delta values)
}

@@ -7757,6 +7757,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0_B16 ||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
)
)) {

@@ -15479,7 +15480,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
new_type = GGML_TYPE_Q5_K;
}
else if (new_type != GGML_TYPE_Q8_0) {
else if ((new_type != GGML_TYPE_Q8_0) && (new_type != GGML_TYPE_Q8_0_B16)) {
new_type = GGML_TYPE_Q6_K;
}
}

@@ -15620,12 +15621,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
new_type = GGML_TYPE_Q5_K;
}
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0 || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_B16)
&& qs.has_imatrix && i_layer < n_layer/8) {
// Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
// We only do it when an imatrix is provided because a) we want to make sure that one can always get the
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
new_type = ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0) || (ftype == LLAMA_FTYPE_MOSTLY_Q4_0_B16)) ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
}
++qs.i_ffn_down;
} else if (name.find("attn_output.weight") != std::string::npos) {

@@ -15782,10 +15783,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

switch (params->ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_B16: default_type = GGML_TYPE_Q4_0_B16; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
case LLAMA_FTYPE_MOSTLY_Q8_0_B16: default_type = GGML_TYPE_Q8_0_B16; break;
case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;