diff --git a/ggml-aarch64.c b/ggml-aarch64.c index 28a92759f..f5b6ec896 100644 --- a/ggml-aarch64.c +++ b/ggml-aarch64.c @@ -21,19 +21,19 @@ // Functions to create the interleaved data layout formats -// interleave 4 block_q4_0s in blocks of block_len +// interleave 4 block_q4_0s in blocks of interleave_blcksize // returns an interleaved block_q4_0x4 // in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks -// first, then interleave quants from 4 block_q4_0s in blocks of block_len +// first, then interleave quants from 4 block_q4_0s in blocks of interleave_blcksize // -// - in : an array of block_q4_0 pointers -// - block_len : the block_q4_0 quants bytes are interleaved in blocks of -// block_len bytes -// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes -// from bias offset form to pure sign form (this saves subtract -// operations durin unpacking) +// - in : an array of block_q4_0 pointers +// - interleave_blcksize : the block_q4_0 quants bytes are interleaved in blocks of +// interleave_blcksize bytes +// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes +// from bias offset form to pure sign form (this saves subtract +// operations durin unpacking) // -static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, unsigned int xor_mask) { +static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) { block_q4_0x4 out; for (int i = 0; i < 4; i++) { @@ -41,9 +41,9 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, u } for (int i = 0; i < QK4_0 * 2; i++) { - int src_offset = (i / (4 * block_len)) * block_len; - int src_id = (i % (4 * block_len)) / block_len; - src_offset += (i % block_len); + int src_offset = (i / (4 * interleave_blcksize)) * interleave_blcksize; + int src_id = (i % (4 * interleave_blcksize)) / interleave_blcksize; + src_offset += (i % interleave_blcksize); out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask; } @@ -51,11 +51,11 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, u return out; } -// interleave 8 block_q4_0s in blocks of block_len +// interleave 8 block_q4_0s in blocks of interleave_blcksize // returns an interleaved block_q4_0x8 // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks -// first, then interleave quants from 8 block_q4_0s in blocks of block_len -static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int block_len, unsigned int xor_mask) { +// first, then interleave quants from 8 block_q4_0s in blocks of interleave_blcksize +static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) { block_q4_0x8 out; for (int i = 0; i < 8; i++) { @@ -63,9 +63,9 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int block_len, u } for (int i = 0; i < QK4_0 * 4; i++) { - int src_offset = (i / (8 * block_len)) * block_len; - int src_id = (i % (8 * block_len)) / block_len; - src_offset += (i % block_len); + int src_offset = (i / (8 * interleave_blcksize)) * interleave_blcksize; + int src_id = (i % (8 * interleave_blcksize)) / interleave_blcksize; + src_offset += (i % interleave_blcksize); out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask; } @@ -135,7 +135,35 @@ void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) } } #else - assert(false); + // scalar + const int interleave_blcksize = 4; + float srcv[4][QK8_0]; + float id[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; + amax = MAX(amax, fabsf(srcv[row_iter][j])); + } + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize; + int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize; + src_offset += (j % interleave_blcksize); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0);; + } + } #endif } @@ -225,11 +253,47 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) } } #else - assert(false); + // scalar + const int interleave_blcksize = 8; + float srcv[4][QK8_0]; + float id[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; + amax = MAX(amax, fabsf(srcv[row_iter][j])); + } + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize; + int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize; + src_offset += (j % interleave_blcksize); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0);; + } + } #endif } -static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blocklen_per_row) { +void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t interleave_blcksize) { + assert(nrow == 4); + UNUSED(nrow); + if (interleave_blcksize == 4) quantize_q8_0_4x4(x, vy, n_per_row); + else if (interleave_blcksize == 8) quantize_q8_0_4x8(x, vy, n_per_row); + else assert(false); +} + +static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int interleave_blcksize) { assert(n_per_row % QK4_0 == 0); const int nb = n_per_row / QK4_0; @@ -251,11 +315,11 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds } if (nrows_interleaved == 8) { - *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blocklen_per_row, 0x88); + *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, interleave_blcksize, 0x88); out_ptr = (block_q4_0x8 *) out_ptr + 1; } else if (nrows_interleaved == 4) { - *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blocklen_per_row, 0x88); + *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, interleave_blcksize, 0x88); out_ptr = (block_q4_0x4 *) out_ptr + 1; } } diff --git a/ggml-aarch64.h b/ggml-aarch64.h index 53f9d518d..65ead1efe 100644 --- a/ggml-aarch64.h +++ b/ggml-aarch64.h @@ -16,6 +16,8 @@ extern "C" { void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t interleave_blcksize); + // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 0c526c47e..0f663971d 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2412,7 +2412,8 @@ extern "C" { typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); - typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, + int64_t k, int64_t bx); typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, @@ -2430,6 +2431,7 @@ extern "C" { enum ggml_type vec_dot_type; int64_t nrows; // number of rows to process simultaneously; int64_t ncols; // number of columns to process simultaneously; + int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize; ggml_from_float_to_mat_t from_float_to_mat; ggml_gemv_t gemv; ggml_gemm_t gemm; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index babebc7bb..6b5bdad16 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -702,11 +702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, #if defined (__ARM_FEATURE_MATMUL_INT8) .nrows = 2, - .from_float_to_mat = quantize_q8_0_4x8, #else .nrows = 1, - .from_float_to_mat = quantize_q8_0_4x4, #endif + .from_float_to_mat = quantize_mat_q8_0, }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -917,6 +916,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, .ncols = 4, + .interleave_blcksize = 4, .gemv = ggml_gemv_q4_0_4x4_q8_0, .gemm = ggml_gemm_q4_0_4x4_q8_0, }, @@ -932,6 +932,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, .ncols = 4, + .interleave_blcksize = 8, .gemv = ggml_gemv_q4_0_4x8_q8_0, .gemm = ggml_gemm_q4_0_4x8_q8_0, }, @@ -947,6 +948,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, .ncols = 8, + .interleave_blcksize = 8, .gemv = ggml_gemv_q4_0_8x8_q8_0, .gemm = ggml_gemm_q4_0_8x8_q8_0, } @@ -12207,6 +12209,7 @@ static void ggml_compute_forward_mul_mat( ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; int64_t const vec_dot_num_rows = type_traits[type].nrows; int64_t const matmul_num_cols = type_traits[type].ncols; + int64_t const interleave_blcksize = type_traits[type].interleave_blcksize; ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat; ggml_gemv_t const gemv = type_traits[type].gemv; @@ -12281,7 +12284,7 @@ UseGgmlGemm1:; int64_t i11_processed = 0; if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) { for (int64_t i11 = 0; i11 < ne11 - ne11 % 4; i11 += 4) { - from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, 4, ne10, interleave_blcksize); wdata += row_size * 4; } i11_processed = ne11 - ne11 % 4;