Arm AArch64: minor code refactoring, and add reference scalar code to quantize routines for new quant types

Dibakar Gope 2024-07-03 12:38:11 +00:00
parent cbbfd69f42
commit 356464454b
4 changed files with 98 additions and 27 deletions

View file

@@ -21,19 +21,19 @@
// Functions to create the interleaved data layout formats
// interleave 4 block_q4_0s in blocks of block_len
// interleave 4 block_q4_0s in blocks of interleave_blcksize
// returns an interleaved block_q4_0x4
// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
// first, then interleave quants from 4 block_q4_0s in blocks of block_len
// first, then interleave quants from 4 block_q4_0s in blocks of interleave_blcksize
//
// - in : an array of block_q4_0 pointers
// - block_len : the block_q4_0 quants bytes are interleaved in blocks of
// block_len bytes
// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
// from bias offset form to pure sign form (this saves subtract
// operations during unpacking)
// - in : an array of block_q4_0 pointers
// - interleave_blcksize : the block_q4_0 quants bytes are interleaved in blocks of
// interleave_blcksize bytes
// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
// from bias offset form to pure sign form (this saves subtract
// operations during unpacking)
//
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, unsigned int xor_mask) {
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) {
block_q4_0x4 out;
for (int i = 0; i < 4; i++) {
@@ -41,9 +41,9 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, u
}
for (int i = 0; i < QK4_0 * 2; i++) {
int src_offset = (i / (4 * block_len)) * block_len;
int src_id = (i % (4 * block_len)) / block_len;
src_offset += (i % block_len);
int src_offset = (i / (4 * interleave_blcksize)) * interleave_blcksize;
int src_id = (i % (4 * interleave_blcksize)) / interleave_blcksize;
src_offset += (i % interleave_blcksize);
out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
}
@@ -51,11 +51,11 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, u
return out;
}
// interleave 8 block_q4_0s in blocks of block_len
// interleave 8 block_q4_0s in blocks of interleave_blcksize
// returns an interleaved block_q4_0x8
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
// first, then interleave quants from 8 block_q4_0s in blocks of block_len
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int block_len, unsigned int xor_mask) {
// first, then interleave quants from 8 block_q4_0s in blocks of interleave_blcksize
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) {
block_q4_0x8 out;
for (int i = 0; i < 8; i++) {
@@ -63,9 +63,9 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int block_len, u
}
for (int i = 0; i < QK4_0 * 4; i++) {
int src_offset = (i / (8 * block_len)) * block_len;
int src_id = (i % (8 * block_len)) / block_len;
src_offset += (i % block_len);
int src_offset = (i / (8 * interleave_blcksize)) * interleave_blcksize;
int src_id = (i % (8 * interleave_blcksize)) / interleave_blcksize;
src_offset += (i % interleave_blcksize);
out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
}
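
For illustration (not part of the commit), a minimal standalone C sketch of the index math used by make_block_q4_0x4 and make_block_q4_0x8 above, assuming QK4_0 == 32 so each block_q4_0 holds 16 quant bytes and the x4 output holds QK4_0 * 2 == 64 bytes:

#include <stdio.h>

// Print which source block and byte each interleaved output byte is taken from,
// for the 4-block case with interleave_blcksize == 4.
int main(void) {
    const int interleave_blcksize = 4;
    for (int i = 0; i < 32 * 2; i++) {                 // QK4_0 * 2 output bytes
        int src_offset = (i / (4 * interleave_blcksize)) * interleave_blcksize;
        int src_id     = (i % (4 * interleave_blcksize)) / interleave_blcksize;
        src_offset    += (i % interleave_blcksize);
        // i = 0..3   -> in[0].qs[0..3], i = 4..7 -> in[1].qs[0..3], ...,
        // i = 16..19 -> in[0].qs[4..7], and so on.
        printf("out.qs[%2d] <- in[%d].qs[%2d]\n", i, src_id, src_offset);
    }
    return 0;
}

The 8-block variant follows the same pattern with 8 blocks in the divisor and modulus and QK4_0 * 4 output bytes.
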
@@ -135,7 +135,35 @@ void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k)
}
}
#else
assert(false);
// scalar
const int interleave_blcksize = 4;
float srcv[4][QK8_0];
float id[4];
for (int i = 0; i < nb; i++) {
for (int row_iter = 0; row_iter < 4; row_iter++) {
float amax = 0.0f; // absolute max
for (int j = 0; j < QK8_0; j++) {
srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
amax = MAX(amax, fabsf(srcv[row_iter][j]));
}
const float d = amax / ((1 << 7) - 1);
id[row_iter] = d ? 1.0f / d : 0.0f;
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
}
for (int j = 0; j < QK8_0 * 4; j++) {
int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize;
int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize;
src_offset += (j % interleave_blcksize);
float x0 = srcv[src_id][src_offset] * id[src_id];
y[i].qs[j] = roundf(x0);
}
}
#endif
}
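
As a quick sanity check on the per-row scale math in the scalar path above (the input values here are hypothetical, not from the commit):

#include <math.h>
#include <stdio.h>

int main(void) {
    // Suppose the largest magnitude among a row's QK8_0 values is 63.5.
    const float amax = 63.5f;
    const float d    = amax / ((1 << 7) - 1);   // scale: 63.5 / 127 = 0.5
    const float id   = d ? 1.0f / d : 0.0f;     // inverse scale: 2.0
    // A source value of 10.25 then quantizes to roundf(10.25 * 2.0) = 21.
    printf("d = %.3f  q(10.25) = %d\n", d, (int) roundf(10.25f * id));
    return 0;
}
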
@@ -225,11 +253,47 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k)
}
}
#else
assert(false);
// scalar
const int interleave_blcksize = 8;
float srcv[4][QK8_0];
float id[4];
for (int i = 0; i < nb; i++) {
for (int row_iter = 0; row_iter < 4; row_iter++) {
float amax = 0.0f; // absolute max
for (int j = 0; j < QK8_0; j++) {
srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
amax = MAX(amax, fabsf(srcv[row_iter][j]));
}
const float d = amax / ((1 << 7) - 1);
id[row_iter] = d ? 1.0f / d : 0.0f;
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
}
for (int j = 0; j < QK8_0 * 4; j++) {
int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize;
int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize;
src_offset += (j % interleave_blcksize);
float x0 = srcv[src_id][src_offset] * id[src_id];
y[i].qs[j] = roundf(x0);
}
}
#endif
}
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blocklen_per_row) {
void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t interleave_blcksize) {
assert(nrow == 4);
UNUSED(nrow);
if (interleave_blcksize == 4) quantize_q8_0_4x4(x, vy, n_per_row);
else if (interleave_blcksize == 8) quantize_q8_0_4x8(x, vy, n_per_row);
else assert(false);
}
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int interleave_blcksize) {
assert(n_per_row % QK4_0 == 0);
const int nb = n_per_row / QK4_0;
@@ -251,11 +315,11 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
}
if (nrows_interleaved == 8) {
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blocklen_per_row, 0x88);
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, interleave_blcksize, 0x88);
out_ptr = (block_q4_0x8 *) out_ptr + 1;
}
else if (nrows_interleaved == 4) {
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blocklen_per_row, 0x88);
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, interleave_blcksize, 0x88);
out_ptr = (block_q4_0x4 *) out_ptr + 1;
}
}
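
A hypothetical caller sketch for the new quantize_mat_q8_0 entry point (the output type name block_q8_0x4, QK8_0 == 32, and the buffer sizing are inferred from the routines above rather than spelled out in this hunk):

// Quantize a 4 x 64 tile of floats into the interleaved Q8_0 layout;
// interleave_blcksize == 8 dispatches to quantize_q8_0_4x8 above.
static void quantize_tile_example(const float src[4 * 64], block_q8_0x4 dst[64 / QK8_0]) {
    quantize_mat_q8_0(src, dst, /*nrow=*/4, /*n_per_row=*/64, /*interleave_blcksize=*/8);
}

The quantize_q4_0_4x4 and quantize_q4_0_4x8 entry points declared in the header below presumably forward to quantize_q4_0_nr_bl with (nrows_interleaved, interleave_blcksize) of (4, 4) and (4, 8) respectively.
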

View file

@@ -16,6 +16,8 @@ extern "C" {
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t interleave_blcksize);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

View file

@@ -2412,7 +2412,8 @@ extern "C" {
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr,
int64_t k, int64_t bx);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx,
@@ -2430,6 +2431,7 @@ extern "C" {
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously;
int64_t ncols; // number of columns to process simultaneously;
int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_gemv_t gemv;
ggml_gemm_t gemm;

View file

@@ -702,11 +702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)
.nrows = 2,
.from_float_to_mat = quantize_q8_0_4x8,
#else
.nrows = 1,
.from_float_to_mat = quantize_q8_0_4x4,
#endif
.from_float_to_mat = quantize_mat_q8_0,
},
[GGML_TYPE_Q8_1] = {
.type_name = "q8_1",
@@ -917,6 +916,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.interleave_blcksize = 4,
.gemv = ggml_gemv_q4_0_4x4_q8_0,
.gemm = ggml_gemm_q4_0_4x4_q8_0,
},
@@ -932,6 +932,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.interleave_blcksize = 8,
.gemv = ggml_gemv_q4_0_4x8_q8_0,
.gemm = ggml_gemm_q4_0_4x8_q8_0,
},
@@ -947,6 +948,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 8,
.interleave_blcksize = 8,
.gemv = ggml_gemv_q4_0_8x8_q8_0,
.gemm = ggml_gemm_q4_0_8x8_q8_0,
}
@@ -12207,6 +12209,7 @@ static void ggml_compute_forward_mul_mat(
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
int64_t const vec_dot_num_rows = type_traits[type].nrows;
int64_t const matmul_num_cols = type_traits[type].ncols;
int64_t const interleave_blcksize = type_traits[type].interleave_blcksize;
ggml_from_float_to_mat_t const from_float_to_mat
= type_traits[vec_dot_type].from_float_to_mat;
ggml_gemv_t const gemv = type_traits[type].gemv;
@@ -12281,7 +12284,7 @@ UseGgmlGemm1:;
int64_t i11_processed = 0;
if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
for (int64_t i11 = 0; i11 < ne11 - ne11 % 4; i11 += 4) {
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, 4, ne10, interleave_blcksize);
wdata += row_size * 4;
}
i11_processed = ne11 - ne11 % 4;
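
A worked example of the 4-row tile bookkeeping above (the per-row remainder path is outside this hunk, so that part is a hedged reading):

#include <stdio.h>

int main(void) {
    // With ne11 = 10 rows in src1, the loop above quantizes rows 0..3 and 4..7
    // through from_float_to_mat, then sets i11_processed = 8; rows 8..9 are
    // presumably left to the existing per-row quantization path that follows.
    const long ne11 = 10;
    printf("tiles = %ld, i11_processed = %ld\n", ne11 / 4, ne11 - ne11 % 4);   // 2, 8
    return 0;
}
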