Arm AArch64: minor code refactoring, and add reference scalar code to quantize routines for new quant types
This commit is contained in:
parent
cbbfd69f42
commit
356464454b
4 changed files with 98 additions and 27 deletions
110
ggml-aarch64.c
110
ggml-aarch64.c
|
@ -21,19 +21,19 @@
|
|||
|
||||
// Functions to create the interleaved data layout formats
|
||||
|
||||
// interleave 4 block_q4_0s in blocks of block_len
|
||||
// interleave 4 block_q4_0s in blocks of interleave_blcksize
|
||||
// returns an interleaved block_q4_0x4
|
||||
// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
|
||||
// first, then interleave quants from 4 block_q4_0s in blocks of block_len
|
||||
// first, then interleave quants from 4 block_q4_0s in blocks of interleave_blcksize
|
||||
//
|
||||
// - in : an array of block_q4_0 pointers
|
||||
// - block_len : the block_q4_0 quants bytes are interleaved in blocks of
|
||||
// block_len bytes
|
||||
// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
|
||||
// from bias offset form to pure sign form (this saves subtract
|
||||
// operations during unpacking)
|
||||
// - in : an array of block_q4_0 pointers
|
||||
// - interleave_blcksize : the block_q4_0 quants bytes are interleaved in blocks of
|
||||
// interleave_blcksize bytes
|
||||
// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
|
||||
// from bias offset form to pure sign form (this saves subtract
|
||||
// operations during unpacking)
|
||||
//
|
||||
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, unsigned int xor_mask) {
|
||||
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) {
|
||||
block_q4_0x4 out;
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
|
@ -41,9 +41,9 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, u
|
|||
}
|
||||
|
||||
for (int i = 0; i < QK4_0 * 2; i++) {
|
||||
int src_offset = (i / (4 * block_len)) * block_len;
|
||||
int src_id = (i % (4 * block_len)) / block_len;
|
||||
src_offset += (i % block_len);
|
||||
int src_offset = (i / (4 * interleave_blcksize)) * interleave_blcksize;
|
||||
int src_id = (i % (4 * interleave_blcksize)) / interleave_blcksize;
|
||||
src_offset += (i % interleave_blcksize);
|
||||
|
||||
out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
|
||||
}
|
||||
|
@ -51,11 +51,11 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, u
|
|||
return out;
|
||||
}
|
||||
|
||||
// interleave 8 block_q4_0s in blocks of block_len
|
||||
// interleave 8 block_q4_0s in blocks of interleave_blcksize
|
||||
// returns an interleaved block_q4_0x8
|
||||
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
|
||||
// first, then interleave quants from 8 block_q4_0s in blocks of block_len
|
||||
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int block_len, unsigned int xor_mask) {
|
||||
// first, then interleave quants from 8 block_q4_0s in blocks of interleave_blcksize
|
||||
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) {
|
||||
block_q4_0x8 out;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
|
@ -63,9 +63,9 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int block_len, u
|
|||
}
|
||||
|
||||
for (int i = 0; i < QK4_0 * 4; i++) {
|
||||
int src_offset = (i / (8 * block_len)) * block_len;
|
||||
int src_id = (i % (8 * block_len)) / block_len;
|
||||
src_offset += (i % block_len);
|
||||
int src_offset = (i / (8 * interleave_blcksize)) * interleave_blcksize;
|
||||
int src_id = (i % (8 * interleave_blcksize)) / interleave_blcksize;
|
||||
src_offset += (i % interleave_blcksize);
|
||||
|
||||
out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
|
||||
}
|
||||
|
@ -135,7 +135,35 @@ void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k)
|
|||
}
|
||||
}
|
||||
#else
|
||||
assert(false);
|
||||
// scalar
|
||||
const int interleave_blcksize = 4;
|
||||
float srcv[4][QK8_0];
|
||||
float id[4];
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int row_iter = 0; row_iter < 4; row_iter++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
|
||||
amax = MAX(amax, fabsf(srcv[row_iter][j]));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
id[row_iter] = d ? 1.0f / d : 0.0f;
|
||||
|
||||
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
|
||||
}
|
||||
|
||||
for (int j = 0; j < QK8_0 * 4; j++) {
|
||||
int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize;
|
||||
int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize;
|
||||
src_offset += (j % interleave_blcksize);
|
||||
|
||||
float x0 = srcv[src_id][src_offset] * id[src_id];
|
||||
y[i].qs[j] = roundf(x0);;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -225,11 +253,47 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k)
|
|||
}
|
||||
}
|
||||
#else
|
||||
assert(false);
|
||||
// scalar
|
||||
const int interleave_blcksize = 8;
|
||||
float srcv[4][QK8_0];
|
||||
float id[4];
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int row_iter = 0; row_iter < 4; row_iter++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
|
||||
amax = MAX(amax, fabsf(srcv[row_iter][j]));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
id[row_iter] = d ? 1.0f / d : 0.0f;
|
||||
|
||||
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
|
||||
}
|
||||
|
||||
for (int j = 0; j < QK8_0 * 4; j++) {
|
||||
int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize;
|
||||
int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize;
|
||||
src_offset += (j % interleave_blcksize);
|
||||
|
||||
float x0 = srcv[src_id][src_offset] * id[src_id];
|
||||
y[i].qs[j] = roundf(x0);;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blocklen_per_row) {
|
||||
void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t interleave_blcksize) {
|
||||
assert(nrow == 4);
|
||||
UNUSED(nrow);
|
||||
if (interleave_blcksize == 4) quantize_q8_0_4x4(x, vy, n_per_row);
|
||||
else if (interleave_blcksize == 8) quantize_q8_0_4x8(x, vy, n_per_row);
|
||||
else assert(false);
|
||||
}
|
||||
|
||||
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int interleave_blcksize) {
|
||||
assert(n_per_row % QK4_0 == 0);
|
||||
const int nb = n_per_row / QK4_0;
|
||||
|
||||
|
@ -251,11 +315,11 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
|
|||
}
|
||||
|
||||
if (nrows_interleaved == 8) {
|
||||
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blocklen_per_row, 0x88);
|
||||
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, interleave_blcksize, 0x88);
|
||||
out_ptr = (block_q4_0x8 *) out_ptr + 1;
|
||||
}
|
||||
else if (nrows_interleaved == 4) {
|
||||
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blocklen_per_row, 0x88);
|
||||
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, interleave_blcksize, 0x88);
|
||||
out_ptr = (block_q4_0x4 *) out_ptr + 1;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,6 +16,8 @@ extern "C" {
|
|||
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t interleave_blcksize);
|
||||
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
|
|
@ -2412,7 +2412,8 @@ extern "C" {
|
|||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr,
|
||||
int64_t k, int64_t bx);
|
||||
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx,
|
||||
const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx,
|
||||
|
@ -2430,6 +2431,7 @@ extern "C" {
|
|||
enum ggml_type vec_dot_type;
|
||||
int64_t nrows; // number of rows to process simultaneously;
|
||||
int64_t ncols; // number of columns to process simultaneously;
|
||||
int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
|
||||
ggml_from_float_to_mat_t from_float_to_mat;
|
||||
ggml_gemv_t gemv;
|
||||
ggml_gemm_t gemm;
|
||||
|
|
|
@ -702,11 +702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
.nrows = 2,
|
||||
.from_float_to_mat = quantize_q8_0_4x8,
|
||||
#else
|
||||
.nrows = 1,
|
||||
.from_float_to_mat = quantize_q8_0_4x4,
|
||||
#endif
|
||||
.from_float_to_mat = quantize_mat_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q8_1] = {
|
||||
.type_name = "q8_1",
|
||||
|
@ -917,6 +916,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 4,
|
||||
.interleave_blcksize = 4,
|
||||
.gemv = ggml_gemv_q4_0_4x4_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_4x4_q8_0,
|
||||
},
|
||||
|
@ -932,6 +932,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 4,
|
||||
.interleave_blcksize = 8,
|
||||
.gemv = ggml_gemv_q4_0_4x8_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_4x8_q8_0,
|
||||
},
|
||||
|
@ -947,6 +948,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 8,
|
||||
.interleave_blcksize = 8,
|
||||
.gemv = ggml_gemv_q4_0_8x8_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
||||
}
|
||||
|
@ -12207,6 +12209,7 @@ static void ggml_compute_forward_mul_mat(
|
|||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
||||
int64_t const matmul_num_cols = type_traits[type].ncols;
|
||||
int64_t const interleave_blcksize = type_traits[type].interleave_blcksize;
|
||||
ggml_from_float_to_mat_t const from_float_to_mat
|
||||
= type_traits[vec_dot_type].from_float_to_mat;
|
||||
ggml_gemv_t const gemv = type_traits[type].gemv;
|
||||
|
@ -12281,7 +12284,7 @@ UseGgmlGemm1:;
|
|||
int64_t i11_processed = 0;
|
||||
if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
|
||||
for (int64_t i11 = 0; i11 < ne11 - ne11 % 4; i11 += 4) {
|
||||
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
|
||||
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, 4, ne10, interleave_blcksize);
|
||||
wdata += row_size * 4;
|
||||
}
|
||||
i11_processed = ne11 - ne11 % 4;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue