Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization, and refactor the code to address review suggestions on llama.cpp PR #5780

Dibakar Gope 2024-04-22 08:08:17 +00:00 committed by Dibakar Gope
parent 002e36eaec
commit 340ef07fca
7 changed files with 1941 additions and 740 deletions

View file

@@ -46,6 +46,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "Q4_0_AARCH64", LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

View file

@@ -384,6 +384,7 @@ extern "C" {
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_AARCH64 = 31,
GGML_TYPE_COUNT,
};
@@ -425,6 +426,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_AARCH64 = 25, // except 1d tensors
};
// available tensor operations:
@@ -603,11 +605,6 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu
// char padding[4];
char padding[9];
void * rearranged_weight_gemv;
void * rearranged_weight_gemm;
bool weight_rearranged;
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -2397,6 +2394,7 @@ extern "C" {
GGML_API int ggml_cpu_has_rpc (void);
GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void);
GGML_API int ggml_cpu_has_sve (void);
//
// Internal types and functions exposed for tests and benchmarks
@@ -2412,6 +2410,9 @@ extern "C" {
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k, int n, int b);
typedef void (*ggml_gemv_t) (size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
typedef void (*ggml_gemm_t) (size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
typedef struct {
const char * type_name;
@@ -2424,19 +2425,13 @@ extern "C" {
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_gemv_t gemv;
ggml_gemm_t gemm;
} ggml_type_traits_t;
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
GGML_API void rearrange_q4_0_weights_blocked8_neon(struct ggml_tensor * cur);
GGML_API void rearrange_q4_0_weights_blocked8_sve(struct ggml_tensor * cur);
GGML_API void rearrange_q4_0_weights_for_gemv(struct ggml_tensor * cur);
GGML_API void rearrange_q4_0_weights_for_gemm(struct ggml_tensor * cur);
GGML_API void rearrange_q8_0_weights_blocked8_neon(struct ggml_tensor * cur);
GGML_API void rearrange_q8_0_weights_blocked8_sve(struct ggml_tensor * cur);
GGML_API void rearrange_q8_0_weights_for_gemv(struct ggml_tensor * cur);
GGML_API void rearrange_q8_0_weights_for_gemm(struct ggml_tensor * cur);
#ifdef __cplusplus
}
#endif
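The three new typedefs above (ggml_from_float_to_mat_t, ggml_gemv_t, ggml_gemm_t) replace the per-tensor rearranged_weight_gemv/rearranged_weight_gemm pointers removed from struct ggml_tensor: the optimized routines now hang off the type traits rather than off each tensor. A minimal sketch of how a caller could probe for them, assuming only the patched ggml.h shown here (try_aarch64_kernels is a hypothetical helper, not part of the patch):

#include <stddef.h>
#include "ggml.h"

static void try_aarch64_kernels(void) {
    ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_Q4_0_AARCH64);

    // gemv/gemm stay NULL on builds (or types) without an optimized kernel,
    // so callers keep the generic vec_dot path in that case.
    if (tt.gemv != NULL && tt.gemm != NULL) {
        // tt.gemv(depth, output_channels, /*height=*/1, dst, packed_weights, packed_activations, ith, nth);
    }
}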

File diff suppressed because it is too large

View file

@@ -70,24 +70,6 @@ typedef struct {
} block_q4_0x8;
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_fp16_t) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
typedef struct {
ggml_fp16_t d[16]; // deltas for 16 q4_0 blocks
uint8_t qs[QK4_0 * 8]; // nibbles / quants for 16 q4_0 blocks
} block_q4_0x16;
static_assert(sizeof(block_q4_0x16) == 16 * sizeof(ggml_fp16_t) + QK4_0 * 8, "wrong q4_0x16 block size/padding");
typedef struct {
ggml_fp16_t d[64]; // deltas for 64 q4_0 blocks
uint8_t qs[QK4_0 * 32];// nibbles / quants for 64 q4_0 blocks
} block_q4_0x64;
static_assert(sizeof(block_q4_0x64) == 64 * sizeof(ggml_fp16_t) + QK4_0 * 32, "wrong q4_0x64 block size/padding");
typedef struct {
ggml_fp16_t d[2]; // deltas for 2 q8_0 blocks
int8_t qs[QK8_0 * 2]; // quants for 2 q8_0 blocks
} block_q8_0x2;
static_assert(sizeof(block_q8_0x2) == 2 * sizeof(ggml_fp16_t) + QK8_0 * 2, "wrong q8_0x2 block size/padding");
typedef struct {
ggml_fp16_t d[4]; // deltas for 4 q8_0 blocks
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
@@ -366,30 +348,34 @@ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_aarch64(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);
block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len);
block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len);
block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask);
block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask);
block_q8_0x4 make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int block_len);
block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len);
void quantize_row_q8_0_and_make_block_q8_0x2(const float * restrict x, void * restrict vy, int k, int rows_interleaved);
void quantize_row_q8_0_and_make_block_q8_0x4(const float * restrict x, void * restrict vy, int k, int rows_interleaved);
void quantize_row_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k, int nrows_interleaved, int blocklen_per_row);
// GEMV
void ggml_gemv_q4_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
void ggml_gemv_q8_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
void ggml_gemv_q4_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
void ggml_gemv_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
void ggml_gemv_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
void ggml_gemv_q8_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
// GEMM
void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
void ggml_gemm_q4_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
void ggml_gemm_q8_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
void ggml_gemm_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
void ggml_gemm_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
#ifdef __cplusplus
}
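The make_block_q4_0x4/make_block_q4_0x8 helpers now take an explicit xor_mask in addition to block_len, so the stored nibbles can be re-biased once at pack time (presumably flipping the unsigned +8 offset so the SDOT/SMMLA kernels can treat the quants as signed) instead of in every inner loop. A hedged sketch of what the x4 variant does under the declared signature, assuming block_q4_0x4 stores 4 fp16 deltas followed by the 64 interleaved quant bytes (details may differ from the real implementation):

// Sketch: interleave the quants of 4 q4_0 blocks (one per row) in chunks of
// block_len bytes and apply xor_mask while packing, so the GEMM/GEMV kernels
// can load data for 4 rows contiguously.
block_q4_0x4 make_block_q4_0x4_sketch(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask) {
    block_q4_0x4 out;

    for (int i = 0; i < 4; i++) {
        out.d[i] = in[i]->d;                            // one fp16 delta per source block
    }

    const int bytes_per_block = QK4_0 / 2;              // packed nibbles per q4_0 block
    for (int i = 0; i < 4 * bytes_per_block; i++) {
        int chunk      = i / block_len;
        int src_id     = chunk % 4;                     // which of the 4 rows this chunk comes from
        int src_offset = (chunk / 4) * block_len + (i % block_len);
        out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask;
    }
    return out;
}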

View file

@@ -486,192 +486,6 @@ int64_t ggml_cycles_per_ms(void) {
#define ggml_perf_cycles_per_ms() 0
#endif
void rearrange_q4_0_weights_blocked8_neon(struct ggml_tensor * cur) {
block_q4_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data;
block_q4_0x8 * out_ptr_B_start = out_ptr_B;
int64_t nb = cur->ne[0] / QK4_0;
for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) {
const block_q4_0 * in_ptrs[8];
in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 8 * nb);
for (int i = 0; i < 7; i++) {
in_ptrs[i + 1] = in_ptrs[i] + nb;
}
for (int64_t x = 0; x < nb; x++) {
*out_ptr_B = make_block_q4_0x8(in_ptrs, 4); // block_len=4 for SDOT
out_ptr_B++;
for (int i = 0; i < 8; i++) {
in_ptrs[i]++;
}
}
}
cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start;
}
void rearrange_q4_0_weights_blocked8_sve(struct ggml_tensor * cur) {
#if defined(__ARM_FEATURE_SVE)
if (svcntw() != 8) {
printf("ggml_gemv_q4_0_q8_0_blocked8_sve: SVE VL != 256 - aborting. Use Arm Neon GEMV kernels\n");
exit(1);
}
block_q4_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data;
block_q4_0x8 * out_ptr_B_start = out_ptr_B;
int64_t nb = cur->ne[0] / QK4_0;
for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) {
const block_q4_0 * in_ptrs[8];
in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 8 * nb);
for (int i = 0; i < 7; i++) {
in_ptrs[i + 1] = in_ptrs[i] + nb;
}
for (int64_t x = 0; x < nb; x++) {
*out_ptr_B = make_block_q4_0x8(in_ptrs, 4); // block_len=4 for SDOT
out_ptr_B++;
for (int i = 0; i < 8; i++) {
in_ptrs[i]++;
}
}
}
cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start;
#endif
}
#if defined(__ARM_FEATURE_SVE)
static void (*_rearrange_q4_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q4_0_weights_blocked8_sve;
#elif defined(__ARM_NEON)
static void (*_rearrange_q4_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q4_0_weights_blocked8_neon;
#endif
#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
void rearrange_q4_0_weights_for_gemv(struct ggml_tensor * cur) { _rearrange_q4_0_weights_for_gemv(cur); }
#endif
void rearrange_q4_0_weights_for_gemm(struct ggml_tensor * cur) {
block_q4_0x4 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data;
block_q4_0x4 * out_ptr_B_start = out_ptr_B;
int64_t nb = cur->ne[0] / QK4_0;
for (int y_out = 0; y_out < cur->ne[1] / 4; y_out++) {
const block_q4_0 * in_ptrs[4];
in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 4 * nb);
for (int i = 0; i < 3; i++) {
in_ptrs[i + 1] = in_ptrs[i] + nb;
}
for (int64_t x = 0; x < nb; x++) {
*out_ptr_B =
make_block_q4_0x4(in_ptrs, 8); // block_len=8 for SMMLA
out_ptr_B++;
for (int i = 0; i < 4; i++) {
in_ptrs[i]++;
}
}
}
cur->rearranged_weight_gemm = (uint8_t *) out_ptr_B_start;
}
void rearrange_q8_0_weights_blocked8_neon(struct ggml_tensor * cur) {
block_q8_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data;
block_q8_0x8 * out_ptr_B_start = out_ptr_B;
int64_t nb = cur->ne[0] / QK8_0;
for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) {
const block_q8_0 * in_ptrs[8];
in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 8 * nb);
for (int i = 0; i < 7; i++) {
in_ptrs[i + 1] = in_ptrs[i] + nb;
}
for (int64_t x = 0; x < nb; x++) {
*out_ptr_B = make_block_q8_0x8(in_ptrs, 4); // block_len=4 for SDOT
out_ptr_B++;
for (int i = 0; i < 8; i++) {
in_ptrs[i]++;
}
}
}
cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start;
}
void rearrange_q8_0_weights_blocked8_sve(struct ggml_tensor * cur) {
#if defined(__ARM_FEATURE_SVE)
if (svcntw() != 8) {
printf("ggml_gemv_q8_0_q8_0_blocked8_sve: SVE VL != 256 - aborting. Use Arm Neon GEMV kernels\n");
exit(1);
}
block_q8_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data;
block_q8_0x8 * out_ptr_B_start = out_ptr_B;
int64_t nb = cur->ne[0] / QK8_0;
for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) {
const block_q8_0 * in_ptrs[8];
in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 8 * nb);
for (int i = 0; i < 7; i++) {
in_ptrs[i + 1] = in_ptrs[i] + nb;
}
for (int64_t x = 0; x < nb; x++) {
*out_ptr_B = make_block_q8_0x8(in_ptrs, 4); // block_len=4 for SDOT
out_ptr_B++;
for (int i = 0; i < 8; i++) {
in_ptrs[i]++;
}
}
}
cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start;
#endif
}
#if defined(__ARM_FEATURE_SVE)
static void (*_rearrange_q8_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q8_0_weights_blocked8_sve;
#elif defined(__ARM_NEON)
static void (*_rearrange_q8_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q8_0_weights_blocked8_neon;
#endif
#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
void rearrange_q8_0_weights_for_gemv(struct ggml_tensor * cur) { _rearrange_q8_0_weights_for_gemv(cur); }
#endif
void rearrange_q8_0_weights_for_gemm(struct ggml_tensor * cur) {
block_q8_0x4 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data;
block_q8_0x4 * out_ptr_B_start = out_ptr_B;
int64_t nb = cur->ne[0] / QK8_0;
for (int y_out = 0; y_out < cur->ne[1] / 4; y_out++) {
const block_q8_0 * in_ptrs[4];
in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 4 * nb);
for (int i = 0; i < 3; i++) {
in_ptrs[i + 1] = in_ptrs[i] + nb;
}
for (int64_t x = 0; x < nb; x++) {
*out_ptr_B =
make_block_q8_0x4(in_ptrs, 8); // block_len=8 for SMMLA
out_ptr_B++;
for (int i = 0; i < 4; i++) {
in_ptrs[i]++;
}
}
}
cur->rearranged_weight_gemm = (uint8_t *) out_ptr_B_start;
}
//
// cross-platform UTF-8 file paths
//
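All of the load-time rearrange_* helpers above are deleted. With this change the interleaved layout is produced once, when the model is quantized to GGML_TYPE_Q4_0_AARCH64 (via quantize_q4_0_aarch64 declared in the quantization header), instead of being rebuilt from Q4_0 data and stashed in extra tensor fields on every load. A hedged sketch of that idea, ignoring the imatrix argument and the ISA-dependent choice between 4- and 8-row blocks that the real function has to make (assumes the block and quantization declarations from the header shown earlier are in scope):

#include <stdlib.h>

// Sketch only: quantize to plain q4_0 first, then repack each group of 4 rows
// into the interleaved block_q4_0x4 layout. The block_len/xor_mask values here
// are illustrative; the real quantize_q4_0_aarch64 picks them per target ISA.
static size_t quantize_q4_0_aarch64_sketch(const float * src, void * dst, int64_t nrows, int64_t n_per_row) {
    const int64_t nb = n_per_row / QK4_0;                 // q4_0 blocks per row
    block_q4_0 * tmp = malloc(nrows * nb * sizeof(block_q4_0));

    for (int64_t row = 0; row < nrows; row++) {
        quantize_row_q4_0(src + row * n_per_row, tmp + row * nb, n_per_row);
    }

    block_q4_0x4 * out = (block_q4_0x4 *) dst;
    for (int64_t row = 0; row < nrows; row += 4) {        // requires nrows % 4 == 0
        for (int64_t b = 0; b < nb; b++) {
            const block_q4_0 * in4[4] = {
                tmp + (row + 0) * nb + b,
                tmp + (row + 1) * nb + b,
                tmp + (row + 2) * nb + b,
                tmp + (row + 3) * nb + b,
            };
            *out++ = make_block_q4_0x4(in4, 4, 0x88);
        }
    }
    free(tmp);
    return nrows * nb * sizeof(block_q4_0);               // same per-row size as plain q4_0
}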
@@ -891,6 +705,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
#else
.nrows = 1,
#endif
.from_float_to_mat = quantize_row_q8_0_aarch64,
},
[GGML_TYPE_Q8_1] = {
.type_name = "q8_1",
@@ -1088,6 +903,32 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
.vec_dot_type = GGML_TYPE_BF16,
.nrows = 1,
},
[GGML_TYPE_Q4_0_AARCH64] = {
.type_name = "q4_0_aarch64",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_0,
.from_float = quantize_row_q4_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
.vec_dot = ggml_vec_dot_q4_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)
.nrows = 2,
#else
.nrows = 1,
#endif
#if defined(__ARM_FEATURE_SVE)
.gemv = ggml_gemv_q4_0_q8_0_aarch64_sve256,
.gemm = ggml_gemm_q4_0_q8_0_aarch64_sve256,
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
.gemv = ggml_gemv_q4_0_q8_0_aarch64_neon,
.gemm = ggml_gemm_q4_0_q8_0_aarch64_neon,
#elif defined(__ARM_NEON)
.gemv = ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm,
.gemm = ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm,
#endif
}
};
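The gemv/gemm pair for this type is fixed at compile time in the order: 256-bit SVE, then NEON with the i8mm extension (SMMLA), then plain NEON (dot-product only). For reference, the same priority expressed with the existing ggml_cpu_has_* feature queries; this is purely illustrative, the patch itself keeps the selection in the #if/#elif chain above:

#include "ggml.h"

static const char * q4_0_aarch64_kernel_choice(void) {
    if (ggml_cpu_has_sve()) {
        return "ggml_gemv/gemm_q4_0_q8_0_aarch64_sve256";       // 256-bit SVE build
    }
    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
        return "ggml_gemv/gemm_q4_0_q8_0_aarch64_neon";         // NEON + i8mm (SMMLA)
    }
    if (ggml_cpu_has_neon()) {
        return "ggml_gemv/gemm_q4_0_q8_0_aarch64_neon_noi8mm";  // NEON SDOT only
    }
    return "none (generic vec_dot fallback)";
}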
@@ -2804,10 +2645,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
*s = idx;
}
static void ggml_gemv_q4_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
static void ggml_gemv_q8_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
//
// data types
//
@@ -3391,6 +3228,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
case GGML_FTYPE_MOSTLY_Q4_0_AARCH64: wtype = GGML_TYPE_Q4_0_AARCH64; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
}
@@ -3850,9 +3688,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
/*.name =*/ { 0 },
/*.extra =*/ NULL,
///*.padding =*/ { 0 },
/*.rearranged_weight_gemv =*/ NULL,
/*.rearranged_weight_gemm =*/ NULL,
/*.weight_rearranged =*/ false,
};
#ifdef __clang__
@@ -9638,6 +9473,7 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_AARCH64:
{
ggml_compute_forward_add_q_f32(params, dst);
} break;
@@ -10013,6 +9849,7 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_AARCH64:
{
ggml_compute_forward_add1_q_f32(params, dst);
} break;
@@ -10138,6 +9975,7 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_AARCH64:
default:
{
GGML_ASSERT(false);
@@ -12340,6 +12178,9 @@ static void ggml_compute_forward_mul_mat(
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
int64_t const vec_dot_num_rows = type_traits[type].nrows;
ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat;
ggml_gemv_t const gemv = type_traits[type].gemv;
ggml_gemm_t const gemm = type_traits[type].gemm;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
@@ -12405,10 +12246,9 @@ UseGgmlGemm1:;
}
}
}
#if defined(__ARM_FEATURE_MATMUL_INT8)
if ((src0->weight_rearranged == true) && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) {
if ((type == GGML_TYPE_Q4_0_AARCH64) && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) {
for (int64_t i11 = 0; i11 < ne11 / 4; ++i11) {
quantize_row_q8_0_and_make_block_q8_0x4((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10, 4);
from_float_to_mat((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10, 4, ggml_cpu_has_matmul_int8() ? 8 : 4);
wdata += row_size * 4;
}
for (int64_t i11 = (ne11 / 4) * 4; i11 < ne11; ++i11) {
@@ -12416,10 +12256,7 @@ UseGgmlGemm1:;
wdata += row_size;
}
}
#endif
#if defined(__ARM_FEATURE_MATMUL_INT8)
else {
#endif
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
@@ -12428,9 +12265,7 @@ UseGgmlGemm1:;
}
}
}
#if defined(__ARM_FEATURE_MATMUL_INT8)
}
#endif
if (ith == 0) {
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
@@ -12509,114 +12344,50 @@ UseGgmlGemm2:;
//if (ith == 0)
// printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
#if defined(__ARM_FEATURE_MATMUL_INT8) && (defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE))
if ((ggml_n_dims(src0) == 2) && (ne11 == 1) && (src0->weight_rearranged == true)) {
if (src0->type == GGML_TYPE_Q4_0) {
ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->rearranged_weight_gemv, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels
} else if (src0->type == GGML_TYPE_Q8_0) {
ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->rearranged_weight_gemv, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels
}
if ((ggml_n_dims(src0) == 2) && (ne11 == 1) && (type == GGML_TYPE_Q4_0_AARCH64)) {
gemv(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->data, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels
}
else if ((ggml_n_dims(src0) == 2) && (ne11 >= 16) && (src0->weight_rearranged == true)) {
else if ((ggml_n_dims(src0) == 2) && (ne11 >= 16) && (type == GGML_TYPE_Q4_0_AARCH64)) {
// use batch-sized 16, 8, and 4 GEMM kernels
if (src0->type == GGML_TYPE_Q4_0) {
for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) {
ggml_gemm_q4_0_q8_0(ne00, 16, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth);
}
int rows_processed = (ne11 / 16) * 16;
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) {
ggml_gemm_q4_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->rearranged_weight_gemm,
(const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth);
}
rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8;
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) {
ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm,
(const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth);
}
rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4;
for (int row_iter = rows_processed; row_iter < ne11; row_iter++) {
ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
}
} else if (src0->type == GGML_TYPE_Q8_0) {
for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) {
ggml_gemm_q8_0_q8_0(ne00, 16, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth);
}
int rows_processed = (ne11 / 16) * 16;
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) {
ggml_gemm_q8_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->rearranged_weight_gemm,
(const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth);
}
rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8;
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) {
ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm,
(const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth);
}
rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4;
for (int row_iter = rows_processed; row_iter < ne11; row_iter++) {
ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
}
for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) {
gemm(ne00, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth);
}
} else if ((ggml_n_dims(src0) == 2) && (ne11 >= 8) && (src0->weight_rearranged == true)) {
int rows_processed = (ne11 / 16) * 16;
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) {
gemm(ne00, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth);
}
rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8;
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) {
gemm(ne00, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth);
}
rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4;
for (int row_iter = rows_processed; row_iter < ne11; row_iter++) {
gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
}
}
else if ((ggml_n_dims(src0) == 2) && (ne11 >= 8) && (type == GGML_TYPE_Q4_0_AARCH64)) {
// use batch-sized 8, and 4 GEMM kernels
if (src0->type == GGML_TYPE_Q4_0) {
for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) {
ggml_gemm_q4_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth);
}
int rows_processed = (ne11 / 8) * 8;
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) {
ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm,
(const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth);
}
for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) {
ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
}
} else if (src0->type == GGML_TYPE_Q8_0) {
for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) {
ggml_gemm_q8_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth);
}
int rows_processed = (ne11 / 8) * 8;
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) {
ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm,
(const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth);
}
for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) {
ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
}
for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) {
gemm(ne00, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth);
}
} else if ((ggml_n_dims(src0) == 2) && (ne11 >= 4) && (src0->weight_rearranged == true)) {
int rows_processed = (ne11 / 8) * 8;
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) {
gemm(ne00, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth);
}
for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) {
gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
}
}
else if ((ggml_n_dims(src0) == 2) && (ne11 >= 4) && (type == GGML_TYPE_Q4_0_AARCH64)) {
// use batch-sized 4 GEMM kernel
if (src0->type == GGML_TYPE_Q4_0) {
for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) {
ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth);
}
for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) {
ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
}
} else if (src0->type == GGML_TYPE_Q8_0) {
for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) {
ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth);
}
for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) {
ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
}
for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) {
gemm(ne00, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth);
}
for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) {
gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
}
}
#elif defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
if ((ggml_n_dims(src0) == 2) && (src0->weight_rearranged == true)) {
if (src0->type == GGML_TYPE_Q4_0) {
for (int row_iter = 0; row_iter < ne11; row_iter++) {
ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
}
} else if (src0->type == GGML_TYPE_Q8_0) {
for (int row_iter = 0; row_iter < ne11; row_iter++) {
ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
}
}
}
#endif
#if defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
else {
#endif
// The first chunk comes from our thread_id, the rest will get auto-assigned.
int current_chunk = ith;
@@ -12638,9 +12409,7 @@ UseGgmlGemm2:;
current_chunk = atomic_fetch_add(&params->shared->current_chunk, 1);
}
#if defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
}
#endif
}
// ggml_compute_forward_mul_mat_id
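Taken together, the rewritten dispatch above does the following for 2-D GGML_TYPE_Q4_0_AARCH64 weights: the src1 activations are quantized and interleaved four rows at a time through from_float_to_mat (with an 8- or 4-byte block length depending on i8mm support), the batch is then walked in tiles of 16, 8 and 4 rows through the gemm kernel, and any leftover rows go through the gemv kernel one at a time. A condensed sketch of that tiling loop; it drops the src1_cont/vec_dot_type offset selection and the ne11 threshold checks of the real code and simply assumes contiguous packed activations:

#include <stddef.h>
#include <stdint.h>

// Condensed sketch of the row-tiling in ggml_compute_forward_mul_mat for
// GGML_TYPE_Q4_0_AARCH64 (illustrative; names and offset handling simplified).
typedef void (*q4_0_aarch64_kernel_t)(size_t depth, size_t output_channels, size_t height,
                                      float * s, const void * vx, const void * vy, int ith, int nth);

static void mul_mat_q4_0_aarch64_tiles(q4_0_aarch64_kernel_t gemm, q4_0_aarch64_kernel_t gemv,
                                       int64_t ne00, int64_t ne01, int64_t ne11,
                                       float * dst, size_t nb1,
                                       const void * packed_weights,
                                       const char * packed_activations, size_t row_size,
                                       int ith, int nth) {
    int64_t row = 0;
    // largest tiles first: 16-row, then 8-row, then 4-row GEMM calls
    for (int64_t tile = 16; tile >= 4; tile /= 2) {
        for (; row + tile <= ne11; row += tile) {
            gemm(ne00, ne01, tile, (float *)((char *) dst + row * nb1),
                 packed_weights, packed_activations + row * row_size, ith, nth);
        }
    }
    // remainder rows are handled by the GEMV kernel one at a time
    for (; row < ne11; row++) {
        gemv(ne00, ne01, 1, (float *)((char *) dst + row * nb1),
             packed_weights, packed_activations + row * row_size, ith, nth);
    }
}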
@@ -13051,6 +12820,7 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_AARCH64:
{
ggml_compute_forward_out_prod_q_f32(params, dst);
} break;
@@ -13236,6 +13006,7 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_AARCH64:
default:
{
GGML_ASSERT(false);
@@ -13495,6 +13266,7 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_AARCH64:
{
ggml_compute_forward_get_rows_q(params, dst);
} break;
@@ -14081,6 +13853,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K:
case GGML_TYPE_Q4_0_AARCH64:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
@@ -20804,6 +20577,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_AARCH64: result = quantize_q4_0_aarch64(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_F16:
{
size_t elemsize = sizeof(ggml_fp16_t);
@@ -22238,26 +22012,12 @@ int ggml_cpu_has_matmul_int8(void) {
#endif
}
int ggml_cpu_has_sve(void) {
#if defined(__ARM_FEATURE_SVE)
static void (*_ggml_gemv_q4_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q4_0_q8_0_blocked8_sve;
#elif defined(__ARM_NEON)
static void (*_ggml_gemv_q4_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q4_0_q8_0_blocked8_neon;
return 1;
#else
return 0;
#endif
#if defined(__ARM_FEATURE_SVE)
static void (*_ggml_gemv_q8_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q8_0_q8_0_blocked8_sve;
#elif defined(__ARM_NEON)
static void (*_ggml_gemv_q8_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q8_0_q8_0_blocked8_neon;
#endif
#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
static void ggml_gemv_q4_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) {
_ggml_gemv_q4_0_q8_0(n, output_channels, input_width, s, vx, vy, ith, nth);
}
static void ggml_gemv_q8_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) {
_ggml_gemv_q8_0_q8_0(n, output_channels, input_width, s, vx, vy, ith, nth);
}
#endif
////////////////////////////////////////////////////////////////////////////////

View file

@@ -162,6 +162,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64 = 33, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
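With the new ftype value in place, a model can be converted offline through the existing C quantization API, which this patch does not change. A hedged usage sketch; the file names are placeholders:

#include "llama.h"

int main(void) {
    // Quantize an f16 GGUF into the AArch64-optimized Q4_0 layout.
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64;

    // Returns 0 on success; input/output paths are placeholders.
    return (int) llama_model_quantize("model-f16.gguf", "model-q4_0-aarch64.gguf", &params);
}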

View file

@@ -3783,6 +3783,7 @@ struct llama_model_loader {
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
case GGML_TYPE_Q4_0_AARCH64: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64; break;
default:
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -4359,32 +4360,6 @@ struct llama_model_loader {
}
}
#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
if ((cur->type == GGML_TYPE_Q4_0) && (cur->ne[1] % 4 == 0)) {
cur->weight_rearranged = true;
#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
rearrange_q4_0_weights_for_gemv(cur); // rearrange weights for Arm Neon/SVE GEMV kernels
#endif
#if defined(__ARM_FEATURE_MATMUL_INT8)
rearrange_q4_0_weights_for_gemm(cur); // rearrange weights for GEMM MMLA kernels
#endif
}
else if ((cur->type == GGML_TYPE_Q8_0) && (cur->ne[1] % 4 == 0)) {
cur->weight_rearranged = true;
#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
rearrange_q8_0_weights_for_gemv(cur); // rearrange weights for Arm Neon/SVE GEMV kernels
#endif
#if defined(__ARM_FEATURE_MATMUL_INT8)
rearrange_q8_0_weights_for_gemm(cur); // rearrange weights for GEMM MMLA kernels
#endif
}
else {
cur->weight_rearranged = false;
}
#else
cur->weight_rearranged = false;
#endif
size_done += n_size;
}
@@ -4502,6 +4477,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
case LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64: return "Q4_0_AARCH64";
default: return "unknown, may not work";
}
@@ -17787,6 +17763,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S;
}
else if (new_type == GGML_TYPE_Q4_0_AARCH64) {
new_type = GGML_TYPE_Q4_0;
}
}
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -18099,6 +18078,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64: default_type = GGML_TYPE_Q4_0_AARCH64; break;
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
@@ -18409,6 +18389,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
f32_data = (float *) f32_conv_buf.data();
}
if (new_type == GGML_TYPE_Q4_0_AARCH64) {
if ((ggml_cpu_has_neon() == 0) && (ggml_cpu_has_sve() == 0)) new_type = GGML_TYPE_Q4_0;
if ((nelements / tensor->ne[0]) % 4 != 0) new_type = GGML_TYPE_Q4_0;
if (nthread > 1) nthread = 1;
}
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout);
@@ -21702,6 +21688,7 @@ const char * llama_print_system_info(void) {
#else
s += "LLAMAFILE = 0 | ";
#endif
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
return s.c_str();
}