Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions
This commit is contained in:
parent 002e36eaec
commit 340ef07fca

7 changed files with 1941 additions and 740 deletions
@@ -46,6 +46,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
     { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
     { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
+    { "Q4_0_AARCH64", LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
     { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
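With this entry in QUANT_OPTIONS, the new type can be requested by name from the quantize example. A usage sketch (binary name and file paths are placeholders, not part of the patch); note that later in this patch the type is demoted to plain Q4_0 at quantization time if neither NEON nor SVE is available:

    ./quantize ggml-model-f16.gguf ggml-model-q4_0_aarch64.gguf Q4_0_AARCH64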
@@ -384,6 +384,7 @@ extern "C" {
         GGML_TYPE_F64 = 28,
         GGML_TYPE_IQ1_M = 29,
         GGML_TYPE_BF16 = 30,
+        GGML_TYPE_Q4_0_AARCH64 = 31,
         GGML_TYPE_COUNT,
     };

@@ -425,6 +426,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0_AARCH64 = 25, // except 1d tensors
     };

     // available tensor operations:
@@ -603,11 +605,6 @@ extern "C" {
         void * extra; // extra things e.g. for ggml-cuda.cu

         // char padding[4];
-        char padding[9];
-
-        void * rearranged_weight_gemv;
-        void * rearranged_weight_gemm;
-        bool weight_rearranged;
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -2397,6 +2394,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_rpc        (void);
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);
+    GGML_API int ggml_cpu_has_sve        (void);

     //
     // Internal types and functions exposed for tests and benchmarks
@@ -2412,6 +2410,9 @@ extern "C" {
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                     const void * GGML_RESTRICT y, size_t by, int nrc);
+    typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k, int n, int b);
+    typedef void (*ggml_gemv_t) (size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
+    typedef void (*ggml_gemm_t) (size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);

     typedef struct {
         const char * type_name;
@@ -2424,19 +2425,13 @@ extern "C" {
         ggml_vec_dot_t vec_dot;
         enum ggml_type vec_dot_type;
         int64_t nrows; // number of rows to process simultaneously;
+        ggml_from_float_to_mat_t from_float_to_mat;
+        ggml_gemv_t gemv;
+        ggml_gemm_t gemm;
     } ggml_type_traits_t;

     GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

-    GGML_API void rearrange_q4_0_weights_blocked8_neon(struct ggml_tensor * cur);
-    GGML_API void rearrange_q4_0_weights_blocked8_sve(struct ggml_tensor * cur);
-    GGML_API void rearrange_q4_0_weights_for_gemv(struct ggml_tensor * cur);
-    GGML_API void rearrange_q4_0_weights_for_gemm(struct ggml_tensor * cur);
-    GGML_API void rearrange_q8_0_weights_blocked8_neon(struct ggml_tensor * cur);
-    GGML_API void rearrange_q8_0_weights_blocked8_sve(struct ggml_tensor * cur);
-    GGML_API void rearrange_q8_0_weights_for_gemv(struct ggml_tensor * cur);
-    GGML_API void rearrange_q8_0_weights_for_gemm(struct ggml_tensor * cur);
-
 #ifdef __cplusplus
 }
 #endif
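With the three new members of ggml_type_traits_t, callers can discover and invoke the optimized kernels through the existing traits lookup instead of hard-coded function names. A minimal sketch, assuming only the declarations shown in the hunk above (the wrapper name and argument choices are illustrative, not part of the patch):

    #include "ggml.h"
    #include <stddef.h>

    // Run the type's GEMV hook, if one is registered, for a single output row.
    static void run_gemv_if_available(enum ggml_type type,
                                      size_t depth, size_t output_channels,
                                      float * dst, const void * weights, const void * activations,
                                      int ith, int nth) {
        ggml_type_traits_t tt = ggml_internal_get_type_traits(type);
        if (tt.gemv != NULL) {
            // height == 1: one row of activations, i.e. a matrix-vector product
            tt.gemv(depth, output_channels, 1, dst, weights, activations, ith, nth);
        }
    }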
File diff suppressed because it is too large
@@ -70,24 +70,6 @@ typedef struct {
 } block_q4_0x8;
 static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_fp16_t) + QK4_0 * 4, "wrong q4_0x8 block size/padding");

-typedef struct {
-    ggml_fp16_t d[16];      // deltas for 16 q4_0 blocks
-    uint8_t qs[QK4_0 * 8];  // nibbles / quants for 16 q4_0 blocks
-} block_q4_0x16;
-static_assert(sizeof(block_q4_0x16) == 16 * sizeof(ggml_fp16_t) + QK4_0 * 8, "wrong q4_0x16 block size/padding");
-
-typedef struct {
-    ggml_fp16_t d[64];      // deltas for 64 q4_0 blocks
-    uint8_t qs[QK4_0 * 32]; // nibbles / quants for 64 q4_0 blocks
-} block_q4_0x64;
-static_assert(sizeof(block_q4_0x64) == 64 * sizeof(ggml_fp16_t) + QK4_0 * 32, "wrong q4_0x64 block size/padding");
-
-typedef struct {
-    ggml_fp16_t d[2];       // deltas for 2 q8_0 blocks
-    int8_t qs[QK8_0 * 2];   // quants for 2 q8_0 blocks
-} block_q8_0x2;
-static_assert(sizeof(block_q8_0x2) == 2 * sizeof(ggml_fp16_t) + QK8_0 * 2, "wrong q8_0x2 block size/padding");
-
 typedef struct {
     ggml_fp16_t d[4];       // deltas for 4 q8_0 blocks
     int8_t qs[QK8_0 * 4];   // quants for 4 q8_0 blocks

@@ -366,30 +348,34 @@ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_0_aarch64(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

 void iq2xs_init_impl(enum ggml_type type);
 void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);

-block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len);
-block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len);
+block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask);
+block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask);
 block_q8_0x4 make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int block_len);
 block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len);
-void quantize_row_q8_0_and_make_block_q8_0x2(const float * restrict x, void * restrict vy, int k, int rows_interleaved);
-void quantize_row_q8_0_and_make_block_q8_0x4(const float * restrict x, void * restrict vy, int k, int rows_interleaved);
+void quantize_row_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k, int nrows_interleaved, int blocklen_per_row);

 // GEMV
-void ggml_gemv_q4_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
-void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
-void ggml_gemv_q8_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
-void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
+void ggml_gemv_q4_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
+void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
+void ggml_gemv_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
+void ggml_gemv_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
+void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
+void ggml_gemv_q8_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
+void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);

 // GEMM
-void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
-void ggml_gemm_q4_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
-void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
-void ggml_gemm_q8_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
+void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
+void ggml_gemm_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
+void ggml_gemm_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
+void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);
+void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth);

 #ifdef __cplusplus
 }
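The make_block_q4_0x4/x8 helpers now take an xor_mask in addition to the interleave block length. A hedged sketch of what the interleaving amounts to, assuming the block_q4_0 and block_q4_0x4 layouts declared above (illustration of the scheme, not the patch's implementation): quant bytes are taken round-robin from the four source blocks in chunks of block_len bytes, and each byte is XORed with xor_mask before being stored.

    // Illustrative only: interleave 4 q4_0 blocks for SDOT/SMMLA-friendly loads.
    block_q4_0x4 make_block_q4_0x4_sketch(const block_q4_0 * const in[4],
                                          unsigned int block_len, unsigned int xor_mask) {
        block_q4_0x4 out;
        for (int i = 0; i < 4; i++) {
            out.d[i] = in[i]->d;                        // per-block scales stay separate
        }
        for (unsigned int i = 0; i < QK4_0 * 2; i++) {  // 4 blocks * 16 quant bytes each
            unsigned int src_id     = (i / block_len) % 4;
            unsigned int src_offset = (i / (4 * block_len)) * block_len + (i % block_len);
            out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask;
        }
        return out;
    }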
400 ggml/src/ggml.c
@@ -486,192 +486,6 @@ int64_t ggml_cycles_per_ms(void) {
 #define ggml_perf_cycles_per_ms() 0
 #endif

-void rearrange_q4_0_weights_blocked8_neon(struct ggml_tensor * cur) {
-    block_q4_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data;
-    block_q4_0x8 * out_ptr_B_start = out_ptr_B;
-    int64_t nb = cur->ne[0] / QK4_0;
-
-    for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) {
-        const block_q4_0 * in_ptrs[8];
-
-        in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 8 * nb);
-        for (int i = 0; i < 7; i++) {
-            in_ptrs[i + 1] = in_ptrs[i] + nb;
-        }
-
-        for (int64_t x = 0; x < nb; x++) {
-            *out_ptr_B = make_block_q4_0x8(in_ptrs, 4); // block_len=4 for SDOT
-            out_ptr_B++;
-
-            for (int i = 0; i < 8; i++) {
-                in_ptrs[i]++;
-            }
-        }
-    }
-    cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start;
-}
-
-void rearrange_q4_0_weights_blocked8_sve(struct ggml_tensor * cur) {
-#if defined(__ARM_FEATURE_SVE)
-    if (svcntw() != 8) {
-        printf("ggml_gemv_q4_0_q8_0_blocked8_sve: SVE VL != 256 - aborting. Use Arm Neon GEMV kernels\n");
-        exit(1);
-    }
-
-    block_q4_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data;
-    block_q4_0x8 * out_ptr_B_start = out_ptr_B;
-    int64_t nb = cur->ne[0] / QK4_0;
-
-    for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) {
-        const block_q4_0 * in_ptrs[8];
-
-        in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 8 * nb);
-        for (int i = 0; i < 7; i++) {
-            in_ptrs[i + 1] = in_ptrs[i] + nb;
-        }
-
-        for (int64_t x = 0; x < nb; x++) {
-            *out_ptr_B = make_block_q4_0x8(in_ptrs, 4); // block_len=4 for SDOT
-            out_ptr_B++;
-
-            for (int i = 0; i < 8; i++) {
-                in_ptrs[i]++;
-            }
-        }
-    }
-    cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start;
-#endif
-}
-
-#if defined(__ARM_FEATURE_SVE)
-static void (*_rearrange_q4_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q4_0_weights_blocked8_sve;
-#elif defined(__ARM_NEON)
-static void (*_rearrange_q4_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q4_0_weights_blocked8_neon;
-#endif
-
-#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
-void rearrange_q4_0_weights_for_gemv(struct ggml_tensor * cur) { _rearrange_q4_0_weights_for_gemv(cur); }
-#endif
-
-void rearrange_q4_0_weights_for_gemm(struct ggml_tensor * cur) {
-    block_q4_0x4 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data;
-    block_q4_0x4 * out_ptr_B_start = out_ptr_B;
-    int64_t nb = cur->ne[0] / QK4_0;
-
-    for (int y_out = 0; y_out < cur->ne[1] / 4; y_out++) {
-        const block_q4_0 * in_ptrs[4];
-
-        in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 4 * nb);
-        for (int i = 0; i < 3; i++) {
-            in_ptrs[i + 1] = in_ptrs[i] + nb;
-        }
-
-        for (int64_t x = 0; x < nb; x++) {
-            *out_ptr_B =
-                make_block_q4_0x4(in_ptrs, 8); // block_len=8 for SMMLA
-            out_ptr_B++;
-
-            for (int i = 0; i < 4; i++) {
-                in_ptrs[i]++;
-            }
-        }
-    }
-    cur->rearranged_weight_gemm = (uint8_t *) out_ptr_B_start;
-}
-
-void rearrange_q8_0_weights_blocked8_neon(struct ggml_tensor * cur) {
-    block_q8_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data;
-    block_q8_0x8 * out_ptr_B_start = out_ptr_B;
-    int64_t nb = cur->ne[0] / QK8_0;
-
-    for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) {
-        const block_q8_0 * in_ptrs[8];
-
-        in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 8 * nb);
-        for (int i = 0; i < 7; i++) {
-            in_ptrs[i + 1] = in_ptrs[i] + nb;
-        }
-
-        for (int64_t x = 0; x < nb; x++) {
-            *out_ptr_B = make_block_q8_0x8(in_ptrs, 4); // block_len=4 for SDOT
-            out_ptr_B++;
-
-            for (int i = 0; i < 8; i++) {
-                in_ptrs[i]++;
-            }
-        }
-    }
-    cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start;
-}
-
-void rearrange_q8_0_weights_blocked8_sve(struct ggml_tensor * cur) {
-#if defined(__ARM_FEATURE_SVE)
-    if (svcntw() != 8) {
-        printf("ggml_gemv_q8_0_q8_0_blocked8_sve: SVE VL != 256 - aborting. Use Arm Neon GEMV kernels\n");
-        exit(1);
-    }
-
-    block_q8_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data;
-    block_q8_0x8 * out_ptr_B_start = out_ptr_B;
-    int64_t nb = cur->ne[0] / QK8_0;
-
-    for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) {
-        const block_q8_0 * in_ptrs[8];
-
-        in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 8 * nb);
-        for (int i = 0; i < 7; i++) {
-            in_ptrs[i + 1] = in_ptrs[i] + nb;
-        }
-
-        for (int64_t x = 0; x < nb; x++) {
-            *out_ptr_B = make_block_q8_0x8(in_ptrs, 4); // block_len=4 for SDOT
-            out_ptr_B++;
-
-            for (int i = 0; i < 8; i++) {
-                in_ptrs[i]++;
-            }
-        }
-    }
-    cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start;
-#endif
-}
-
-#if defined(__ARM_FEATURE_SVE)
-static void (*_rearrange_q8_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q8_0_weights_blocked8_sve;
-#elif defined(__ARM_NEON)
-static void (*_rearrange_q8_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q8_0_weights_blocked8_neon;
-#endif
-
-#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
-void rearrange_q8_0_weights_for_gemv(struct ggml_tensor * cur) { _rearrange_q8_0_weights_for_gemv(cur); }
-#endif
-
-void rearrange_q8_0_weights_for_gemm(struct ggml_tensor * cur) {
-    block_q8_0x4 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data;
-    block_q8_0x4 * out_ptr_B_start = out_ptr_B;
-    int64_t nb = cur->ne[0] / QK8_0;
-
-    for (int y_out = 0; y_out < cur->ne[1] / 4; y_out++) {
-        const block_q8_0 * in_ptrs[4];
-
-        in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 4 * nb);
-        for (int i = 0; i < 3; i++) {
-            in_ptrs[i + 1] = in_ptrs[i] + nb;
-        }
-
-        for (int64_t x = 0; x < nb; x++) {
-            *out_ptr_B =
-                make_block_q8_0x4(in_ptrs, 8); // block_len=8 for SMMLA
-            out_ptr_B++;
-
-            for (int i = 0; i < 4; i++) {
-                in_ptrs[i]++;
-            }
-        }
-    }
-    cur->rearranged_weight_gemm = (uint8_t *) out_ptr_B_start;
-}
-
 //
 // cross-platform UTF-8 file paths
 //
@@ -891,6 +705,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
 #else
         .nrows = 1,
 #endif
+        .from_float_to_mat = quantize_row_q8_0_aarch64,
     },
     [GGML_TYPE_Q8_1] = {
         .type_name = "q8_1",
@@ -1088,6 +903,32 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
+    },
+    [GGML_TYPE_Q4_0_AARCH64] = {
+        .type_name = "q4_0_aarch64",
+        .blck_size = QK4_0,
+        .type_size = sizeof(block_q4_0),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_q4_0,
+        .from_float = quantize_row_q4_0,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
+        .vec_dot = ggml_vec_dot_q4_0_q8_0,
+        .vec_dot_type = GGML_TYPE_Q8_0,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows = 2,
+#else
+        .nrows = 1,
+#endif
+#if defined(__ARM_FEATURE_SVE)
+        .gemv = ggml_gemv_q4_0_q8_0_aarch64_sve256,
+        .gemm = ggml_gemm_q4_0_q8_0_aarch64_sve256,
+#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+        .gemv = ggml_gemv_q4_0_q8_0_aarch64_neon,
+        .gemm = ggml_gemm_q4_0_q8_0_aarch64_neon,
+#elif defined(__ARM_NEON)
+        .gemv = ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm,
+        .gemm = ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm,
+#endif
     }
 };

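The kernel variant for GGML_TYPE_Q4_0_AARCH64 is fixed at compile time by the cascade above: SVE 256-bit kernels when __ARM_FEATURE_SVE is set, NEON SMMLA kernels when i8mm is available, otherwise plain NEON dot-product kernels. A small standalone sketch (not part of the patch) that reuses the same cascade to report which kernel family a given toolchain/flag combination would select:

    #include <stdio.h>

    int main(void) {
    #if defined(__ARM_FEATURE_SVE)
        printf("q4_0_aarch64: SVE 256-bit GEMV/GEMM kernels\n");
    #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
        printf("q4_0_aarch64: NEON + i8mm (SMMLA) GEMV/GEMM kernels\n");
    #elif defined(__ARM_NEON)
        printf("q4_0_aarch64: NEON (dot-product, no i8mm) GEMV/GEMM kernels\n");
    #else
        printf("q4_0_aarch64: no optimized kernels on this target\n");
    #endif
        return 0;
    }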
@@ -2804,10 +2645,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
     *s = idx;
 }

-static void ggml_gemv_q4_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
-
-static void ggml_gemv_q8_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth);
-
 //
 // data types
 //
@@ -3391,6 +3228,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
         case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
         case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
+        case GGML_FTYPE_MOSTLY_Q4_0_AARCH64: wtype = GGML_TYPE_Q4_0_AARCH64; break;
         case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -3850,9 +3688,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.name =*/ { 0 },
         /*.extra =*/ NULL,
         ///*.padding =*/ { 0 },
-        /*.rearranged_weight_gemv =*/ NULL,
-        /*.rearranged_weight_gemm =*/ NULL,
-        /*.weight_rearranged =*/ false,
     };

 #ifdef __clang__
@@ -9638,6 +9473,7 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_AARCH64:
             {
                 ggml_compute_forward_add_q_f32(params, dst);
             } break;
@@ -10013,6 +9849,7 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_AARCH64:
             {
                 ggml_compute_forward_add1_q_f32(params, dst);
             } break;
@@ -10138,6 +9975,7 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_AARCH64:
         default:
             {
                 GGML_ASSERT(false);
@@ -12340,6 +12178,9 @@ static void ggml_compute_forward_mul_mat(
     enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
     int64_t const vec_dot_num_rows = type_traits[type].nrows;
+    ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat;
+    ggml_gemv_t const gemv = type_traits[type].gemv;
+    ggml_gemm_t const gemm = type_traits[type].gemm;

     GGML_ASSERT(ne0 == ne01);
     GGML_ASSERT(ne1 == ne11);
@@ -12405,10 +12246,9 @@ UseGgmlGemm1:;
                 }
             }
         }
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-        if ((src0->weight_rearranged == true) && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) {
+        if ((type == GGML_TYPE_Q4_0_AARCH64) && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) {
             for (int64_t i11 = 0; i11 < ne11 / 4; ++i11) {
-                quantize_row_q8_0_and_make_block_q8_0x4((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10, 4);
+                from_float_to_mat((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10, 4, ggml_cpu_has_matmul_int8() ? 8 : 4);
                 wdata += row_size * 4;
             }
             for (int64_t i11 = (ne11 / 4) * 4; i11 < ne11; ++i11) {
@@ -12416,10 +12256,7 @@ UseGgmlGemm1:;
                 wdata += row_size;
             }
         }
-#endif
-#if defined(__ARM_FEATURE_MATMUL_INT8)
         else {
-#endif
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
                     for (int64_t i11 = 0; i11 < ne11; ++i11) {
@@ -12428,9 +12265,7 @@ UseGgmlGemm1:;
                     }
                 }
             }
-#if defined(__ARM_FEATURE_MATMUL_INT8)
         }
-#endif

     if (ith == 0) {
         // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
|
@ -12509,114 +12344,50 @@ UseGgmlGemm2:;
|
||||||
//if (ith == 0)
|
//if (ith == 0)
|
||||||
// printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
|
// printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
|
||||||
|
|
||||||
#if defined(__ARM_FEATURE_MATMUL_INT8) && (defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE))
|
if ((ggml_n_dims(src0) == 2) && (ne11 == 1) && (type == GGML_TYPE_Q4_0_AARCH64)) {
|
||||||
if ((ggml_n_dims(src0) == 2) && (ne11 == 1) && (src0->weight_rearranged == true)) {
|
gemv(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->data, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels
|
||||||
if (src0->type == GGML_TYPE_Q4_0) {
|
|
||||||
ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->rearranged_weight_gemv, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels
|
|
||||||
} else if (src0->type == GGML_TYPE_Q8_0) {
|
|
||||||
ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->rearranged_weight_gemv, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else if ((ggml_n_dims(src0) == 2) && (ne11 >= 16) && (src0->weight_rearranged == true)) {
|
else if ((ggml_n_dims(src0) == 2) && (ne11 >= 16) && (type == GGML_TYPE_Q4_0_AARCH64)) {
|
||||||
// use batch-sized 16, 8, and 4 GEMM kernels
|
// use batch-sized 16, 8, and 4 GEMM kernels
|
||||||
if (src0->type == GGML_TYPE_Q4_0) {
|
for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) {
|
||||||
for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) {
|
gemm(ne00, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth);
|
||||||
ggml_gemm_q4_0_q8_0(ne00, 16, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
int rows_processed = (ne11 / 16) * 16;
|
|
||||||
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) {
|
|
||||||
ggml_gemm_q4_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->rearranged_weight_gemm,
|
|
||||||
(const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8;
|
|
||||||
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) {
|
|
||||||
ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm,
|
|
||||||
(const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4;
|
|
||||||
for (int row_iter = rows_processed; row_iter < ne11; row_iter++) {
|
|
||||||
ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
} else if (src0->type == GGML_TYPE_Q8_0) {
|
|
||||||
for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) {
|
|
||||||
ggml_gemm_q8_0_q8_0(ne00, 16, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
int rows_processed = (ne11 / 16) * 16;
|
|
||||||
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) {
|
|
||||||
ggml_gemm_q8_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->rearranged_weight_gemm,
|
|
||||||
(const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8;
|
|
||||||
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) {
|
|
||||||
ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm,
|
|
||||||
(const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4;
|
|
||||||
for (int row_iter = rows_processed; row_iter < ne11; row_iter++) {
|
|
||||||
ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else if ((ggml_n_dims(src0) == 2) && (ne11 >= 8) && (src0->weight_rearranged == true)) {
|
int rows_processed = (ne11 / 16) * 16;
|
||||||
|
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) {
|
||||||
|
gemm(ne00, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth);
|
||||||
|
}
|
||||||
|
rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8;
|
||||||
|
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) {
|
||||||
|
gemm(ne00, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth);
|
||||||
|
}
|
||||||
|
rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4;
|
||||||
|
for (int row_iter = rows_processed; row_iter < ne11; row_iter++) {
|
||||||
|
gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if ((ggml_n_dims(src0) == 2) && (ne11 >= 8) && (type == GGML_TYPE_Q4_0_AARCH64)) {
|
||||||
// use batch-sized 8, and 4 GEMM kernels
|
// use batch-sized 8, and 4 GEMM kernels
|
||||||
if (src0->type == GGML_TYPE_Q4_0) {
|
for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) {
|
||||||
for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) {
|
gemm(ne00, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth);
|
||||||
ggml_gemm_q4_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
int rows_processed = (ne11 / 8) * 8;
|
|
||||||
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) {
|
|
||||||
ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm,
|
|
||||||
(const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) {
|
|
||||||
ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
} else if (src0->type == GGML_TYPE_Q8_0) {
|
|
||||||
for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) {
|
|
||||||
ggml_gemm_q8_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
int rows_processed = (ne11 / 8) * 8;
|
|
||||||
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) {
|
|
||||||
ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm,
|
|
||||||
(const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) {
|
|
||||||
ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else if ((ggml_n_dims(src0) == 2) && (ne11 >= 4) && (src0->weight_rearranged == true)) {
|
int rows_processed = (ne11 / 8) * 8;
|
||||||
|
for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) {
|
||||||
|
gemm(ne00, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth);
|
||||||
|
}
|
||||||
|
for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) {
|
||||||
|
gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if ((ggml_n_dims(src0) == 2) && (ne11 >= 4) && (type == GGML_TYPE_Q4_0_AARCH64)) {
|
||||||
// use batch-sized 4 GEMM kernel
|
// use batch-sized 4 GEMM kernel
|
||||||
if (src0->type == GGML_TYPE_Q4_0) {
|
for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) {
|
||||||
for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) {
|
gemm(ne00, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth);
|
||||||
ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth);
|
}
|
||||||
}
|
for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) {
|
||||||
for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) {
|
gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
|
||||||
ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
} else if (src0->type == GGML_TYPE_Q8_0) {
|
|
||||||
for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) {
|
|
||||||
ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) {
|
|
||||||
ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#elif defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
|
|
||||||
if ((ggml_n_dims(src0) == 2) && (src0->weight_rearranged == true)) {
|
|
||||||
if (src0->type == GGML_TYPE_Q4_0) {
|
|
||||||
for (int row_iter = 0; row_iter < ne11; row_iter++) {
|
|
||||||
ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
} else if (src0->type == GGML_TYPE_Q8_0) {
|
|
||||||
for (int row_iter = 0; row_iter < ne11; row_iter++) {
|
|
||||||
ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#if defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
|
|
||||||
else {
|
else {
|
||||||
#endif
|
|
||||||
// The first chunk comes from our thread_id, the rest will get auto-assigned.
|
// The first chunk comes from our thread_id, the rest will get auto-assigned.
|
||||||
int current_chunk = ith;
|
int current_chunk = ith;
|
||||||
|
|
||||||
|
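In summary, the dispatch above consumes the rows of src1 in the largest available tile and falls back to GEMV for the remainder; rows that fit none of these cases take the generic vec_dot path. A hedged outline of the row bookkeeping (illustration only, not code from the patch):

    // Rough row accounting for a Q4_0_AARCH64 mat-mul with ne11 src1 rows.
    static void q4_0_aarch64_dispatch_outline(int ne11) {
        int done = 0;
        if (ne11 >= 4) {                                    // batched GEMM path
            if (ne11 >= 16) { done += (ne11 / 16) * 16; }   // 16-row tiles first
            done += ((ne11 - done) / 8) * 8;                // then 8-row tiles
            done += ((ne11 - done) / 4) * 4;                // then 4-row tiles
            // remaining rows: one GEMV call per row
        } else if (ne11 == 1) {
            done = 1;                                       // single-row GEMV
        }
        (void) done; // ne11 == 2 or 3 (and other types) use the generic path
    }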
@@ -12638,9 +12409,7 @@ UseGgmlGemm2:;

         current_chunk = atomic_fetch_add(&params->shared->current_chunk, 1);
     }
-#if defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
     }
-#endif
 }

 // ggml_compute_forward_mul_mat_id
@@ -13051,6 +12820,7 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_AARCH64:
             {
                 ggml_compute_forward_out_prod_q_f32(params, dst);
             } break;
@@ -13236,6 +13006,7 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_AARCH64:
         default:
             {
                 GGML_ASSERT(false);
@@ -13495,6 +13266,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_AARCH64:
             {
                 ggml_compute_forward_get_rows_q(params, dst);
             } break;
@@ -14081,6 +13853,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
         case GGML_TYPE_Q8_K:
+        case GGML_TYPE_Q4_0_AARCH64:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -20804,6 +20577,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_0_AARCH64: result = quantize_q4_0_aarch64(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
@@ -22238,26 +22012,12 @@ int ggml_cpu_has_matmul_int8(void) {
 #endif
 }

+int ggml_cpu_has_sve(void) {
 #if defined(__ARM_FEATURE_SVE)
-static void (*_ggml_gemv_q4_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q4_0_q8_0_blocked8_sve;
+    return 1;
-#elif defined(__ARM_NEON)
+#else
-static void (*_ggml_gemv_q4_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q4_0_q8_0_blocked8_neon;
+    return 0;
 #endif
-
-#if defined(__ARM_FEATURE_SVE)
-static void (*_ggml_gemv_q8_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q8_0_q8_0_blocked8_sve;
-#elif defined(__ARM_NEON)
-static void (*_ggml_gemv_q8_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q8_0_q8_0_blocked8_neon;
-#endif
-
-#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
-static void ggml_gemv_q4_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) {
-    _ggml_gemv_q4_0_q8_0(n, output_channels, input_width, s, vx, vy, ith, nth);
 }
-
-static void ggml_gemv_q8_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) {
-    _ggml_gemv_q8_0_q8_0(n, output_channels, input_width, s, vx, vy, ith, nth);
-}
-#endif

 ////////////////////////////////////////////////////////////////////////////////
@@ -162,6 +162,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64 = 33, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -3783,6 +3783,7 @@ struct llama_model_loader {
                 case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
                 case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
                 case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
+                case GGML_TYPE_Q4_0_AARCH64: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64; break;
                 default:
                     {
                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -4359,32 +4360,6 @@ struct llama_model_loader {
                 }
             }

-#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
-            if ((cur->type == GGML_TYPE_Q4_0) && (cur->ne[1] % 4 == 0)) {
-                cur->weight_rearranged = true;
-#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
-                rearrange_q4_0_weights_for_gemv(cur); // rearrange weights for Arm Neon/SVE GEMV kernels
-#endif
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-                rearrange_q4_0_weights_for_gemm(cur); // rearrange weights for GEMM MMLA kernels
-#endif
-            }
-            else if ((cur->type == GGML_TYPE_Q8_0) && (cur->ne[1] % 4 == 0)) {
-                cur->weight_rearranged = true;
-#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)
-                rearrange_q8_0_weights_for_gemv(cur); // rearrange weights for Arm Neon/SVE GEMV kernels
-#endif
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-                rearrange_q8_0_weights_for_gemm(cur); // rearrange weights for GEMM MMLA kernels
-#endif
-            }
-            else {
-                cur->weight_rearranged = false;
-            }
-#else
-            cur->weight_rearranged = false;
-#endif
-
             size_done += n_size;
         }

@@ -4502,6 +4477,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+        case LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64: return "Q4_0_AARCH64";

         default: return "unknown, may not work";
     }
@@ -17787,6 +17763,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ3_S;
         }
+        else if (new_type == GGML_TYPE_Q4_0_AARCH64) {
+            new_type = GGML_TYPE_Q4_0;
+        }
     }
 } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -18099,6 +18078,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64: default_type = GGML_TYPE_Q4_0_AARCH64; break;

         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -18409,6 +18389,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 f32_data = (float *) f32_conv_buf.data();
             }

+            if (new_type == GGML_TYPE_Q4_0_AARCH64) {
+                if ((ggml_cpu_has_neon() == 0) && (ggml_cpu_has_sve() == 0)) new_type = GGML_TYPE_Q4_0;
+                if ((nelements / tensor->ne[0]) % 4 != 0) new_type = GGML_TYPE_Q4_0;
+                if (nthread > 1) nthread = 1;
+            }
+
             LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
             fflush(stdout);

|
@ -21702,6 +21688,7 @@ const char * llama_print_system_info(void) {
|
||||||
#else
|
#else
|
||||||
s += "LLAMAFILE = 0 | ";
|
s += "LLAMAFILE = 0 | ";
|
||||||
#endif
|
#endif
|
||||||
|
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
|
||||||
|
|
||||||
return s.c_str();
|
return s.c_str();
|
||||||
}
|
}
|
||||||
|
|