diff --git a/ggml-aarch64.c b/ggml-aarch64.c index b1a2e0148..834796094 100644 --- a/ggml-aarch64.c +++ b/ggml-aarch64.c @@ -33,11 +33,11 @@ // from bias offset form to pure sign form (this saves subtract // operations durin unpacking) // -static block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask) { +static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int block_len, unsigned int xor_mask) { block_q4_0x4 out; for (int i = 0; i < 4; i++) { - out.d[i] = in[i]->d; + out.d[i] = in[i].d; } for (int i = 0; i < QK4_0 * 2; i++) { @@ -45,7 +45,7 @@ static block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned i int src_id = (i % (4 * block_len)) / block_len; src_offset += (i % block_len); - out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; + out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask; } return out; @@ -55,11 +55,11 @@ static block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned i // returns an interleaved block_q4_0x8 // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks // first, then interleave quants from 8 block_q4_0s in blocks of block_len -static block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask) { +static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int block_len, unsigned int xor_mask) { block_q4_0x8 out; for (int i = 0; i < 8; i++) { - out.d[i] = in[i]->d; + out.d[i] = in[i].d; } for (int i = 0; i < QK4_0 * 4; i++) { @@ -67,7 +67,7 @@ static block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned i int src_id = (i % (8 * block_len)) / block_len; src_offset += (i % block_len); - out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; + out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask; } return out; @@ -134,6 +134,8 @@ void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); } } +#else + assert(false); #endif } @@ -222,6 +224,8 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); } } +#else + assert(false); #endif } @@ -229,45 +233,33 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds assert(n_per_row % QK4_0 == 0); const int nb = n_per_row / QK4_0; - void * out_ptr_B = NULL; - void * out_ptr_B_start = NULL; + void * out_ptr = NULL; if (nrows_interleaved == 8) { - out_ptr_B = (block_q4_0x8 *) malloc(sizeof(block_q4_0x8) * nb); - out_ptr_B_start = out_ptr_B; + out_ptr = (block_q4_0x8 *) dst; } else if (nrows_interleaved == 4) { - out_ptr_B = (block_q4_0x4 *) malloc(sizeof(block_q4_0x4) * nb); - out_ptr_B_start = out_ptr_B; + out_ptr = (block_q4_0x4 *) dst; } - block_q4_0 ** in_ptrs = (block_q4_0 **) malloc(sizeof(block_q4_0 *) * nrows_interleaved); + block_q4_0 dst_tmp[nrows_interleaved]; for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { - for (int i = 0; i < nrows_interleaved; i++ ) { - in_ptrs[i] = (block_q4_0 *) dst + (b + i * n_per_row) / QK4_0; - quantize_row_q4_0_reference(src + b + i * n_per_row, (block_q4_0 *) in_ptrs[i], n_per_row); - } - for (int64_t x = 0; x < nb; x++) { + + for (int i = 0; i < nrows_interleaved; i++ ) { + quantize_row_q4_0_reference(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0); + } + if (nrows_interleaved == 8) { - *(block_q4_0x8 *) out_ptr_B = make_block_q4_0x8((const block_q4_0 * const *) in_ptrs, blocklen_per_row, 0x88); - 
out_ptr_B = (block_q4_0x8 *) out_ptr_B + 1; + *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blocklen_per_row, 0x88); + out_ptr = (block_q4_0x8 *) out_ptr + 1; } else if (nrows_interleaved == 4) { - *(block_q4_0x4 *) out_ptr_B = make_block_q4_0x4((const block_q4_0 * const *) in_ptrs, blocklen_per_row, 0x88); - out_ptr_B = (block_q4_0x4 *) out_ptr_B + 1; - } - - for (int i = 0; i < nrows_interleaved; i++) { - in_ptrs[i]++; + *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blocklen_per_row, 0x88); + out_ptr = (block_q4_0x4 *) out_ptr + 1; } } - out_ptr_B = out_ptr_B_start; - if (nrows_interleaved == 8) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x8) * nb); - else if (nrows_interleaved == 4) memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0x4) * nb); } - if (out_ptr_B_start) free(out_ptr_B_start); - if (in_ptrs) free(in_ptrs); return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0)); } @@ -302,25 +294,24 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_ } } -inline int64_t roundup(const int64_t a, const int64_t b) { - int64_t rem = a % b; +void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; - if (rem) { - return a + b - rem; - } else { - return a; - } -} + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); -void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { - UNUSED(n); UNUSED(s); + UNUSED(bs); UNUSED(vx); UNUSED(vy); UNUSED(nr); UNUSED(nc); - UNUSED(ith); - UNUSED(nth); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) if (svcntw() == 8) { @@ -332,19 +323,9 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); #elif defined(__ARM_NEON) - int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * b_ptr = vx; const void * a_ptr = vy; - float * res_ptr = s + x0; - - assert(n % 32 == 0); - assert(width % 4 == 0); - - size_t num_blocks = n / 32; + float * res_ptr = s; __asm__ __volatile__( "movi v31.16b, #0x4\n" @@ -353,7 +334,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx "1:" // Column loop "add x22, %x[a_ptr], #0x2\n" "movi v29.16b, #0x0\n" - "mov x21, %x[num_blocks]\n" + "mov x21, %x[nb]\n" "2:" // Block loop "ldr q28, [%x[b_ptr], #0x0]\n" "ldr q27, [x22, #0x0]\n" @@ -390,26 +371,58 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx "scvtf v26.4s, v26.4s, #0x4\n" "fmla v29.4s, v26.4s, v16.4s\n" "cbnz x21, 2b\n" - "sub %x[width], %x[width], #0x4\n" + "sub %x[nc], %x[nc], #0x4\n" "str q29, [%x[res_ptr], #0x0]\n" "add %x[res_ptr], %x[res_ptr], #0x10\n" - "cbnz %x[width], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) - : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + "cbnz %x[nc], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" 
(res_ptr), [nc] "+&r" (nc) + : [a_ptr] "r" (a_ptr), [nb] "r" (nb) : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22" ); +#else + float sumf[4]; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + int sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } #endif } -void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { - UNUSED(n); +void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + UNUSED(s); + UNUSED(bs); UNUSED(vx); UNUSED(vy); UNUSED(nr); UNUSED(nc); - UNUSED(ith); - UNUSED(nth); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) if (svcntw() == 8) { @@ -418,19 +431,9 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx } #endif #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * b_ptr = vx; const void * a_ptr = vy; - float * res_ptr = s + x0; - - assert(n % 32 == 0); - assert(width % 4 == 0); - - size_t num_blocks = n / 32; + float * res_ptr = s; __asm__ __volatile__( "movi v2.16b, #0x4\n" @@ -439,7 +442,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx "1:" // Column loop "add x23, %x[a_ptr], #0x2\n" "movi v0.16b, #0x0\n" - "mov x22, %x[num_blocks]\n" + "mov x22, %x[nb]\n" "2:" // Block loop "ldr q31, [%x[b_ptr], #0x0]\n" "ldr q30, [%x[b_ptr], #0x10]\n" @@ -481,46 +484,68 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx "scvtf v29.4s, v29.4s, #0x4\n" "fmla v0.4s, v29.4s, v16.4s\n" "cbnz x22, 2b\n" - "sub %x[width], %x[width], #0x4\n" + "sub %x[nc], %x[nc], #0x4\n" "str q0, [%x[res_ptr], #0x0]\n" "add %x[res_ptr], %x[res_ptr], #0x10\n" - "cbnz %x[width], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) - : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + "cbnz %x[nc], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) + : [a_ptr] "r" (a_ptr), [nb] "r" (nb) : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", 
"v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23" ); #elif defined(__ARM_NEON) GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "performance"); +#else + float sumf[4]; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + int sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } #endif } -void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { - UNUSED(n); +void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + UNUSED(s); + UNUSED(bs); UNUSED(vx); UNUSED(vy); UNUSED(nr); UNUSED(nc); - UNUSED(ith); - UNUSED(nth); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) if (svcntw() == 8) { - int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * b_ptr = vx; const void * a_ptr = vy; - float * res_ptr = s + x0; - - assert(n % 32 == 0); - assert(width % 8 == 0); - - size_t num_blocks = n / 32; + float * res_ptr = s; __asm__ __volatile__( "ptrue p0.b\n" @@ -528,7 +553,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx "1:" // Column loop "add x22, %x[a_ptr], #0x2\n" "mov z31.b, #0x0\n" - "mov x21, %x[num_blocks]\n" + "mov x21, %x[nb]\n" "2:" // Block loop "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" @@ -572,12 +597,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx "scvtf z17.s, p0/m, z17.s\n" "fmla z31.s, p0/M, z17.s, z18.s\n" "cbnz x21, 2b\n" - "sub %x[width], %x[width], #0x8\n" + "sub %x[nc], %x[nc], #0x8\n" "st1w { z31.s }, p0, [%x[res_ptr]]\n" "add %x[res_ptr], %x[res_ptr], #0x20\n" - "cbnz %x[width], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) - : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + "cbnz %x[nc], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) + : [a_ptr] "r" (a_ptr), [nb] "r" (nb) : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); return; @@ -600,18 +625,51 @@ void 
ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "performance"); +#else + float sumf[8]; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + int sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } #endif } -void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { - UNUSED(n); +void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + UNUSED(s); + UNUSED(bs); UNUSED(vx); UNUSED(vy); UNUSED(nr); UNUSED(nc); - UNUSED(ith); - UNUSED(nth); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) if (svcntw() == 8) { @@ -623,36 +681,26 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); #elif defined(__ARM_NEON) - int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0/4) * nb)); + const void * b_ptr = vx; const void * a_ptr = vy; - float * res_ptr = s + x0; - size_t res_stride = nc * sizeof(float); - - assert(n % 32 == 0); - assert(width % 4 == 0); - - size_t num_blocks = n / 32; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); __asm__ __volatile__( "mov x10, %x[nr]\n" "mov x9, #0x88\n" "cmp x10, #0x10\n" - "mul x9, %x[num_blocks], x9\n" + "mul x9, %x[nb], x9\n" "blt 4f\n" "1:" // Row loop "add x28, %x[b_ptr], #0x8\n" - "mov x27, %x[width]\n" + "mov x27, %x[nc]\n" "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" "2:" // Column loop "add x25, %x[a_ptr], #0x8\n" "movi v15.16b, #0x0\n" "movi v19.16b, #0x0\n" - "mov x24, %x[num_blocks]\n" + "mov x24, %x[nb]\n" "add x23, x25, x9\n" "movi v18.16b, #0x0\n" "movi v14.16b, #0x0\n" @@ -972,13 +1020,13 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx "cbz x10, 9f\n" "5:" // Row tail: Row loop "add x24, %x[b_ptr], #0x8\n" - "mov x23, %x[width]\n" + "mov x23, %x[nc]\n" "add x22, 
%x[res_ptr], %x[res_stride], LSL #2\n" "6:" // Row tail: Column loop "movi v15.16b, #0x0\n" "movi v19.16b, #0x0\n" "add x25, %x[a_ptr], #0x8\n" - "mov x21, %x[num_blocks]\n" + "mov x21, %x[nb]\n" "movi v18.16b, #0x0\n" "movi v14.16b, #0x0\n" "7:" // Row tail: Block loop @@ -1085,21 +1133,63 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, const void * restrict vx "bgt 5b\n" "9:" // Row tail: Row loop skip : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); +#else + float sumf[4][4]; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + int sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } #endif } -void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { - UNUSED(n); +void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + UNUSED(s); + UNUSED(bs); UNUSED(vx); UNUSED(vy); UNUSED(nr); UNUSED(nc); - UNUSED(ith); - UNUSED(nth); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) if (svcntw() == 8) { @@ -1108,36 +1198,26 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx } #endif #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - int64_t x0 = roundup((ith * nc) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)4); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * b_ptr = vx; const void * a_ptr = vy; - float * res_ptr = s + x0; - size_t res_stride = nc * sizeof(float); - - assert(n % 32 == 0); - assert(width % 4 
== 0); - - size_t num_blocks = n / 32; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); __asm__ __volatile__( "mov x10, %x[nr]\n" "mov x9, #0x88\n" "cmp x10, #0x10\n" - "mul x9, %x[num_blocks], x9\n" + "mul x9, %x[nb], x9\n" "blt 4f\n" "1:" // Row loop "add x28, %x[b_ptr], #0x8\n" - "mov x27, %x[width]\n" + "mov x27, %x[nc]\n" "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" "2:" // Column loop "add x25, %x[a_ptr], #0x8\n" "movi v2.16b, #0x0\n" "movi v10.16b, #0x0\n" - "mov x24, %x[num_blocks]\n" + "mov x24, %x[nb]\n" "add x23, x25, x9\n" "movi v12.16b, #0x0\n" "movi v28.16b, #0x0\n" @@ -1409,13 +1489,13 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx "cbz x10, 9f\n" "5:" // Row tail: Row loop "add x24, %x[b_ptr], #0x8\n" - "mov x23, %x[width]\n" + "mov x23, %x[nc]\n" "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" "6:" // Row tail: Column loop "movi v2.16b, #0x0\n" "movi v10.16b, #0x0\n" "add x25, %x[a_ptr], #0x8\n" - "mov x21, %x[num_blocks]\n" + "mov x21, %x[nb]\n" "movi v12.16b, #0x0\n" "movi v28.16b, #0x0\n" "7:" // Row tail: Block loop @@ -1510,42 +1590,74 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, const void * restrict vx "bgt 5b\n" "9:" // Row tail: Row loop skip : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); #elif defined(__ARM_NEON) GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "performance"); +#else + float sumf[4][4]; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + int sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } #endif } -void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy, int nr, int nc, int ith, int nth) { - UNUSED(n); +void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int 
nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + UNUSED(s); + UNUSED(bs); UNUSED(vx); UNUSED(vy); UNUSED(nr); UNUSED(nc); - UNUSED(ith); - UNUSED(nth); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) if (svcntw() == 8) { - int64_t x0 = roundup((ith * nc) / nth, (int64_t)8); - int64_t xend = roundup(((ith + 1) * nc) / nth, (int64_t)8); - size_t width = xend - x0; - - int64_t nb = n / QK4_0; - const void * b_ptr = (const void *)((const block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * b_ptr = vx; const void * a_ptr = vy; - float * res_ptr = s + x0; - size_t res_stride = nc * sizeof(float); - - assert(n % 32 == 0); - assert(width % 8 == 0); - - size_t num_blocks = n / 32; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); __asm__ __volatile__( "mov x20, #0x4\n" @@ -1555,17 +1667,17 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx "ptrue p1.b\n" "whilelt p0.s, XZR, x20\n" "cmp x13, #0x10\n" - "mul x12, %x[num_blocks], x12\n" + "mul x12, %x[nb], x12\n" "blt 4f\n" "1:" // Row loop "add x11, %x[b_ptr], #0x10\n" - "mov x10, %x[width]\n" + "mov x10, %x[nc]\n" "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" "2:" // Column loop "add x28, %x[a_ptr], #0x8\n" "mov z24.b, #0x0\n" "mov z15.b, #0x0\n" - "mov x27, %x[num_blocks]\n" + "mov x27, %x[nb]\n" "add x26, x28, x12\n" "mov z12.b, #0x0\n" "mov z0.b, #0x0\n" @@ -1844,13 +1956,13 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx "cbz x13, 9f\n" "5:" // Row tail: Row loop "add x25, %x[b_ptr], #0x10\n" - "mov x24, %x[width]\n" + "mov x24, %x[nc]\n" "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" "6:" // Row tail: Column loop "mov z24.b, #0x0\n" "mov z15.b, #0x0\n" "add x28, %x[a_ptr], #0x8\n" - "mov x22, %x[num_blocks]\n" + "mov x22, %x[nb]\n" "mov z12.b, #0x0\n" "mov z0.b, #0x0\n" "7:" // Row tail: Block loop @@ -1946,7 +2058,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx "bgt 5b\n" "9:" // Row tail: Row loop skip : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); return; @@ -1969,5 +2081,37 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, const void * restrict vx GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "performance"); +#else + float sumf[4][8]; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < 
nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + int sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } #endif } diff --git a/ggml-aarch64.h b/ggml-aarch64.h index d4d4dd01b..53f9d518d 100644 --- a/ggml-aarch64.h +++ b/ggml-aarch64.h @@ -22,14 +22,14 @@ size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT d size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); // GEMV -void ggml_gemv_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemv_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemv_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemv_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); // GEMM -void ggml_gemm_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemm_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); -void ggml_gemm_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, int ith, int nth); +void ggml_gemm_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); #ifdef __cplusplus } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 7cfd74a7e..0c526c47e 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2397,7 +2397,6 @@ extern "C" { GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_matmul_int8(void); - GGML_API int ggml_cpu_has_sve (void); // // Internal types and functions exposed for tests and benchmarks @@ -2414,10 +2413,10 @@ extern "C" { typedef void 
(*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, - int nr, int nc, int ith, int nth); - typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, - int nr, int nc, int ith, int nth); + typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, + const void * GGML_RESTRICT vy, int nr, int nc); + typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, + const void * GGML_RESTRICT vy, int nr, int nc); typedef struct { const char * type_name; @@ -2430,6 +2429,7 @@ extern "C" { ggml_vec_dot_t vec_dot; enum ggml_type vec_dot_type; int64_t nrows; // number of rows to process simultaneously; + int64_t ncols; // number of columns to process simultaneously; ggml_from_float_to_mat_t from_float_to_mat; ggml_gemv_t gemv; ggml_gemm_t gemm; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7400a0ec0..1f6b5127d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -916,6 +916,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = NULL, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, + .ncols = 4, .gemv = ggml_gemv_q4_0_4x4_q8_0, .gemm = ggml_gemm_q4_0_4x4_q8_0, }, @@ -930,6 +931,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = NULL, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, + .ncols = 4, .gemv = ggml_gemv_q4_0_4x8_q8_0, .gemm = ggml_gemm_q4_0_4x8_q8_0, }, @@ -944,6 +946,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = NULL, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, + .ncols = 8, .gemv = ggml_gemv_q4_0_8x8_q8_0, .gemm = ggml_gemm_q4_0_8x8_q8_0, } @@ -12203,6 +12206,7 @@ static void ggml_compute_forward_mul_mat( enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; int64_t const vec_dot_num_rows = type_traits[type].nrows; + int64_t const matmul_num_cols = type_traits[type].ncols; ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat; ggml_gemv_t const gemv = type_traits[type].gemv; @@ -12372,32 +12376,49 @@ UseGgmlGemm2:; const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11; + int64_t src0_start = (ith * ne01) / nth; + int64_t src0_end = ((ith + 1) * ne01) / nth; + src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start; + src0_end = (src0_end % matmul_num_cols) ? 
src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end; + if ((ggml_n_dims(src0) == 2) && gemm && gemv) { - if (ne11 == 1) gemv(ne00, (float *)((char *) dst->data), (const char *) src0->data, (const char *) src1_wdata, 1, ne01, ith, nth); + if (src0_start >= src0_end) return; + if (ne11 == 1) + gemv(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata, 1, src0_end - src0_start); else { for (int iter = 0; iter < ne11 / 16; iter++) { - gemm(ne00, (float *)((char *) dst->data + (iter * 16 * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * iter * 16), 16, ne01, ith, nth); + gemm(ne00, (float *)((char *) dst->data + (iter * 16 * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter * 16), 16, + src0_end - src0_start); } int rows_processed = (ne11 / 16) * 16; for (int iter = 0; iter < (ne11 - rows_processed) / 8; iter++) { - gemm(ne00, (float *)((char *) dst->data + ((rows_processed + iter * 8) * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * (rows_processed + iter * 8)), 8, ne01, ith, nth); + gemm(ne00, (float *)((char *) dst->data + ((rows_processed + iter * 8) * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata + (src1_col_stride * (rows_processed + iter * 8)), 8, src0_end - src0_start); } rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; for (int iter = 0; iter < (ne11 - rows_processed) / 4; iter++) { - gemm(ne00, (float *)((char *) dst->data + ((rows_processed + iter * 4) * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * (rows_processed + iter * 4)), 4, ne01, ith, nth); + gemm(ne00, (float *)((char *) dst->data + ((rows_processed + iter * 4) * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata + (src1_col_stride * (rows_processed + iter * 4)), 4, src0_end - src0_start); } rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; for (int iter = rows_processed; iter < ne11; iter++) { - gemv(ne00, (float *)((char *) dst->data + (iter * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * iter), 1, ne01, ith, nth); + gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, + src0_end - src0_start); } } - } - else if ((ggml_n_dims(src0) == 2) && gemv) { + } else if ((ggml_n_dims(src0) == 2) && gemv) { + if (src0_start >= src0_end) return; for (int iter = 0; iter < ne11; iter++) { - gemv(ne00, (float *)((char *) dst->data + (iter * nb1)), (const char *) src0->data, (const char *) src1_wdata + (src1_col_stride * iter), 1, ne01, ith, nth); + gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, + src0_end - src0_start); } - } - else { + } else { // The first chunk comes from our thread_id, the rest will get auto-assigned. 
int current_chunk = ith; diff --git a/src/llama.cpp b/src/llama.cpp index 0adb0afae..22cd387c5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -21693,7 +21693,6 @@ const char * llama_print_system_info(void) { #else s += "LLAMAFILE = 0 | "; #endif - s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | "; return s.c_str(); }
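
Two standalone C sketches follow for reference; they are illustrative only and not part of the patch itself.

Sketch 1: the per-thread column partitioning that replaces the removed roundup() helper and the ith/nth arguments. Each thread's [src0_start, src0_end) range is rounded up to a multiple of type_traits[type].ncols so gemv/gemm always receive whole interleaved column groups. The two rounding expressions are copied from ggml_compute_forward_mul_mat above; the values of ne01 and nth and the printf driver are made up for illustration.

/* sketch only: demonstrates the chunking arithmetic used in ggml_compute_forward_mul_mat */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t ne01            = 96;  // hypothetical src0 row count (output columns)
    const int     nth             = 5;   // hypothetical thread count
    const int64_t matmul_num_cols = 8;   // e.g. Q4_0_8_8 -> type_traits[type].ncols == 8

    for (int ith = 0; ith < nth; ith++) {
        int64_t src0_start = (ith * ne01) / nth;
        int64_t src0_end   = ((ith + 1) * ne01) / nth;
        // round both bounds up to a multiple of the interleave width, as in the
        // patch, so each thread hands gemv/gemm whole block_q4_0x4/x8 column groups
        src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols) : src0_start;
        src0_end   = (src0_end   % matmul_num_cols) ? src0_end   + matmul_num_cols - (src0_end   % matmul_num_cols) : src0_end;
        if (src0_start >= src0_end) {
            printf("thread %d: no work\n", ith);
            continue;
        }
        printf("thread %d: columns [%3lld, %3lld)\n", ith, (long long) src0_start, (long long) src0_end);
    }
    return 0;
}

Sketch 2: the scalar fallback paths added above decode quants that were xored with 0x88 during interleaving (bias-offset form to signed form) using a shift trick: v0 = (int8_t)(qs << 4), v1 = (int8_t)(qs & 0xF0), then the products are shifted right by 4. A small self-contained check of that identity; the Q4_0 bias-offset packing (value + 8) is standard ggml behaviour, and the activation values a0/a1 are arbitrary.

/* sketch only: verifies the nibble-decoding identity used in the generic fallback loops */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    for (int lo = -8; lo <= 7; lo++) {       // signed value stored in the low nibble
        for (int hi = -8; hi <= 7; hi++) {   // signed value stored in the high nibble
            // block_q4_0 stores value + 8 in each nibble (bias-offset form)
            uint8_t packed = (uint8_t) (((hi + 8) << 4) | (lo + 8));
            // make_block_q4_0x4 / make_block_q4_0x8 xor every byte with 0x88
            uint8_t qs = packed ^ 0x88;
            // decoding as in the ggml_gemv/ggml_gemm fallback paths
            int v0 = (int8_t) (qs << 4);     // lo * 16
            int v1 = (int8_t) (qs & 0xF0);   // hi * 16
            assert((v0 >> 4) == lo && (v1 >> 4) == hi);
            // the product sum is a multiple of 16, so the final >> 4 is exact
            int a0 = 57, a1 = -23;           // arbitrary q8_0 activations
            assert(((v0 * a0 + v1 * a1) >> 4) == lo * a0 + hi * a1);
        }
    }
    printf("nibble decode ok\n");
    return 0;
}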