change to one time init to prevent performance drop

2024-07-27 02:13:24 +09:00 · 2024-07-27 02:13:24 +09:00 · ab8b748188
commit ab8b748188
parent 7cc64329d7
5 changed files with 36 additions and 18 deletions
--- a/examples/gguf-hash/deps/xxhash/xxhash.h
+++ b/examples/gguf-hash/deps/xxhash/xxhash.h
@ -3739,6 +3739,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
 #if defined(__GNUC__) || defined(__clang__)
 #  if defined(__ARM_FEATURE_SVE)
 #    include <arm_sve.h>
+#    include <sys/prctl.h>
+int sve_cnt_b = 0;
 #  endif
 #  if defined(__ARM_NEON__) || defined(__ARM_NEON) \
   || (defined(_M_ARM) && _M_ARM >= 7) \
@ -5495,7 +5497,9 @@ XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
    const uint64_t *xinput = (const uint64_t *)(const void *)input;
    const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
    svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
-    uint64_t element_count = svcntd();
+    if (!sve_cnt_b)
+        sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+    uint64_t element_count = sve_cnt_b / 8;
    if (element_count >= 8) {
        svbool_t mask = svptrue_pat_b64(SV_VL8);
        svuint64_t vacc = svld1_u64(mask, xacc);
@ -5537,7 +5541,9 @@ XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
        const uint64_t *xinput = (const uint64_t *)(const void *)input;
        const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
        svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
-        uint64_t element_count = svcntd();
+        if (!sve_cnt_b)
+            sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+        uint64_t element_count = sve_cnt_b / 8;
        if (element_count >= 8) {
            svbool_t mask = svptrue_pat_b64(SV_VL8);
            svuint64_t vacc = svld1_u64(mask, xacc + 0);
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@ -384,8 +384,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (sve_cnt_b == QK8_0)) &&
                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
    }
 #endif
@ -496,8 +496,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (sve_cnt_b == QK8_0)) &&
                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
    }
 #endif
@ -614,7 +614,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (sve_cnt_b == QK8_0) {
        const void * b_ptr = vx;
        const void * a_ptr = vy;
        float * res_ptr = s;
@ -680,12 +680,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
        return;
    }
    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (sve_cnt_b == QK8_0)) &&
                    "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                    "performance");
    }
    else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                    "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                    "quantization format for optimal performance");
    }
@ -745,8 +745,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (sve_cnt_b == QK8_0)) &&
                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
    }
 #endif
@ -1266,8 +1266,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (sve_cnt_b == QK8_0)) &&
                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
    }
 #endif
@ -1728,7 +1728,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (sve_cnt_b == QK8_0) {
        const void * b_ptr = vx;
        const void * a_ptr = vy;
        float * res_ptr = s;
@ -2139,12 +2139,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
        return;
    }
    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (sve_cnt_b == QK8_0)) &&
                    "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                    "performance");
    }
    else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                    "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                    "quantization format for optimal performance");
    }
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@ -3818,7 +3818,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    float sumf = 0;

 #if defined(__ARM_FEATURE_SVE)
-    if ((PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL)) == QK8_0) {
+    if (sve_cnt_b == QK8_0) {
        const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
        const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);

@ -5291,7 +5291,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    float sumf = 0;

 #if defined(__ARM_FEATURE_SVE)
-    if ((PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL)) == QK8_0) {
+    if (sve_cnt_b == QK8_0) {
        svfloat32_t sumv0 = svdup_n_f32(0.0f);
        svfloat32_t sumv1 = svdup_n_f32(0.0f);

--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@ -127,6 +127,10 @@ void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);

+#if defined(__ARM_FEATURE_SVE)
+extern int sve_cnt_b;
+#endif
+
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@ -37,6 +37,9 @@
 #include <unistd.h>
 #endif

+#if defined(__ARM_FEATURE_SVE)
+int sve_cnt_b = 0;
+#endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@ -3475,6 +3478,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

    ggml_assert_aligned(ctx->mem_buffer);

+#if defined(__ARM_FEATURE_SVE)
+    if (!sve_cnt_b)
+        sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+#endif
+
    GGML_PRINT_DEBUG("%s: context initialized\n", __func__);

    ggml_critical_section_end();