ggml : automatic selection of best CPU backend (#10606)

* ggml : automatic selection of best CPU backend

* amx : minor opt

* add GGML_AVX_VNNI to enable avx-vnni, fix checks
This commit is contained in:
Diego Devesa 2024-12-01 16:12:41 +01:00 committed by GitHub
parent 86dc11c5bc
commit 3420909dff
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 599 additions and 156 deletions

View file

@ -217,6 +217,12 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
endif()
if (GGML_AVX_VNNI)
list(APPEND ARCH_DEFINITIONS __AVXVNNI__)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavxvnni)
endif()
endif()
else()
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
@ -233,6 +239,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
endif()
if (GGML_AVX_VNNI)
list(APPEND ARCH_FLAGS -mavxvnni)
endif()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512dq)
@ -301,6 +310,10 @@ target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}")
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
# the feature detection code must be compiled without any architecture flags
target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp)
# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection
if (EMSCRIPTEN)
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
endif()

View file

@ -78,7 +78,6 @@ inline void parallel_for_ggml(const ggml_compute_params * params, int n, const f
int tbegin, tend;
balance211(n, params->nth, params->ith, tbegin, tend);
f(tbegin, tend);
ggml_barrier(params->threadpool); // TODO: might not always be needed
}
// quantized types that have AMX support

View file

@ -1340,21 +1340,19 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
__m512 vb[COLS];
__m512 vc[ROWS * COLS];
auto loadc = [&](int idx) {
auto loadc = [&](auto idx) {
vc[idx] = _mm512_setzero_ps();
};
Unroll<ROWS * COLS>{}(loadc);
auto compute = [&](int idx, int k) {
// TODO: use `constexpr` here to get rid of interger div
// when upgraded to C++17
const int row = idx / COLS;
const int col = idx % COLS;
auto compute = [&](auto idx, auto k) {
constexpr int row = idx / COLS;
constexpr int col = idx % COLS;
if (col == 0) {
if constexpr (col == 0) {
va = _mm512_loadu_ps(A + row * K + k);
}
if (row == 0) {
if constexpr (row == 0) {
vb[col] = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k)));
}
vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]);
@ -1364,9 +1362,9 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
Unroll<ROWS * COLS>{}(compute, k);
}
auto storec = [&](int idx) {
const int row = idx / COLS;
const int col = idx % COLS;
auto storec = [&](auto idx) {
constexpr int row = idx / COLS;
constexpr int col = idx % COLS;
C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]);
};
Unroll<ROWS * COLS>{}(storec);
@ -1429,14 +1427,14 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
const __m512i off = _mm512_set1_epi8(8);
const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) {
auto compute = [&](auto col, auto i) {
// load a and compute compensation
if (col == 0) {
if constexpr (col == 0) {
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
vcomp = _mm512_setzero_si512();
for (int k = 0; k < 8; ++k) {
@ -1468,7 +1466,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
}
//store to C
auto storec = [&](int col) {
auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
};
Unroll<COLS>{}(storec);
@ -1492,14 +1490,14 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) {
auto compute = [&](auto col, auto i) {
// load a
if (col == 0) {
if constexpr (col == 0) {
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
for (int k = 0; k < 8; ++k) {
va[k] = _mm512_set1_epi32(a_ptr[k]);
@ -1533,7 +1531,7 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
}
//store to C
auto storec = [&](int col) {
auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
};
Unroll<COLS>{}(storec);
@ -1564,14 +1562,14 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
//
const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) {
auto compute = [&](auto col, auto i) {
// load a and add offset 128
if (col == 0) {
if constexpr (col == 0) {
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
for (int k = 0; k < 8; ++k) {
va[k] = _mm512_set1_epi32(a_ptr[k]);
@ -1604,7 +1602,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
}
//store to C
auto storec = [&](int col) {
auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
};
Unroll<COLS>{}(storec);
@ -1636,7 +1634,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
@ -1650,9 +1648,9 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
// int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8
// from {16, 8} to {4, 32}
//
auto compute = [&](int col, int i) {
auto compute = [&](auto col, auto i) {
// load a
if (col == 0) {
if constexpr (col == 0) {
for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
}
@ -1704,7 +1702,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
}
//store to C
auto storec = [&](int col) {
auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
};
Unroll<COLS>{}(storec);
@ -1737,15 +1735,15 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLO
const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
// Q5_K and Q4_K shares the same vnni formats, refer to notes above.
auto compute = [&](int col, int i) {
auto compute = [&](auto col, auto i) {
// load a
if (col == 0) {
if constexpr (col == 0) {
for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
}
@ -1810,7 +1808,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLO
}
//store to C
auto storec = [&](int col) {
auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
};
Unroll<COLS>{}(storec);
@ -1843,13 +1841,13 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q6_K, float, BLOCK_M, BLOCK_N, BLO
const __m512i m32s = _mm512_set1_epi32(32);
const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) {
if (col == 0) {
auto compute = [&](auto col, auto i) {
if constexpr (col == 0) {
// load a
va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0));
va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64));
@ -1961,13 +1959,13 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
const __m512i values256 = _mm512_add_epi8(values128, off);
auto loadc = [&](int col) {
auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps();
};
Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) {
if (col == 0) {
auto compute = [&](auto col, auto i) {
if constexpr (col == 0) {
// load a
va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0));
va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64));
@ -2017,7 +2015,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
}
//store to C
auto storec = [&](int col) {
auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
};
Unroll<COLS>{}(storec);

View file

@ -0,0 +1,298 @@
#include "ggml-cpu.h"
#include "ggml-backend-impl.h"
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <cstring>
#include <vector>
#include <bitset>
#include <array>
#include <string>
struct cpuid_x86 {
bool SSE3(void) { return f_1_ecx[0]; }
bool PCLMULQDQ(void) { return f_1_ecx[1]; }
bool MONITOR(void) { return f_1_ecx[3]; }
bool SSSE3(void) { return f_1_ecx[9]; }
bool FMA(void) { return f_1_ecx[12]; }
bool CMPXCHG16B(void) { return f_1_ecx[13]; }
bool SSE41(void) { return f_1_ecx[19]; }
bool SSE42(void) { return f_1_ecx[20]; }
bool MOVBE(void) { return f_1_ecx[22]; }
bool POPCNT(void) { return f_1_ecx[23]; }
bool AES(void) { return f_1_ecx[25]; }
bool XSAVE(void) { return f_1_ecx[26]; }
bool OSXSAVE(void) { return f_1_ecx[27]; }
bool AVX(void) { return f_1_ecx[28]; }
bool F16C(void) { return f_1_ecx[29]; }
bool RDRAND(void) { return f_1_ecx[30]; }
bool MSR(void) { return f_1_edx[5]; }
bool CX8(void) { return f_1_edx[8]; }
bool SEP(void) { return f_1_edx[11]; }
bool CMOV(void) { return f_1_edx[15]; }
bool CLFSH(void) { return f_1_edx[19]; }
bool MMX(void) { return f_1_edx[23]; }
bool FXSR(void) { return f_1_edx[24]; }
bool SSE(void) { return f_1_edx[25]; }
bool SSE2(void) { return f_1_edx[26]; }
bool FSGSBASE(void) { return f_7_ebx[0]; }
bool BMI1(void) { return f_7_ebx[3]; }
bool HLE(void) { return is_intel && f_7_ebx[4]; }
bool AVX2(void) { return f_7_ebx[5]; }
bool BMI2(void) { return f_7_ebx[8]; }
bool ERMS(void) { return f_7_ebx[9]; }
bool INVPCID(void) { return f_7_ebx[10]; }
bool RTM(void) { return is_intel && f_7_ebx[11]; }
bool AVX512F(void) { return f_7_ebx[16]; }
bool RDSEED(void) { return f_7_ebx[18]; }
bool ADX(void) { return f_7_ebx[19]; }
bool AVX512PF(void) { return f_7_ebx[26]; }
bool AVX512ER(void) { return f_7_ebx[27]; }
bool AVX512CD(void) { return f_7_ebx[28]; }
bool SHA(void) { return f_7_ebx[29]; }
bool PREFETCHWT1(void) { return f_7_ecx[0]; }
bool LAHF(void) { return f_81_ecx[0]; }
bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
bool ABM(void) { return is_amd && f_81_ecx[5]; }
bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
bool XOP(void) { return is_amd && f_81_ecx[11]; }
bool TBM(void) { return is_amd && f_81_ecx[21]; }
bool SYSCALL(void) { return is_intel && f_81_edx[11]; }
bool MMXEXT(void) { return is_amd && f_81_edx[22]; }
bool RDTSCP(void) { return is_intel && f_81_edx[27]; }
bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
bool AVX512_VBMI(void) { return f_7_ecx[1]; }
bool AVX512_VNNI(void) { return f_7_ecx[11]; }
bool AVX512_FP16(void) { return f_7_edx[23]; }
bool AVX512_BF16(void) { return f_7_1_eax[5]; }
bool AVX_VNNI(void) { return f_7_1_eax[4]; }
bool AMX_TILE(void) { return f_7_edx[24]; }
bool AMX_INT8(void) { return f_7_edx[25]; }
bool AMX_FP16(void) { return f_7_1_eax[21]; }
bool AMX_BF16(void) { return f_7_edx[22]; }
#ifdef _MSC_VER
static void cpuid(int cpu_info[4], int eax) {
__cpuid(cpu_info, eax);
}
static void cpuidex(int cpu_info[4], int eax, int ecx) {
__cpuidex(cpu_info, eax, ecx);
}
#else
static void cpuid(int cpu_info[4], int eax) {
__asm__ __volatile__(
"cpuid"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(eax), "c"(0));
}
static void cpuidex(int cpu_info[4], int eax, int ecx) {
__asm__ __volatile__(
"cpuid"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(eax), "c"(ecx));
}
#endif
cpuid_x86() {
std::array<int, 4> cpui;
std::vector<std::array<int, 4>> data;
// calling __cpuid with 0x0 as the function_id argument
// gets the number of the highest valid function ID.
cpuid(cpui.data(), 0);
int n_ids = cpui[0];
for (int i = 0; i <= n_ids; ++i) {
cpuidex(cpui.data(), i, 0);
data.push_back(cpui);
}
// capture vendor string
char vendor[0x20] = {};
*reinterpret_cast<int *>(vendor) = data[0][1];
*reinterpret_cast<int *>(vendor + 4) = data[0][3];
*reinterpret_cast<int *>(vendor + 8) = data[0][2];
this->vendor = vendor;
if (this->vendor == "GenuineIntel") {
is_intel = true;
} else if (this->vendor == "AuthenticAMD") {
is_amd = true;
}
// load bitset with flags for function 0x00000001
if (n_ids >= 1) {
f_1_ecx = data[1][2];
f_1_edx = data[1][3];
}
// load bitset with flags for function 0x00000007
if (n_ids >= 7) {
f_7_ebx = data[7][1];
f_7_ecx = data[7][2];
f_7_edx = data[7][3];
cpuidex(cpui.data(), 7, 1);
f_7_1_eax = cpui[0];
}
// calling __cpuid with 0x80000000 as the function_id argument
// gets the number of the highest valid extended ID.
cpuid(cpui.data(), 0x80000000);
unsigned int n_ex_ids = cpui[0];
std::vector<std::array<int, 4>> ext_data;
for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
cpuidex(cpui.data(), i, 0);
ext_data.push_back(cpui);
}
// load bitset with flags for function 0x80000001
if (n_ex_ids >= 0x80000001) {
f_81_ecx = ext_data[1][2];
f_81_edx = ext_data[1][3];
}
// interpret CPU brand string if reported
char brand[0x40] = {};
if (n_ex_ids >= 0x80000004) {
std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
this->brand = brand;
}
}
bool is_intel = false;
bool is_amd = false;
std::string vendor;
std::string brand;
std::bitset<32> f_1_ecx;
std::bitset<32> f_1_edx;
std::bitset<32> f_7_ebx;
std::bitset<32> f_7_ecx;
std::bitset<32> f_7_edx;
std::bitset<32> f_7_1_eax;
std::bitset<32> f_81_ecx;
std::bitset<32> f_81_edx;
};
#if 0
void test_x86_is() {
cpuid_x86 is;
printf("CPU Vendor: %s\n", is.vendor.c_str());
printf("Brand: %s\n", is.brand.c_str());
printf("is_intel: %d\n", is.is_intel);
printf("is_amd: %d\n", is.is_amd);
printf("sse3: %d\n", is.SSE3());
printf("pclmulqdq: %d\n", is.PCLMULQDQ());
printf("ssse3: %d\n", is.SSSE3());
printf("fma: %d\n", is.FMA());
printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
printf("sse41: %d\n", is.SSE41());
printf("sse42: %d\n", is.SSE42());
printf("movbe: %d\n", is.MOVBE());
printf("popcnt: %d\n", is.POPCNT());
printf("aes: %d\n", is.AES());
printf("xsave: %d\n", is.XSAVE());
printf("osxsave: %d\n", is.OSXSAVE());
printf("avx: %d\n", is.AVX());
printf("f16c: %d\n", is.F16C());
printf("rdrand: %d\n", is.RDRAND());
printf("msr: %d\n", is.MSR());
printf("cx8: %d\n", is.CX8());
printf("sep: %d\n", is.SEP());
printf("cmov: %d\n", is.CMOV());
printf("clflush: %d\n", is.CLFSH());
printf("mmx: %d\n", is.MMX());
printf("fxsr: %d\n", is.FXSR());
printf("sse: %d\n", is.SSE());
printf("sse2: %d\n", is.SSE2());
printf("fsgsbase: %d\n", is.FSGSBASE());
printf("bmi1: %d\n", is.BMI1());
printf("hle: %d\n", is.HLE());
printf("avx2: %d\n", is.AVX2());
printf("bmi2: %d\n", is.BMI2());
printf("erms: %d\n", is.ERMS());
printf("invpcid: %d\n", is.INVPCID());
printf("rtm: %d\n", is.RTM());
printf("avx512f: %d\n", is.AVX512F());
printf("rdseed: %d\n", is.RDSEED());
printf("adx: %d\n", is.ADX());
printf("avx512pf: %d\n", is.AVX512PF());
printf("avx512er: %d\n", is.AVX512ER());
printf("avx512cd: %d\n", is.AVX512CD());
printf("sha: %d\n", is.SHA());
printf("prefetchwt1: %d\n", is.PREFETCHWT1());
printf("lahf: %d\n", is.LAHF());
printf("lzcnt: %d\n", is.LZCNT());
printf("abm: %d\n", is.ABM());
printf("sse4a: %d\n", is.SSE4a());
printf("xop: %d\n", is.XOP());
printf("tbm: %d\n", is.TBM());
printf("syscall: %d\n", is.SYSCALL());
printf("mmxext: %d\n", is.MMXEXT());
printf("rdtscp: %d\n", is.RDTSCP());
printf("3dnowext: %d\n", is._3DNOWEXT());
printf("3dnow: %d\n", is._3DNOW());
printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
printf("avx512_vnni: %d\n", is.AVX512_VNNI());
printf("avx512_fp16: %d\n", is.AVX512_FP16());
printf("avx512_bf16: %d\n", is.AVX512_BF16());
printf("amx_tile: %d\n", is.AMX_TILE());
printf("amx_int8: %d\n", is.AMX_INT8());
printf("amx_fp16: %d\n", is.AMX_FP16());
printf("amx_bf16: %d\n", is.AMX_BF16());
}
#endif
static int ggml_backend_cpu_x86_score() {
// FIXME: this does not check for OS support
cpuid_x86 is;
// if the CPU backend was built with any features not supported by the current CPU, it cannot be used
if (ggml_cpu_has_fma() && !is.FMA()) { return 0; }
if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; }
if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; }
if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; }
if (ggml_cpu_has_avx() && !is.AVX()) { return 0; }
if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; }
if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; }
if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; }
if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; }
if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; }
if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; }
if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; }
// calculate a backend score based on the supported features
// more important features have a higher weight
int score = 0;
score += ggml_cpu_has_fma () * 1;
score += ggml_cpu_has_f16c () * 1<<1;
score += ggml_cpu_has_ssse3 () * 1<<2;
score += ggml_cpu_has_sse3 () * 1<<3;
score += ggml_cpu_has_avx_vnni () * 1<<4;
score += ggml_cpu_has_avx () * 1<<5;
score += ggml_cpu_has_avx2 () * 1<<6;
score += ggml_cpu_has_avx512 () * 1<<7;
// score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used
score += ggml_cpu_has_avx512_bf16() * 1<<9;
score += ggml_cpu_has_avx512_vnni() * 1<<10;
score += ggml_cpu_has_amx_int8 () * 1<<11;
return score;
}
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))

View file

@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
}
static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
#if defined(__AVX512VNNI__)
const __m512i zero = _mm512_setzero_si512();
return _mm512_dpbusd_epi32(zero, ax, sy);
#else