Merge branch 'patch-2' of https://github.com/FirstTimeEZ/llama.cpp into patch-2
commit 1cb0e11529

11 changed files with 148 additions and 77 deletions
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 
.github/workflows/build.yml (vendored): 21 changes
@@ -414,6 +414,27 @@ jobs:
           cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
           cmake --build build2 --config Release -j $(nproc)
 
+  ubuntu-22-cmake-musa:
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+      - name: Build with native CMake MUSA support
+        id: cmake_build
+        run: |
+          cmake -B build -S . -DGGML_MUSA=ON
+          cmake --build build --config Release -j $(nproc)
+
   ubuntu-22-cmake-sycl:
     runs-on: ubuntu-22.04
 
Makefile: 8 changes
@@ -359,6 +359,10 @@ ifdef LLAMA_SERVER_SSL
	MK_LDFLAGS += -lssl -lcrypto
 endif
 
+ifndef GGML_NO_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+endif
+
 # warnings
 WARN_FLAGS = \
	-Wall \
@@ -940,10 +944,6 @@ ggml/src/ggml-cuda/%.o: \
	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
 endif # GGML_MUSA
 
-ifndef GGML_NO_CPU_AARCH64
-	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
-endif
-
 ifdef GGML_METAL
	MK_CPPFLAGS += -DGGML_USE_METAL
	MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
@@ -143,14 +143,23 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
     if (GGML_AVX512_VBMI)
         add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
         add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+        if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+            list(APPEND ARCH_FLAGS -mavx512vbmi)
+        endif()
     endif()
     if (GGML_AVX512_VNNI)
         add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
         add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+        if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+            list(APPEND ARCH_FLAGS -mavx512vnni)
+        endif()
     endif()
     if (GGML_AVX512_BF16)
         add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
         add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+        if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+            list(APPEND ARCH_FLAGS -mavx512bf16)
+        endif()
     endif()
     if (GGML_AMX_TILE)
         add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
@@ -211,10 +220,13 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
-    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
-                    OUTPUT_VARIABLE POWER10_M)
-    string(FIND ${POWER10_M} "POWER10" substring_index)
-    if(${substring_index} GREATER_EQUAL 0)
+    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
+    string(FIND "${POWER10_M}" "POWER10" substring_index)
+    if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
+        set(substring_index -1)
+    endif()
+
+    if (${substring_index} GREATER_EQUAL 0)
         list(APPEND ARCH_FLAGS -mcpu=power10)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
         list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
@@ -150,6 +150,28 @@ static inline __m128i packNibbles( __m256i bytes )
 #endif
 }
+#elif defined(__AVX__)
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m128i lowByte = _mm_set1_epi16( 0xFF );
+    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+    __m128i low = _mm_and_si128( lowByte, bytes1 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes1 = _mm_or_si128( low, high );
+    high = _mm_andnot_si128( lowByte, bytes2 );
+    low = _mm_and_si128( lowByte, bytes2 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes2 = _mm_or_si128( low, high );
+
+    return _mm_packus_epi16( bytes1, bytes2);
+}
+
+static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+    const __m128i ax = _mm_sign_epi8(x, x);
+    const __m128i sy = _mm_sign_epi8(y, x);
+    return _mm_maddubs_epi16(ax, sy);
+}
 
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
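As a reading aid for the two SSE helpers added above, here is a scalar sketch of what each computes per lane (illustrative only, not part of the commit):

#include <stdint.h>

// packNibbles: each 16-bit lane 0000_abcd_0000_efgh collapses to the byte
// abcd_efgh; _mm_packus_epi16 then narrows both inputs into one vector.
static inline uint8_t pack_nibbles_scalar(uint16_t lane) {
    const uint8_t hi = (uint8_t)((lane >> 8) & 0x0F); // abcd, from the high byte
    const uint8_t lo = (uint8_t)(lane & 0x0F);        // efgh, from the low byte
    return (uint8_t)((hi << 4) | lo);
}

// mul_add_epi8_sse: signed 8-bit multiply with pairwise add into 16-bit lanes.
// _mm_sign_epi8 moves x's sign onto y so that _mm_maddubs_epi16, which
// multiplies unsigned by signed, still yields the correct signed products.
static inline int16_t mul_add_epi8_scalar(int8_t x0, int8_t y0, int8_t x1, int8_t y1) {
    return (int16_t)(x0 * y0 + x1 * y1);
}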
@@ -217,26 +239,29 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
     return sum_i16_pairs_float(doth, dotl);
 }
 
-static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-    const __m128i lowByte = _mm_set1_epi16( 0xFF );
-    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
-    __m128i low = _mm_and_si128( lowByte, bytes1 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes1 = _mm_or_si128( low, high );
-    high = _mm_andnot_si128( lowByte, bytes2 );
-    low = _mm_and_si128( lowByte, bytes2 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes2 = _mm_or_si128( low, high );
-
-    return _mm_packus_epi16( bytes1, bytes2);
-}
-
-static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
-    const __m128i ax = _mm_sign_epi8(x, x);
-    const __m128i sy = _mm_sign_epi8(y, x);
-    return _mm_maddubs_epi16(ax, sy);
-}
+// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
+static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
+                                           const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
+    const __m128i mone = _mm_set1_epi16(1);
+
+    const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
+    const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
+    const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
+    const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
+    const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+    const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+    const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+    const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+    const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1);
+    const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1);
+    return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
+}
+
+// quad fp16 delta calculation
+static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
+    // GGML_FP16_TO_FP32 is faster than Intel F16C
+    return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
+                           _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
+}
 #endif
 #elif defined(__SSSE3__)
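In scalar terms, the new mul_sum_i8_quad_float returns eight partial sums: lanes 0-3 cover the first 32-element block and lanes 4-7 the second, so an element-wise multiply by quad_fp16_delta_float's per-half deltas followed by a horizontal sum yields d1*dot(x1,y1) + d2*dot(x2,y2). A minimal sketch, assuming each block is its two 16-byte halves concatenated:

#include <stdint.h>

// Sketch of mul_sum_i8_quad_float: out[0..3] hold partial sums of block 1,
// out[4..7] partial sums of block 2 (each lane covers 8 of the 32 products).
static void mul_sum_i8_quad_scalar(const int8_t x1[32], const int8_t y1[32],
                                   const int8_t x2[32], const int8_t y2[32],
                                   float out[8]) {
    for (int i = 0; i < 4; i++) {
        int32_t s1 = 0, s2 = 0;
        for (int k = 0; k < 4; k++) {
            s1 += x1[4*i + k] * y1[4*i + k] + x1[16 + 4*i + k] * y1[16 + 4*i + k];
            s2 += x2[4*i + k] * y2[4*i + k] + x2[16 + 4*i + k] * y2[16 + 4*i + k];
        }
        out[i]     = (float)s1;
        out[4 + i] = (float)s2;
    }
}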
@@ -2004,10 +2029,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
     sumf = hsum_float_8(acc);
 #elif defined(__AVX__)
-    const __m128i mone = _mm_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
+    __m256 accum = _mm256_setzero_ps();
     for (; ib + 1 < nb; ib += 2) {
         const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -2020,21 +2042,20 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
         const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
         const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
 
         const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
         const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
         const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
         const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+        const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
+        const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
+        const __m256 p = sum_i16_pairs_float(p_2, p_1);
+
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
     }
 
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(accum);
 #elif defined(__SSSE3__)
     // set constants
     const __m128i lowMask = _mm_set1_epi8(0xF);
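For reference, the block dot product that this AVX loop vectorizes two blocks at a time looks roughly like this in scalar form. This is a sketch: the field layout follows ggml's q4_0/q8_0 blocks (16 packed nibble bytes per 32 weights), but the fp16 block deltas are passed here as plain floats for simplicity:

#include <stdint.h>

// Scalar q4_0 x q8_0 dot product over nb blocks (illustrative sketch).
static float vec_dot_q4_0_q8_0_scalar(int nb,
                                      const uint8_t (*xqs)[16], const float *xd,
                                      const int8_t (*yqs)[32], const float *yd) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ib++) {
        int32_t sumi = 0;
        for (int j = 0; j < 16; j++) {
            const int v0 = (xqs[ib][j] & 0x0F) - 8; // low nibble, recentered to [-8, 7]
            const int v1 = (xqs[ib][j] >> 4)   - 8; // high nibble, recentered to [-8, 7]
            sumi += v0 * yqs[ib][j] + v1 * yqs[ib][j + 16];
        }
        sumf += xd[ib] * yd[ib] * (float)sumi; // scale by both block deltas
    }
    return sumf;
}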
@@ -3535,7 +3556,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__) || defined(__AVX__)
+#elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
@@ -3549,14 +3570,29 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         // Multiply q with scale and accumulate
-#if defined(__AVX2__)
         acc = _mm256_fmadd_ps( d, q, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
-#endif
     }
 
     sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    __m256 accum = _mm256_setzero_ps();
+
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs);
+        const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1);
+        const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1);
+        const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
+        const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1);
+        const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+    }
+
+    sumf = hsum_float_8(accum);
 #elif defined(__riscv_v_intrinsic)
     size_t vl = __riscv_vsetvl_e8m1(qk);
@@ -10322,10 +10358,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
 #elif defined __AVX__
     const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
     const __m128i m4b = _mm_set1_epi8(0x0f);
-    const __m128i mone = _mm_set1_epi16(1);
 
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
+    __m256 accum = _mm256_setzero_ps();
     for (; ib + 1 < nb; ib += 2) {
         const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -10338,21 +10372,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
         const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
         const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
-        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
-        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
-        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
-        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+
+        const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
     }
 
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(accum);
 
 #elif defined(__POWER9_VECTOR__)
     const vector signed char lowMask = vec_splats((signed char)0xF);
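The IQ4_NL path differs from Q4_0 only in how a nibble becomes an 8-bit value: instead of subtracting 8, each 4-bit index selects an entry from a 16-value codebook via _mm_shuffle_epi8. A scalar sketch; the table values below are copied from ggml's kvalues_iq4nl and should be verified against the source:

#include <stdint.h>

// 16-entry non-linear codebook; _mm_shuffle_epi8 performs 16 of these
// lookups in parallel, using each nibble as the shuffle index.
static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

static inline int8_t iq4nl_dequant_nibble(uint8_t packed, int high) {
    const uint8_t idx = high ? (uint8_t)(packed >> 4) : (uint8_t)(packed & 0x0F);
    return kvalues_iq4nl[idx];
}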
@@ -1469,8 +1469,12 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
     sumf += (ggml_float)_mm512_reduce_add_ps(c2);
 
 #undef LOAD
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) || defined(__AVX__)
+#if defined(__AVX2__)
 #define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
+#else
+#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
+#endif
     __m256 c1 = _mm256_setzero_ps();
     __m256 c2 = _mm256_setzero_ps();
     __m256 c3 = _mm256_setzero_ps();
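Per element, both LOAD variants perform the standard bf16-to-f32 widening: bf16 is the top half of an IEEE-754 float, so zero-extending to 32 bits and shifting left by 16 recreates the float's bit pattern. A scalar sketch:

#include <stdint.h>
#include <string.h>

// Widen one bf16 value to f32 by restoring the low 16 mantissa bits as zeros.
static inline float bf16_to_f32_scalar(uint16_t bf16) {
    const uint32_t bits = (uint32_t)bf16 << 16;
    float f;
    memcpy(&f, &bits, sizeof(f)); // bit-cast without aliasing UB
    return f;
}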
@@ -1763,7 +1763,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     fp16 = fp16 && vk12_features.shaderFloat16;
 
     std::string device_name = props2.properties.deviceName.data();
-    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+    GGML_LOG_DEBUG("ggml_vulkan: %d = %s (%s) | uma: %d | fp16: %d | warp size: %d\n",
+                   idx, device_name.c_str(), driver_props.driverName, uma, fp16, subgroup_size);
 
     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
         std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
@@ -1821,8 +1822,7 @@ void ggml_vk_instance_init() {
         };
         validation_features.setPNext(nullptr);
         instance_create_info.setPNext(&validation_features);
-
-        std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
+        GGML_LOG_DEBUG("ggml_vulkan: Validation layers enabled\n");
     }
     vk_instance.instance = vk::createInstance(instance_create_info);
 
@@ -1936,8 +1936,8 @@ void ggml_vk_instance_init() {
             vk_instance.device_indices.push_back(0);
         }
     }
-    std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
+    GGML_LOG_DEBUG("ggml_vulkan: Found %d Vulkan devices:\n", vk_instance.device_indices.size());
 
     for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
         ggml_vk_print_gpu_info(i);
 
@@ -49,6 +49,14 @@
 
 #define UNUSED GGML_UNUSED
 
+#if defined(_MSC_VER)
+#define m512bh(p) p
+#define m512i(p) p
+#else
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+#endif
+
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
@@ -19,22 +19,22 @@ logger = logging.getLogger("compare-llama-bench")
 
 # Properties by which to differentiate results per commit:
 KEY_PROPERTIES = [
-    "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
-    "blas", "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "n_threads",
-    "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
+    "cpu_info", "gpu_info", "backends", "n_gpu_layers", "model_filename", "model_type", "n_batch", "n_ubatch",
+    "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", "use_mmap", "no_kv_offload",
+    "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
 ]
 
 # Properties that are boolean and are converted to Yes/No for the table:
-BOOL_PROPERTIES = ["cuda", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
+BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
 
 # Header names for the table:
 PRETTY_NAMES = {
-    "cuda": "CUDA", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
-    "gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
-    "model_size": "Model Size [GiB]", "model_n_params": "Num. of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
-    "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode",
-    "main_gpu": "Main GPU", "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split",
-    "use_mmap": "Use mmap", "embeddings": "Embeddings",
+    "cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
+    "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
+    "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
+    "embeddings": "Embeddings", "cpu_mask": "CPU mask", "cpu_strict": "CPU strict", "poll": "Poll",
+    "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "split_mode": "Split mode", "main_gpu": "Main GPU",
+    "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split", "use_mmap": "Use mmap",
 }
 
 DEFAULT_SHOW = ["model_type"] # Always show these properties by default.
 
@@ -1 +1 @@
-8a3d799484d861748f86eb87c8314fa2dbccc254
+9d0708e863f3aa2fc1eb0b75d433303c30bd0dbc