Merge branch 'ggerganov:master' into vulkan-cmake-preset
This commit is contained in:
commit
a9114b358a
6 changed files with 115 additions and 61 deletions
21
.github/workflows/build.yml
vendored
21
.github/workflows/build.yml
vendored
|
@ -414,6 +414,27 @@ jobs:
|
|||
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
|
||||
cmake --build build2 --config Release -j $(nproc)
|
||||
|
||||
# CI job: builds the project with the MUSA backend enabled (-DGGML_MUSA=ON).
# Runs on ubuntu-22.04 inside the mthreads/musa:rc3.1.0-devel-ubuntu22.04 container,
# checks out the repository, installs the build dependencies with apt-get, then
# configures with CMake and builds the Release configuration using $(nproc) jobs.
ubuntu-22-cmake-musa:
|
||||
runs-on: ubuntu-22.04
|
||||
container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||
|
||||
- name: Build with native CMake MUSA support
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . -DGGML_MUSA=ON
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-cmake-sycl:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
|
|
|
@ -211,10 +211,13 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
|
|||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||
message(STATUS "PowerPC detected")
|
||||
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
|
||||
OUTPUT_VARIABLE POWER10_M)
|
||||
string(FIND ${POWER10_M} "POWER10" substring_index)
|
||||
if(${substring_index} GREATER_EQUAL 0)
|
||||
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
|
||||
string(FIND "${POWER10_M}" "POWER10" substring_index)
|
||||
if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
|
||||
set(substring_index -1)
|
||||
endif()
|
||||
|
||||
if (${substring_index} GREATER_EQUAL 0)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power10)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
|
||||
|
|
|
@ -150,6 +150,28 @@ static inline __m128i packNibbles( __m256i bytes )
|
|||
#endif
|
||||
}
|
||||
#elif defined(__AVX__)
|
||||
// Packs two 128-bit vectors of 16-bit lanes into a single 128-bit vector of bytes.
// In each 16-bit lane the upper byte is shifted right by 4 bits and OR-ed with the
// lower byte, so two nibble-valued bytes end up combined in the lane's low byte.
// The lanes of bytes1 and bytes2 are then narrowed to bytes with unsigned saturation;
// bytes1 supplies the low 8 result bytes and bytes2 the high 8.
// NOTE(review): assumes every input byte holds a value in 0..15 (per the bit layout
// comment below) - confirm against callers.
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
|
||||
{
|
||||
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
|
||||
// Mask selecting the low byte of every 16-bit lane.
const __m128i lowByte = _mm_set1_epi16( 0xFF );
|
||||
// andnot computes (~lowByte & bytes1), i.e. keeps only the upper byte of each lane.
__m128i high = _mm_andnot_si128( lowByte, bytes1 );
|
||||
__m128i low = _mm_and_si128( lowByte, bytes1 );
|
||||
high = _mm_srli_epi16( high, 4 );
|
||||
bytes1 = _mm_or_si128( low, high );
|
||||
// Same transformation for the second input, reusing the temporaries.
high = _mm_andnot_si128( lowByte, bytes2 );
|
||||
low = _mm_and_si128( lowByte, bytes2 );
|
||||
high = _mm_srli_epi16( high, 4 );
|
||||
bytes2 = _mm_or_si128( low, high );
|
||||
|
||||
// Narrow 16-bit lanes to bytes (unsigned saturation) and concatenate both inputs.
return _mm_packus_epi16( bytes1, bytes2);
|
||||
}
|
||||
|
||||
// Multiplies the signed 8-bit elements of x and y element-wise and sums each adjacent
// pair of products, returning eight 16-bit results.
// _mm_maddubs_epi16 takes an unsigned first operand and a signed second operand, so the
// sign of x is moved onto y before the multiply. The pair sums are formed with signed
// saturation by that intrinsic.
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
||||
// Applying x's own sign to x yields its absolute value (usable as the unsigned operand).
const __m128i ax = _mm_sign_epi8(x, x);
|
||||
// y negated where x is negative and zeroed where x is zero, so ax*sy equals x*y.
const __m128i sy = _mm_sign_epi8(y, x);
|
||||
return _mm_maddubs_epi16(ax, sy);
|
||||
}
|
||||
|
||||
// spread 32 bits to 32 bytes { 0x00, 0xFF }
|
||||
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
|
||||
uint32_t x32;
|
||||
|
@ -217,26 +239,29 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
|
|||
return sum_i16_pairs_float(doth, dotl);
|
||||
}
|
||||
|
||||
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
|
||||
{
|
||||
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
|
||||
const __m128i lowByte = _mm_set1_epi16( 0xFF );
|
||||
__m128i high = _mm_andnot_si128( lowByte, bytes1 );
|
||||
__m128i low = _mm_and_si128( lowByte, bytes1 );
|
||||
high = _mm_srli_epi16( high, 4 );
|
||||
bytes1 = _mm_or_si128( low, high );
|
||||
high = _mm_andnot_si128( lowByte, bytes2 );
|
||||
low = _mm_and_si128( lowByte, bytes2 );
|
||||
high = _mm_srli_epi16( high, 4 );
|
||||
bytes2 = _mm_or_si128( low, high );
|
||||
// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
|
||||
static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
|
||||
const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
|
||||
const __m128i mone = _mm_set1_epi16(1);
|
||||
|
||||
return _mm_packus_epi16( bytes1, bytes2);
|
||||
const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
|
||||
const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
|
||||
const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
|
||||
const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
|
||||
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
|
||||
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
|
||||
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
|
||||
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
|
||||
const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1);
|
||||
const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1);
|
||||
return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
|
||||
}
|
||||
|
||||
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
||||
const __m128i ax = _mm_sign_epi8(x, x);
|
||||
const __m128i sy = _mm_sign_epi8(y, x);
|
||||
return _mm_maddubs_epi16(ax, sy);
|
||||
// Builds the 256-bit scale vector for a pair of blocks: the lower four float lanes all
// hold GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0) and the upper four float lanes all
// hold GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1).
// NOTE(review): the parameters are declared float yet are passed through
// GGML_FP16_TO_FP32; the callers visible in this diff pass the blocks' `.d` fields.
// Confirm the intended parameter type against the macro's definition.
static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
|
||||
// GGML_FP16_TO_FP32 is faster than Intel F16C
|
||||
// _mm256_set_m128(hi, lo): the first argument becomes the upper 128 bits.
return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
|
||||
_mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
|
||||
}
|
||||
#endif
|
||||
#elif defined(__SSSE3__)
|
||||
|
@ -2004,10 +2029,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
|
||||
sumf = hsum_float_8(acc);
|
||||
#elif defined(__AVX__)
|
||||
const __m128i mone = _mm_set1_epi16(1);
|
||||
|
||||
__m256 accum1 = _mm256_setzero_ps();
|
||||
__m256 accum2 = _mm256_setzero_ps();
|
||||
__m256 accum = _mm256_setzero_ps();
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
|
||||
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
|
||||
|
@ -2020,21 +2042,20 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
|
||||
const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
|
||||
const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
|
||||
|
||||
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
|
||||
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
|
||||
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
|
||||
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
|
||||
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
|
||||
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
|
||||
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
|
||||
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
|
||||
accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
|
||||
_mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
|
||||
accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
|
||||
_mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
|
||||
const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
|
||||
const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
|
||||
const __m256 p = sum_i16_pairs_float(p_2, p_1);
|
||||
|
||||
const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
|
||||
accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
||||
sumf = hsum_float_8(accum);
|
||||
#elif defined(__SSSE3__)
|
||||
// set constants
|
||||
const __m128i lowMask = _mm_set1_epi8(0xF);
|
||||
|
@ -3535,7 +3556,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
}
|
||||
|
||||
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
#elif defined(__AVX2__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
|
@ -3549,14 +3570,29 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
|
||||
|
||||
// Multiply q with scale and accumulate
|
||||
#if defined(__AVX2__)
|
||||
acc = _mm256_fmadd_ps( d, q, acc );
|
||||
#else
|
||||
acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
|
||||
#endif
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc);
|
||||
#elif defined(__AVX__)
|
||||
__m256 accum = _mm256_setzero_ps();
|
||||
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs);
|
||||
const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1);
|
||||
const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
|
||||
const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1);
|
||||
const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
|
||||
const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1);
|
||||
const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
|
||||
const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
|
||||
|
||||
const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1);
|
||||
const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
|
||||
accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(accum);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
size_t vl = __riscv_vsetvl_e8m1(qk);
|
||||
|
||||
|
@ -10322,10 +10358,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||
#elif defined __AVX__
|
||||
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
|
||||
const __m128i m4b = _mm_set1_epi8(0x0f);
|
||||
const __m128i mone = _mm_set1_epi16(1);
|
||||
|
||||
__m256 accum1 = _mm256_setzero_ps();
|
||||
__m256 accum2 = _mm256_setzero_ps();
|
||||
__m256 accum = _mm256_setzero_ps();
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
|
||||
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
|
||||
|
@ -10338,21 +10372,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||
const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
|
||||
const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
|
||||
const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
|
||||
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
|
||||
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
|
||||
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
|
||||
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
|
||||
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
|
||||
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
|
||||
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
|
||||
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
|
||||
accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
|
||||
_mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
|
||||
accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
|
||||
_mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
|
||||
|
||||
const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
|
||||
const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
|
||||
accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
||||
sumf = hsum_float_8(accum);
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
const vector signed char lowMask = vec_splats((signed char)0xF);
|
||||
|
|
|
@ -1469,8 +1469,12 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
|
|||
sumf += (ggml_float)_mm512_reduce_add_ps(c2);
|
||||
|
||||
#undef LOAD
|
||||
#elif defined(__AVX2__)
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
#if defined(__AVX2__)
|
||||
#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
|
||||
#else
|
||||
#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
|
||||
#endif
|
||||
__m256 c1 = _mm256_setzero_ps();
|
||||
__m256 c2 = _mm256_setzero_ps();
|
||||
__m256 c3 = _mm256_setzero_ps();
|
||||
|
|
|
@ -1763,7 +1763,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
|||
fp16 = fp16 && vk12_features.shaderFloat16;
|
||||
|
||||
std::string device_name = props2.properties.deviceName.data();
|
||||
std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
|
||||
GGML_LOG_DEBUG("ggml_vulkan: %d = %s (%s) | uma: %d | fp16: %d | warp size: %d\n",
|
||||
idx, device_name.c_str(), driver_props.driverName, uma, fp16, subgroup_size);
|
||||
|
||||
if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
|
||||
std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
|
||||
|
@ -1821,8 +1822,7 @@ void ggml_vk_instance_init() {
|
|||
};
|
||||
validation_features.setPNext(nullptr);
|
||||
instance_create_info.setPNext(&validation_features);
|
||||
|
||||
std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
||||
GGML_LOG_DEBUG("ggml_vulkan: Validation layers enabled\n");
|
||||
}
|
||||
vk_instance.instance = vk::createInstance(instance_create_info);
|
||||
|
||||
|
@ -1936,8 +1936,8 @@ void ggml_vk_instance_init() {
|
|||
vk_instance.device_indices.push_back(0);
|
||||
}
|
||||
}
|
||||
GGML_LOG_DEBUG("ggml_vulkan: Found %d Vulkan devices:\n", vk_instance.device_indices.size());
|
||||
|
||||
std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
|
||||
|
||||
for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
|
||||
ggml_vk_print_gpu_info(i);
|
||||
|
|
|
@ -1 +1 @@
|
|||
8a3d799484d861748f86eb87c8314fa2dbccc254
|
||||
9d0708e863f3aa2fc1eb0b75d433303c30bd0dbc
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue