diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 043320d6c..472f4ace1 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -3,6 +3,7 @@
 #include "ggml.h"
 #include "ggml-cuda.h"
 
+#include <cstdint>
 #include <memory>
 
 #if defined(GGML_USE_HIPBLAS)
@@ -280,33 +281,6 @@ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
     return c;
 }
 
-static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
-    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
-    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
-    int tmp1;
-    int tmp2;
-    asm("\n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        "
-        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
-        : "v"(a), "v"(b)
-    );
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
-    return c;
-}
-
 #if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
 // __shfl_xor() for half2 was added in ROCm 5.6
 static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
@@ -479,6 +453,46 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
 }
 #endif // CUDART_VERSION < 12000
 
+static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(RDNA3)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+
+#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    return __dp4a(a, b, c);
+#else // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int8_t * a8 = (const int8_t *) &a;
+    const int8_t * b8 = (const int8_t *) &b;
+    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+}
+
 // TODO: move to ggml-common.h
 static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
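Note on the change above: the AMD-only override of __dp4a is replaced by a backend-neutral helper, ggml_cuda_dp4a, that folds the NVIDIA intrinsic, the AMD builtins/inline assembly, and a scalar fallback behind one name. Its contract is a 4-way signed-byte dot product with accumulate; a minimal host-side reference sketch (dp4a_ref is an illustrative name, not part of the patch):

    #include <cstdint>

    // Interpret a and b as four signed 8-bit lanes and accumulate their
    // products into c -- the same contract as CUDA's __dp4a intrinsic.
    static int dp4a_ref(int a, int b, int c) {
        const int8_t * a8 = (const int8_t *) &a;
        const int8_t * b8 = (const int8_t *) &b;
        for (int i = 0; i < 4; ++i) {
            c += a8[i] * b8[i];
        }
        return c;
    }

    // Example: a = 0x01020304 (lanes 4,3,2,1 on a little-endian host),
    // b = 0x01010101 -> dp4a_ref(a, b, 0) == 4 + 3 + 2 + 1 == 10.

On NVIDIA devices with __CUDA_ARCH__ >= MIN_CC_DP4A the helper compiles to the single __dp4a instruction; everywhere else the compiler sees an equivalent scalar expression.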
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index bd7993595..fdd8f136f 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -72,7 +72,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
     const int u = Q_q8[k_KQ_0/WARP_SIZE];
 
-    const int sumi = __dp4a(v, u, 0);
+    const int sumi = ggml_cuda_dp4a(v, u, 0);
 
 #ifdef FP16_AVAILABLE
     if (std::is_same<T, half>::value) {
@@ -120,7 +120,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
     const int u = Q_q8[k_KQ_0/WARP_SIZE];
 
-    const int sumi = __dp4a(v, u, 0);
+    const int sumi = ggml_cuda_dp4a(v, u, 0);
 
 #ifdef FP16_AVAILABLE
     if (std::is_same<T, half>::value) {
@@ -179,7 +179,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
 
     const int u = Q_q8[k_KQ_0/WARP_SIZE];
 
-    const int sumi = __dp4a(v, u, 0);
+    const int sumi = ggml_cuda_dp4a(v, u, 0);
 
 #ifdef FP16_AVAILABLE
     if (std::is_same<T, half>::value) {
@@ -234,7 +234,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
 
     const int u = Q_q8[k_KQ_0/WARP_SIZE];
 
-    const int sumi = __dp4a(v, u, 0);
+    const int sumi = ggml_cuda_dp4a(v, u, 0);
 
 #ifdef FP16_AVAILABLE
     if (std::is_same<T, half>::value) {
diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh
index 8ec468cb7..be448c0c7 100644
--- a/ggml/src/ggml-cuda/vecdotq.cuh
+++ b/ggml/src/ggml-cuda/vecdotq.cuh
@@ -60,8 +60,8 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
         const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
 
         // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
+        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
+        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
     }
 
     const float2 ds8f = __half22float2(ds8);
@@ -88,8 +88,8 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
         const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
 
         // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
+        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
+        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
     }
 
 #ifdef GGML_CUDA_F16
@@ -126,14 +126,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
         vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
         vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
         vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
 
         int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
         vi1    |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
         vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
         vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
         vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
     }
 
     const float2 ds8f = __half22float2(ds8);
@@ -161,14 +161,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
         vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
         vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
         vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
 
         int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
         vi1    |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
         vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
         vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
         vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
     }
 
 #ifdef GGML_CUDA_F16
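Aside on the q5_0/q5_1 hunks above: the shift chain rebuilds the 5th bit of each quant from vh so that every byte of vi0/vi1 holds a complete 5-bit value before the dp4a. A self-contained sketch of the lower nibble plane (unpack_q5_low is an illustrative name; the first shift, bit 0 to bit 4, sits just above the hunk context and is inferred from the comment pattern):

    // vl packs the low 4 bits of eight quants into two nibble planes; vh
    // carries one extra bit per quant. Bit k of vh must land at bit 8*k + 4,
    // i.e. bit 4 of byte k, to complete the k-th 5-bit value.
    static int unpack_q5_low(int vl, int vh) {
        int vi0 = vl & 0x0F0F0F0F;          // low 4 bits of quants 0..3
        vi0    |= (vh <<  4) & 0x00000010;  // bit 0 -> bit  4 (byte 0)
        vi0    |= (vh << 11) & 0x00001000;  // bit 1 -> bit 12 (byte 1)
        vi0    |= (vh << 18) & 0x00100000;  // bit 2 -> bit 20 (byte 2)
        vi0    |= (vh << 25) & 0x10000000;  // bit 3 -> bit 28 (byte 3)
        return vi0;                         // one 5-bit value per byte
    }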
@@ -202,7 +202,7 @@ template <typename T, int vdr> static __device__ __forceinline__ T vec_dot_q8_0_q8_1_impl(
 #pragma unroll
     for (int i = 0; i < vdr; ++i) {
         // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
+        sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
     }
 
     return d8_0*d8_1 * ((T) sumi);
@@ -220,7 +220,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
 #pragma unroll
     for (int i = 0; i < vdr; ++i) {
         // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
+        sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
     }
 
 #ifdef GGML_CUDA_F16
@@ -259,13 +259,13 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
         const int vi = (v >> (2*i)) & 0x03030303;
 
-        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
+        sumf_d += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
 
         // fill int with 4x m
         int m = sc >> 4;
         m |= m <<  8;
         m |= m << 16;
-        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
+        sumf_m += d8[i] * ggml_cuda_dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
     }
 
     const float2 dm2f = __half22float2(dm2);
@@ -294,8 +294,8 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 #pragma unroll
         for (int i = i0; i < i0 + QI8_1/2; ++i) {
             const int vi = (vi0 >> (2*(i % (QI8_1/2)))) & 0x03030303;
-            sumi_d = __dp4a(vi, u[i], sumi_d); // SIMD dot product
-            sumi_m = __dp4a(0x01010101, u[i], sumi_m);
+            sumi_d = ggml_cuda_dp4a(vi, u[i], sumi_d); // SIMD dot product
+            sumi_m = ggml_cuda_dp4a(0x01010101, u[i], sumi_m);
         }
 
         sumf_d += dm2f.x * sumi_d;
@@ -339,7 +339,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
         const int vi = __vsubss4(vil, vih);
 
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+        sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product
     }
 
     return d3 * sumf;
@@ -363,7 +363,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 #pragma unroll
         for (int i = i0; i < i0 + QI8_1/2; ++i) {
             const int vi = __vsubss4((v[i/2] >> (4*(i%2))) & 0x0F0F0F0F, 0x04040404);
-            sumi_sc = __dp4a(vi, u[i], sumi_sc); // SIMD dot product
+            sumi_sc = ggml_cuda_dp4a(vi, u[i], sumi_sc); // SIMD dot product
         }
 
         sumi += sumi_sc * scales[i0 / (QI8_1/2)];
@@ -392,8 +392,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
         const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
         const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
 
-        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
+        const int dot1 = ggml_cuda_dp4a(v1i, u[2*i+1], ggml_cuda_dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
+        const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+1], ggml_cuda_dp4a(0x01010101, u[2*i+0], 0)); // sum of u
 
         sumf_d += d8[i] * (dot1 * sc[i]);
         sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
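Worth spelling out for the q2_K/q4_K hunks above: a dp4a against the constant 0x01010101 is how the code obtains the plain sum of four q8_1 values, which these formats need because they dequantize with a per-group offset. Roughly (a sketch; dm.x/dm.y stand for the super-block scale and min, as in the dm2f/dm4f pairs elsewhere in this file):

    // x_j = dm.x*sc*q_j - dm.y*m   =>   sum_j x_j*u_j
    //     = dm.x*sc * sum_j q_j*u_j  -  dm.y*m * sum_j u_j
    // dot1 accumulates sum_j q_j*u_j; dot2 yields sum_j u_j because every
    // byte of 0x01010101 is 1:
    const int sum_u = ggml_cuda_dp4a(0x01010101, u, 0); // == u8[0]+u8[1]+u8[2]+u8[3]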
@@ -423,7 +423,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
 #pragma unroll
         for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
+            sumi_d = ggml_cuda_dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
         }
 
         const float2 ds8f = __half22float2(ds8[i]);
@@ -464,8 +464,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
         const int v0i = vl0i | vh0i;
         const int v1i = vl1i | vh1i;
 
-        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
+        const int dot1 = ggml_cuda_dp4a(v0i, u[2*i+0], ggml_cuda_dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
+        const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+0], ggml_cuda_dp4a(0x01010101, u[2*i+1], 0)); // sum of u
 
         sumf_d += d8[i] * (dot1 * sc[i]);
         sumf_m += d8[i] * (dot2 * m[i]);
@@ -496,7 +496,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
 #pragma unroll
         for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+            sumi_d = ggml_cuda_dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
         }
 
         const float2 ds8f = __half22float2(ds8[i]);
@@ -535,7 +535,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
         const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
 
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+        sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product
     }
 
     return d*sumf;
@@ -558,11 +558,11 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
 #pragma unroll
         for (int i = i0; i < i0 + 2; ++i) {
-            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
-            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
+            sumi_d.x = ggml_cuda_dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
+            sumi_d.x = ggml_cuda_dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
 
-            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
-            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
+            sumi_d.y = ggml_cuda_dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
+            sumi_d.y = ggml_cuda_dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
         }
 
         sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
@@ -857,12 +857,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
         const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
         const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
         const int u0 = get_int_b4(bq8_1[iqs/2].qs, k0 + 0);
-        sumi = __dp4a(grid0, u0, sumi);
+        sumi = ggml_cuda_dp4a(grid0, u0, sumi);
 
         const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
         const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
         const int u1 = get_int_b4(bq8_1[iqs/2].qs, k0 + 1);
-        sumi = __dp4a(grid1, u1, sumi);
+        sumi = ggml_cuda_dp4a(grid1, u1, sumi);
     }
 
     const int ls = aux32 >> 28;
@@ -900,11 +900,11 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
         const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
 
         if (l0 < 4) {
-            sumi0 = __dp4a(grid_l, u0, sumi0);
-            sumi0 = __dp4a(grid_h, u1, sumi0);
+            sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0);
+            sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0);
         } else {
-            sumi1 = __dp4a(grid_l, u0, sumi1);
-            sumi1 = __dp4a(grid_h, u1, sumi1);
+            sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1);
+            sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1);
         }
     }
 
     const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
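A note on the iq2/iq3 hunks: the __vsub4(grid ^ signs, signs) pattern applies per-byte sign flips before the dp4a. __vcmpne4 expands each set sign bit to a full 0xFF byte mask, and XOR-then-subtract is the usual branch-free two's-complement negation, applied lane-wise by __vsub4:

    // For a byte x with mask byte s in {0x00, 0xFF}:
    //   s == 0xFF: (x ^ 0xFF) - 0xFF = ~x + 1 = -x   (negated)
    //   s == 0x00: (x ^ 0x00) - 0x00 =  x            (unchanged)
    const int grid_signed = __vsub4(grid ^ signs, signs);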
@@ -950,11 +950,11 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
         const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
 
         if (l0 < 4) {
-            sumi0 = __dp4a(grid_l, u0, sumi0);
-            sumi0 = __dp4a(grid_h, u1, sumi0);
+            sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0);
+            sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0);
         } else {
-            sumi1 = __dp4a(grid_l, u0, sumi1);
-            sumi1 = __dp4a(grid_h, u1, sumi1);
+            sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1);
+            sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1);
         }
     }
 
     const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
@@ -991,8 +991,8 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
         const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
         const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
 
-        sumi = __dp4a(grid_l, u0, sumi);
-        sumi = __dp4a(grid_h, u1, sumi);
+        sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
+        sumi = ggml_cuda_dp4a(grid_h, u1, sumi);
     }
 
     const int ls = aux32 >> 28;
@@ -1036,8 +1036,8 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
         const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
         const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
 
-        sumi = __dp4a(grid_l, u0, sumi);
-        sumi = __dp4a(grid_h, u1, sumi);
+        sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
+        sumi = ggml_cuda_dp4a(grid_h, u1, sumi);
     }
 
     sumi *= 1 + 2*((bq3->scales[iqs/4] >> ((iqs << 1) & 0x04)) & 0x0F);
@@ -1069,8 +1069,8 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
         const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
         const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
 
-        sumi = __dp4a(grid0, u0, sumi);
-        sumi = __dp4a(grid1, u1, sumi);
+        sumi = ggml_cuda_dp4a(grid0, u0, sumi);
+        sumi = ggml_cuda_dp4a(grid1, u1, sumi);
     }
 
     const float d1q = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1);
@@ -1104,13 +1104,13 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
         const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
         const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
 
-        sumi[l0/4] = __dp4a(grid0, u0, sumi[l0/4]);
-        sumi[l0/4] = __dp4a(grid1, u1, sumi[l0/4]);
+        sumi[l0/4] = ggml_cuda_dp4a(grid0, u0, sumi[l0/4]);
+        sumi[l0/4] = ggml_cuda_dp4a(grid1, u1, sumi[l0/4]);
 
         const float delta = -1.0f + IQ1M_DELTA - (qhl & 0x08) * (2.0f*IQ1M_DELTA/0x08);
         int sumy = 0;
-        sumy = __dp4a(u0, 0x01010101, sumy);
-        sumy = __dp4a(u1, 0x01010101, sumy);
+        sumy = ggml_cuda_dp4a(u0, 0x01010101, sumy);
+        sumy = ggml_cuda_dp4a(u1, 0x01010101, sumy);
         sumf[l0/4] += delta*sumy;
     }
 
@@ -1160,8 +1160,8 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
         const int aux_q4 = get_int_b2(bq4->qs, iqs + l);
         const int2 v = get_int_from_table_16(aux_q4);
 
-        sumi = __dp4a(v.x, q8[l + 0], sumi);
-        sumi = __dp4a(v.y, q8[l + 4], sumi);
+        sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
+        sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
     }
 
     const float d = __half2float(bq4->d) * __low2float(bq8_1->ds);
@@ -1187,8 +1187,8 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
         const int u0 = get_int_b4(bq8_1[iqs/4].qs, j + 0);
         const int u1 = get_int_b4(bq8_1[iqs/4].qs, j + 4);
 
-        sumi = __dp4a(v.x, u0, sumi);
-        sumi = __dp4a(v.y, u1, sumi);
+        sumi = ggml_cuda_dp4a(v.x, u0, sumi);
+        sumi = ggml_cuda_dp4a(v.y, u1, sumi);
     }
 
     const int ls = ((bq4->scales_l[iqs/8] >> (iqs & 0x04)) & 0x0F) | (((bq4->scales_h >> (iqs/2)) & 0x03) << 4);
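Since every call site in this patch now goes through the helper, a hypothetical kernel illustrating the calling convention (dot_int8_packed is a sketch for this note, not part of ggml):

    // Each thread accumulates packed 4-element int8 chunks; ggml_cuda_dp4a
    // resolves to __dp4a, an AMD builtin, or the scalar fallback at compile
    // time, so the same source builds on both backends.
    static __global__ void dot_int8_packed(const int * x, const int * y, int * out, const int n_packed) {
        int sumi = 0;
        for (int i = threadIdx.x; i < n_packed; i += blockDim.x) {
            sumi = ggml_cuda_dp4a(x[i], y[i], sumi);
        }
        atomicAdd(out, sumi);
    }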