diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a79e36953..92f9309b7 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -196,6 +196,17 @@ static __device__ __forceinline__ int __vsub4(const int a, const int b) { return __vsubss4(a, b); } +static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast(a); + const uint8x4_t& vb = reinterpret_cast(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast(c); + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0xff : 0x00; + } + return c; +} + static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) c = __builtin_amdgcn_sdot4(a, b, c, false);