NOTE(review): line breaks and indentation were restored from a single-line paste; blank
context lines are placed so each hunk matches its @@ line counts. The template arguments
(reinterpret_cast<...>, std::numeric_limits<...>) were missing from the paste and are
reconstructed here -- confirm them against the pre-image of common.cuh before applying.
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 3554c9202..658c72fdf 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -259,7 +259,6 @@
 #define cublasComputeType_t cudaDataType_t
 
 // XXX: Clang builtins mapping
-#define __vsubss4 __vsubss4_musa
 #define __vsub4 __vsub4_musa
 #define __vcmpeq4 __vcmpeq4_musa
 #define __vcmpne4 __vcmpne4_musa
@@ -372,30 +371,10 @@ typedef float2 dfloat2;
     #define __has_builtin(x) 0
 #endif
 
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4_musa(const int a, const int b) {
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
-    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int &>(c);
-#else
-    int8x4_t c;
-    int16_t tmp;
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-        tmp = va[i] - vb[i];
-        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
-        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
-        c[i] = tmp;
-    }
-    return reinterpret_cast<int &>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
 
 static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
-    return __vsubss4_musa(a, b);
+    return __vsubss4(a, b);
 }
 
 static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {