Add intrinsics polyfills for AMD

---------

Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
This commit is contained in:
Henri Vasserman 2023-08-09 01:56:44 +03:00
parent ab6212864c
commit 4024f91a66
No known key found for this signature in database
GPG key ID: 2995FC0F58B1A986
3 changed files with 38 additions and 17 deletions

View file

@ -379,7 +379,6 @@ if (LLAMA_HIPBLAS)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)

View file

@ -302,7 +302,6 @@ ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV
ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(CXX) $(CXXFLAGS) -x hip -c -o $@ $< $(CXX) $(CXXFLAGS) -x hip -c -o $@ $<

View file

@ -75,6 +75,29 @@
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
return reinterpret_cast<const int&>(c);
}
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
c = __builtin_amdgcn_sdot4(a, b, c, false);
#else
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
return c;
}
#endif
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
@ -1396,8 +1419,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
return; return;
} }
y[ib].ds.x = d; reinterpret_cast<half&>(y[ib].ds.x) = d;
y[ib].ds.y = sum; reinterpret_cast<half&>(y[ib].ds.y) = sum;
} }
template <int qk, int qr, dequantize_kernel_t dequantize_kernel> template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@ -1609,8 +1632,8 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
#else #else
const float2 dm8f = __half22float2(dm8); const float2 dm8f = __half22float2(dm8);
const float2 ds8f = __half22float2(ds8); const float2 ds8f = __half22float2(ds8);
const float d8d8 = dm8.x * ds8.x; const float d8d8 = __low2float(dm8) * __low2float(ds8);
const float m8s8 = dm8.y * ds8.y; const float m8s8 = __high2float(dm8) * __high2float(ds8);
#endif // GGML_CUDA_F16 #endif // GGML_CUDA_F16
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
@ -2380,7 +2403,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
} }
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x); return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
} }
static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@ -2478,7 +2501,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
#pragma unroll #pragma unroll
for (int i = 0; i < QR2_K; ++ i) { for (int i = 0; i < QR2_K; ++ i) {
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
d8[i] = bq8_1[bq8_offset + i].ds.x; d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
} }
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@ -2605,7 +2628,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
#pragma unroll #pragma unroll
for (int i = 0; i < QR3_K; ++i) { for (int i = 0; i < QR3_K; ++i) {
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
d8[i] = bq8_1[bq8_offset + i].ds.x; d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
} }
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@ -2782,7 +2805,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
for (int i = 0; i < QR4_K; ++i) { for (int i = 0; i < QR4_K; ++i) {
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
d8[i] = bq8i->ds.x; d8[i] = __low2half(bq8i->ds);
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
u[2*i+0] = q8[0]; u[2*i+0] = q8[0];
@ -2809,8 +2832,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
const float dall = bq4_K->d[0]; const float dall = bq4_K->d[0];
const float dmin = bq4_K->d[1]; const float dmin = bq4_K->d[1];
const float d8_1 = bq8_1[0].ds.x; const float d8_1 = __low2float(bq8_1[0].ds);
const float d8_2 = bq8_1[1].ds.x; const float d8_2 = __low2float(bq8_1[1].ds);
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@ -2977,7 +3000,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
#pragma unroll #pragma unroll
for (int i = 0; i < QR5_K; ++i) { for (int i = 0; i < QR5_K; ++i) {
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
d8[i] = bq8i->ds.x; d8[i] = __low2float(bq8i->ds);
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
u[2*i+0] = q8[0]; u[2*i+0] = q8[0];
@ -2995,8 +3018,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
const float d = bq5_K->d; const float d = bq5_K->d;
const float d8_1 = bq8_1[0].ds.x; const float d8_1 = __low2half(bq8_1[0].ds);
const float d8_2 = bq8_1[1].ds.x; const float d8_2 = __low2half(bq8_1[1].ds);
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@ -3157,7 +3180,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
#pragma unroll #pragma unroll
for (int i = 0; i < QR6_K; ++i) { for (int i = 0; i < QR6_K; ++i) {
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
d8[i] = bq8_1[bq8_offset + 2*i].ds.x; d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
} }
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@ -3336,7 +3359,7 @@ static __global__ void mul_mat_q(
*dsi_dst = *dsi_src; *dsi_dst = *dsi_src;
} else { } else {
float * dfi_dst = (float *) dsi_dst; float * dfi_dst = (float *) dsi_dst;
*dfi_dst = (*dsi_src).x; *dfi_dst = __low2half(*dsi_src);
} }
} }