Add intrinsics polyfills for AMD
---------
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
This commit is contained in:
parent
ab6212864c
commit
4024f91a66
3 changed files with 38 additions and 17 deletions
|
@@ -379,7 +379,6 @@ if (LLAMA_HIPBLAS)
|
|||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
|
||||
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
|
||||
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||
|
||||
|
|
1
Makefile
1
Makefile
|
@@ -302,7 +302,6 @@ ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
|
|||
ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
||||
ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
|
||||
ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
|
||||
ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||
ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
$(CXX) $(CXXFLAGS) -x hip -c -o $@ $<
|
||||
|
|
53
ggml-cuda.cu
53
ggml-cuda.cu
|
@@ -75,6 +75,29 @@
|
|||
|
||||
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
||||
// HIP polyfill for CUDA's __vsubss4: lane-wise signed *saturating*
// subtraction of the four int8 values packed into each 32-bit operand.
// Relies on the file-local int8x4_t ext_vector typedef and clang's
// __builtin_elementwise_sub_sat to get per-byte saturation.
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
    const int8x4_t lhs  = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t rhs  = reinterpret_cast<const int8x4_t&>(b);
    const int8x4_t diff = __builtin_elementwise_sub_sat(lhs, rhs);
    return reinterpret_cast<const int&>(diff);
}
|
||||
|
||||
// HIP polyfill for CUDA's __dp4a: byte-wise dot product of the four int8
// lanes packed in a and b, accumulated into c (returns c + sum(a[k]*b[k])).
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
    // These targets provide a native packed-int8 dot instruction.
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#else
    // Generic fallback: unpack both operands into int8 lanes and
    // accumulate the four products manually.
    const int8x4_t x4 = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t y4 = reinterpret_cast<const int8x4_t&>(b);
    c += x4[0] * y4[0];
    c += x4[1] * y4[1];
    c += x4[2] * y4[2];
    c += x4[3] * y4[3];
#endif
    return c;
}
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
@@ -1396,8 +1419,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
|||
return;
|
||||
}
|
||||
|
||||
y[ib].ds.x = d;
|
||||
y[ib].ds.y = sum;
|
||||
reinterpret_cast<half&>(y[ib].ds.x) = d;
|
||||
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
||||
}
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
||||
|
@@ -1609,8 +1632,8 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
|
|||
#else
|
||||
const float2 dm8f = __half22float2(dm8);
|
||||
const float2 ds8f = __half22float2(ds8);
|
||||
const float d8d8 = dm8.x * ds8.x;
|
||||
const float m8s8 = dm8.y * ds8.y;
|
||||
const float d8d8 = __low2float(dm8) * __low2float(ds8);
|
||||
const float m8s8 = __high2float(dm8) * __high2float(ds8);
|
||||
#endif // GGML_CUDA_F16
|
||||
|
||||
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
|
||||
|
@@ -2380,7 +2403,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
|||
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
||||
}
|
||||
|
||||
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
|
||||
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
||||
|
@@ -2478,7 +2501,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
|||
#pragma unroll
|
||||
for (int i = 0; i < QR2_K; ++ i) {
|
||||
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
||||
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
||||
d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
|
||||
}
|
||||
|
||||
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
|
||||
|
@@ -2605,7 +2628,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
|||
#pragma unroll
|
||||
for (int i = 0; i < QR3_K; ++i) {
|
||||
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
||||
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
||||
d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
|
||||
}
|
||||
|
||||
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
||||
|
@@ -2782,7 +2805,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|||
|
||||
for (int i = 0; i < QR4_K; ++i) {
|
||||
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
||||
d8[i] = bq8i->ds.x;
|
||||
d8[i] = __low2half(bq8i->ds);
|
||||
|
||||
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
||||
u[2*i+0] = q8[0];
|
||||
|
@@ -2809,8 +2832,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|||
const float dall = bq4_K->d[0];
|
||||
const float dmin = bq4_K->d[1];
|
||||
|
||||
const float d8_1 = bq8_1[0].ds.x;
|
||||
const float d8_2 = bq8_1[1].ds.x;
|
||||
const float d8_1 = __low2float(bq8_1[0].ds);
|
||||
const float d8_2 = __low2float(bq8_1[1].ds);
|
||||
|
||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
||||
|
@@ -2977,7 +3000,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|||
#pragma unroll
|
||||
for (int i = 0; i < QR5_K; ++i) {
|
||||
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
||||
d8[i] = bq8i->ds.x;
|
||||
d8[i] = __low2float(bq8i->ds);
|
||||
|
||||
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
||||
u[2*i+0] = q8[0];
|
||||
|
@@ -2995,8 +3018,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|||
|
||||
const float d = bq5_K->d;
|
||||
|
||||
const float d8_1 = bq8_1[0].ds.x;
|
||||
const float d8_2 = bq8_1[1].ds.x;
|
||||
const float d8_1 = __low2half(bq8_1[0].ds);
|
||||
const float d8_2 = __low2half(bq8_1[1].ds);
|
||||
|
||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
||||
|
@@ -3157,7 +3180,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
|||
#pragma unroll
|
||||
for (int i = 0; i < QR6_K; ++i) {
|
||||
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
||||
d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
|
||||
d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
|
||||
}
|
||||
|
||||
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
||||
|
@@ -3336,7 +3359,7 @@ static __global__ void mul_mat_q(
|
|||
*dsi_dst = *dsi_src;
|
||||
} else {
|
||||
float * dfi_dst = (float *) dsi_dst;
|
||||
*dfi_dst = (*dsi_src).x;
|
||||
*dfi_dst = __low2half(*dsi_src);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue