diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 08aef929a..5a92185c0 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -1,7 +1,3 @@ -#define DPCT_PROFILING_ENABLED -#define DPCT_COMPAT_RT_VERSION 12010 -#include -#include #include #include #include @@ -13,113 +9,21 @@ #include #include #include +#include + + +#include +#include +#include #include - -#if defined(GGML_USE_HIPBLAS) -#include -#include -#include -#ifdef __HIP_PLATFORM_AMD__ -// for rocblas_initialize() -#include "rocblas/rocblas.h" -#endif // __HIP_PLATFORM_AMD__ -#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_TF32_TENSOR_OP_MATH 0 -#define CUDA_R_16F HIPBLAS_R_16F -#define CUDA_R_32F HIPBLAS_R_32F -#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) -#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 -#define cublasCreate hipblasCreate -#define cublasGemmEx hipblasGemmEx -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cublasHandle_t hipblasHandle_t -#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS -#define cublasSetStream hipblasSetStream -#define cublasSgemm hipblasSgemm -#define cublasStatus_t hipblasStatus_t -#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 -#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer -#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess -#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess -#define cudaDeviceProp hipDeviceProp_t -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaError_t hipError_t -#define cudaEventCreateWithFlags hipEventCreateWithFlags -#define cudaEventDisableTiming hipEventDisableTiming -#define cudaEventRecord hipEventRecord -#define cudaEvent_t hipEvent_t -#define cudaEventDestroy hipEventDestroy -#define cudaFree hipFree -#define cudaFreeHost hipHostFree -#define cudaGetDevice hipGetDevice -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaGetDeviceProperties hipGetDeviceProperties -#define cudaGetErrorString hipGetErrorString -#define cudaGetLastError hipGetLastError -#ifdef GGML_HIP_UMA -#define cudaMalloc hipMallocManaged -#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size) -#else -#define cudaMalloc hipMalloc -#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) -#endif -#define cudaMemcpy hipMemcpy -#define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemcpyKind hipMemcpyKind -#define cudaMemset hipMemset -#define cudaMemsetAsync hipMemsetAsync -#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize -#define cudaSetDevice hipSetDevice -#define cudaStreamCreateWithFlags hipStreamCreateWithFlags -#define cudaStreamFireAndForget hipStreamFireAndForget -#define cudaStreamNonBlocking hipStreamNonBlocking -#define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) -#define cudaStream_t hipStream_t -#define cudaSuccess hipSuccess -#define __trap abort -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED -#else - -#if DPCT_COMPAT_RT_VERSION < 11020 -#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH -#define CUBLAS_COMPUTE_16F CUDA_R_16F -#define CUBLAS_COMPUTE_32F CUDA_R_32F -#define cublasComputeType_t cudaDataType_t -#endif // CUDART_VERSION < 11020 - -#endif // defined(GGML_USE_HIPBLAS) +#include #include "ggml-cuda.h" #include "ggml.h" #include "ggml-backend-impl.h" -#include - -#include static int g_ggml_sycl_debug=0; -//#define GGML_SYCL_DEBUG(...) (if(g_ggml_sycl_debug) printf(__VA_ARGS__)) #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0) #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products @@ -147,71 +51,6 @@ static int g_ggml_sycl_debug=0; // max batch size to use MMQ kernels when tensor cores are available #define MMQ_MAX_BATCH_SIZE 32 -#if defined(GGML_USE_HIPBLAS) -#define __CUDA_ARCH__ 1300 - -#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ - defined(__gfx1150__) || defined(__gfx1151__) -#define RDNA3 -#endif - -#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ - defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) -#define RDNA2 -#endif - -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); -static __device__ __forceinline__ int __vsubss4(const int a, const int b) { - const int8x4_t va = reinterpret_cast(a); - const int8x4_t vb = reinterpret_cast(b); -#if __has_builtin(__builtin_elementwise_sub_sat) - const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); - return reinterpret_cast(c); -#else - int8x4_t c; - int16_t tmp; -#pragma unroll - for (int i = 0; i < 4; i++) { - tmp = va[i] - vb[i]; - if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); - if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); - c[i] = tmp; - } - return reinterpret_cast(c); -#endif // __has_builtin(__builtin_elementwise_sub_sat) -} - -static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { -#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) - c = __builtin_amdgcn_sdot4(a, b, c, false); -#elif defined(__gfx1100__) - c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); -#elif defined(__gfx1010__) || defined(__gfx900__) - int tmp1; - int tmp2; - asm("\n \ - v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ - v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ - v_add3_u32 %0, %1, %2, %0 \n \ - v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ - v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ - v_add3_u32 %0, %1, %2, %0 \n \ - " - : "+v"(c), "=&v"(tmp1), "=&v"(tmp2) - : "v"(a), "v"(b) - ); -#else - const int8x4_t va = reinterpret_cast(a); - const int8x4_t vb = reinterpret_cast(b); - c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; -#endif - return c; -} -#endif // defined(GGML_USE_HIPBLAS) #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -257,7 +96,7 @@ You need to rewrite this code. ggml_cuda_error(#err, __func__, __FILE__, __LINE__, \ cublas_get_error_str(err_)); } while (0) -#if !defined(GGML_USE_HIPBLAS) + static const char *cu_get_error_str(int err) { const char * err_str; /* @@ -276,7 +115,7 @@ DPCT1000:68: Error handling if-stmt was detected but could not be rewritten. do { auto err_ = (err); \ if (err_ != 0) ggml_cuda_error(#err, __func__, __FILE__, __LINE__, \ cu_get_error_str(err_)); } while (0) -#endif + #if DPCT_COMPAT_RT_VERSION >= 11100 #define GGML_CUDA_ASSUME(x) __builtin_assume(x) @@ -284,13 +123,14 @@ DPCT1000:68: Error handling if-stmt was detected but could not be rewritten. #define GGML_CUDA_ASSUME(x) #endif // CUDART_VERSION >= 11100 -#ifdef GGML_CUDA_F16 -typedef half dfloat; // dequantize float -typedef half2 dfloat2; +#ifdef GGML_SYCL_F16 +typedef sycl::half dfloat; // dequantize float +typedef sycl::half2 dfloat2; #else typedef float dfloat; // dequantize float typedef sycl::float2 dfloat2; -#endif //GGML_CUDA_F16 +#endif //GGML_SYCL_F16 + static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) { const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment @@ -1080,13 +920,16 @@ static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib, v.x() = vui & 0xF; v.y() = vui >> 4; -#ifdef GGML_CUDA_F16 - v = __hsub2(v, {8.0f, 8.0f}); - v = __hmul2(v, {d, d}); +#ifdef GGML_SYCL_F16 + // v = v - {8.0f, 8.0f}; + // v = v * {d, d}; + v.s0() = (v.s0() - 8.0f) * d; + v.s1() = (v.s1() - 8.0f) * d; + #else v.x() = (v.x() - 8.0f) * d; v.y() = (v.y() - 8.0f) * d; -#endif // GGML_CUDA_F16 +#endif // GGML_SYCL_F16 } static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib, @@ -1101,13 +944,16 @@ static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib, v.x() = vui & 0xF; v.y() = vui >> 4; -#ifdef GGML_CUDA_F16 - v = __hmul2(v, {d, d}); - v = __hadd2(v, {m, m}); +#ifdef GGML_SYCL_F16 + // v = v * {d, d}; + // v = v + {m, m}; + v.s0() = (v.s0() * d) + m; + v.s1() = (v.s1() * d) + m; + #else v.x() = (v.x() * d) + m; v.y() = (v.y() * d) + m; -#endif // GGML_CUDA_F16 +#endif // GGML_SYCL_F16 } static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib, @@ -1125,13 +971,16 @@ static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib, v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); -#ifdef GGML_CUDA_F16 - v = __hsub2(v, {16.0f, 16.0f}); - v = __hmul2(v, {d, d}); +#ifdef GGML_SYCL_F16 + // v = v - {16.0f, 16.0f}; + // v = v * {d, d}; + v.s0() = (v.s0() - 16.0f) * d; + v.s1() = (v.s1() - 16.0f) * d; + #else v.x() = (v.x() - 16.0f) * d; v.y() = (v.y() - 16.0f) * d; -#endif // GGML_CUDA_F16 +#endif // GGML_SYCL_F16 } static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib, @@ -1150,13 +999,15 @@ static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib, v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); -#ifdef GGML_CUDA_F16 - v = __hmul2(v, {d, d}); - v = __hadd2(v, {m, m}); +#ifdef GGML_SYCL_F16 + // v = v * {d, d}; + // v = v + {m, m}; + v.s0() = (v.s0() * d) + m; + v.s1() = (v.s1() * d) + m; #else v.x() = (v.x() * d) + m; v.y() = (v.y() * d) + m; -#endif // GGML_CUDA_F16 +#endif // GGML_SYCL_F16 } static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib, @@ -1168,12 +1019,14 @@ static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib, v.x() = x[ib].qs[iqs + 0]; v.y() = x[ib].qs[iqs + 1]; -#ifdef GGML_CUDA_F16 - v = __hmul2(v, {d, d}); +#ifdef GGML_SYCL_F16 + // v = v * {d, d}; + v.s0() *= d; + v.s1() *= d; #else v.x() *= d; v.y() *= d; -#endif // GGML_CUDA_F16 +#endif // GGML_SYCL_F16 } //================================== k-quants @@ -2227,11 +2080,7 @@ template static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u, const float &d4, const sycl::half2 &ds8) { - -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; - #pragma unroll for (int i = 0; i < vdr; ++i) { const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; @@ -2247,9 +2096,6 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u, // second part effectively subtracts 8 from each quant value return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y()); -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q4_1_Q8_1_MMVQ 2 @@ -2260,8 +2106,6 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm4, const sycl::half2 &ds8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll @@ -2274,10 +2118,11 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u, sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); } -#ifdef GGML_CUDA_F16 - const float2 tmp = __half22float2(__hmul2(dm4, ds8)); - const float d4d8 = tmp.x; - const float m4s8 = tmp.y; +#ifdef GGML_SYCL_F16 + const sycl::float2 tmp = + (dm4 * ds8).convert(); + const float d4d8 = tmp.x(); + const float m4s8 = tmp.y(); #else const sycl::float2 dm4f = dm4.convert(); @@ -2285,13 +2130,10 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u, ds8.convert(); const float d4d8 = dm4f.x() * ds8f.x(); const float m4s8 = dm4f.y() * ds8f.y(); -#endif // GGML_CUDA_F16 +#endif // GGML_SYCL_F16 // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q5_0_Q8_1_MMVQ 2 @@ -2301,9 +2143,6 @@ template static __dpct_inline__ float vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u, const float &d5, const sycl::half2 &ds8) { - -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll @@ -2330,9 +2169,6 @@ vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u, // second part effectively subtracts 16 from each quant value return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y()); -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q5_1_Q8_1_MMVQ 2 @@ -2343,8 +2179,6 @@ static __dpct_inline__ float vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u, const sycl::half2 &dm5, const sycl::half2 &ds8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll @@ -2366,10 +2200,13 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u, sumi); // SIMD dot product of quantized values } -#ifdef GGML_CUDA_F16 - const float2 tmp = __half22float2(__hmul2(dm5, ds8)); - const float d5d8 = tmp.x; - const float m5s8 = tmp.y; +#ifdef GGML_SYCL_F16 + const sycl::float2 tmp = + (dm5 * ds8).convert(); + const float d5d8 = tmp.x(); + const float m5s8 = tmp.y(); + + #else const sycl::float2 dm5f = dm5.convert(); @@ -2377,14 +2214,10 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u, ds8.convert(); const float d5d8 = dm5f.x() * ds8f.x(); const float m5s8 = dm5f.y() * ds8f.y(); -#endif // GGML_CUDA_F16 +#endif // GGML_SYCL_F16 // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it return sumi*d5d8 + m5s8 / (QI5_1 / vdr); - -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q8_0_Q8_1_MMVQ 2 @@ -2395,8 +2228,6 @@ static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u, const float &d8_0, const float &d8_1) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll @@ -2406,9 +2237,6 @@ static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u, } return d8_0*d8_1 * sumi; -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template @@ -2416,8 +2244,6 @@ static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm8, const sycl::half2 &ds8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll @@ -2426,10 +2252,11 @@ static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u, sumi = dpct::dp4a(v[i], u[i], sumi); } -#ifdef GGML_CUDA_F16 - const float2 tmp = __half22float2(__hmul2(dm8, ds8)); - const float d8d8 = tmp.x; - const float m8s8 = tmp.y; +#ifdef GGML_SYCL_F16 + const sycl::float2 tmp = + (dm8 * ds8).convert(); + const float d8d8 = tmp.x(); + const float m8s8 = tmp.y(); #else const sycl::float2 dm8f = dm8.convert(); @@ -2437,13 +2264,10 @@ static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u, ds8.convert(); const float d8d8 = dm8f.x() * ds8f.x(); const float m8s8 = dm8f.y() * ds8f.y(); -#endif // GGML_CUDA_F16 +#endif // GGML_SYCL_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it return sumi*d8d8 + m8s8 / (QI8_1 / vdr); -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q2_K_Q8_1_MMVQ 1 @@ -2454,8 +2278,6 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq( const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales, const sycl::half2 &dm2, const float *__restrict__ d8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -2482,9 +2304,6 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq( dm2.convert(); return dm2f.x() * sumf_d - dm2f.y() * sumf_m; -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } // contiguous u/y values @@ -2493,8 +2312,6 @@ vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, const uint8_t *__restrict__ scales, const sycl::half2 &dm2, const float &d8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi_d = 0; int sumi_m = 0; @@ -2523,9 +2340,6 @@ vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, dm2.convert(); return d8 * (dm2f.x() * sumi_d - dm2f.y() * sumi_m); -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q3_K_Q8_1_MMVQ 1 @@ -2537,8 +2351,6 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq( const uint8_t *__restrict__ scales, const int &scale_offset, const float &d3, const float *__restrict__ d8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf = 0.0f; #pragma unroll @@ -2566,9 +2378,6 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq( } return d3 * sumf; -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } // contiguous u/y values @@ -2577,8 +2386,6 @@ vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, const int8_t *__restrict__ scales, const float &d3, const float &d8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; #pragma unroll @@ -2593,9 +2400,6 @@ vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, } return d3*d8 * sumi; -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q4_K_Q8_1_MMVQ 2 @@ -2607,8 +2411,6 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq( const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, const sycl::half2 &dm4, const float *__restrict__ d8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -2632,10 +2434,6 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq( dm4.convert(); return dm4f.x() * sumf_d - dm4f.y() * sumf_m; - -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } // contiguous u/y values @@ -2644,8 +2442,6 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq( const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -2670,10 +2466,6 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq( dm4.convert(); return dm4f.x() * sumf_d - dm4f.y() * sumf_m; - -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q5_K_Q8_1_MMVQ 2 @@ -2686,8 +2478,6 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq( const uint8_t *__restrict__ m, const sycl::half2 &dm5, const float *__restrict__ d8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -2718,10 +2508,6 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq( dm5.convert(); return dm5f.x() * sumf_d - dm5f.y() * sumf_m; - -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } // contiguous u/y values @@ -2730,8 +2516,6 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq( const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -2756,10 +2540,6 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq( dm4.convert(); return dm4f.x() * sumf_d - dm4f.y() * sumf_m; - -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } #define VDR_Q6_K_Q8_1_MMVQ 1 @@ -2772,8 +2552,6 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh, const int8_t *__restrict__ scales, const float &d, const float *__restrict__ d8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf = 0.0f; #pragma unroll @@ -2791,9 +2569,6 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh, } return d*sumf; -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } // contiguous u/y values @@ -2802,8 +2577,6 @@ vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, const int8_t *__restrict__ sc, const float &d6, const float *__restrict__ d8) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics float sumf_d = 0.0f; #pragma unroll @@ -2828,10 +2601,6 @@ vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, } return d6 * sumf_d; - -#else - bad_arch(); -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } static __dpct_inline__ float @@ -4345,11 +4114,6 @@ mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy, #define NWARPS_Q4_0_PASCAL 8 template static void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q4_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, @@ -4360,24 +4124,8 @@ template static void int * tile_x_qh = nullptr; int * tile_x_sc = nullptr; -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q4_0_RDNA2; - const int mmq_y = MMQ_Y_Q4_0_RDNA2; - const int nwarps = NWARPS_Q4_0_RDNA2; -#else - const int mmq_x = MMQ_X_Q4_0_RDNA1; - const int mmq_y = MMQ_Y_Q4_0_RDNA1; - const int nwarps = NWARPS_Q4_0_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) +//sycl_todo: change according to hardware - allocate_tiles_q4_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - - mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); - -#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA const int mmq_x = MMQ_X_Q4_0_AMPERE; const int mmq_y = MMQ_Y_Q4_0_AMPERE; const int nwarps = NWARPS_Q4_0_AMPERE; @@ -4388,20 +4136,6 @@ template static void vec_dot_q4_0_q8_1_mul_mat>( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); - -#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q4_0_PASCAL; - const int mmq_y = MMQ_Y_Q4_0_PASCAL; - const int nwarps = NWARPS_Q4_0_PASCAL; - - allocate_tiles_q4_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); -#else - (void) vec_dot_q4_0_q8_1_mul_mat; - bad_arch(); -#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q4_1_RDNA2 64 @@ -4424,13 +4158,6 @@ template static void #define NWARPS_Q4_1_PASCAL 8 template static void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA - __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) -#endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q4_1( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, @@ -4441,23 +4168,7 @@ template static void int * tile_x_qh = nullptr; int * tile_x_sc = nullptr; -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q4_1_RDNA2; - const int mmq_y = MMQ_Y_Q4_1_RDNA2; - const int nwarps = NWARPS_Q4_1_RDNA2; -#else - const int mmq_x = MMQ_X_Q4_1_RDNA1; - const int mmq_y = MMQ_Y_Q4_1_RDNA1; - const int nwarps = NWARPS_Q4_1_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - - allocate_tiles_q4_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); - -#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA +//sycl_todo: change according to hardware const int mmq_x = MMQ_X_Q4_1_AMPERE; const int mmq_y = MMQ_Y_Q4_1_AMPERE; const int nwarps = NWARPS_Q4_1_AMPERE; @@ -4468,19 +4179,6 @@ template static void vec_dot_q4_1_q8_1_mul_mat>( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); - -#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q4_1_PASCAL; - const int mmq_y = MMQ_Y_Q4_1_PASCAL; - const int nwarps = NWARPS_Q4_1_PASCAL; - allocate_tiles_q4_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); -#else - (void) vec_dot_q4_1_q8_1_mul_mat; - bad_arch(); -#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q5_0_RDNA2 64 @@ -4503,11 +4201,6 @@ template static void #define NWARPS_Q5_0_PASCAL 8 template static void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q5_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, @@ -4518,22 +4211,7 @@ template static void int * tile_x_qh = nullptr; int * tile_x_sc = nullptr; -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q5_0_RDNA2; - const int mmq_y = MMQ_Y_Q5_0_RDNA2; - const int nwarps = NWARPS_Q5_0_RDNA2; -#else - const int mmq_x = MMQ_X_Q5_0_RDNA1; - const int mmq_y = MMQ_Y_Q5_0_RDNA1; - const int nwarps = NWARPS_Q5_0_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - allocate_tiles_q5_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); - -#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA +//sycl_todo: change according to hardware const int mmq_x = MMQ_X_Q5_0_AMPERE; const int mmq_y = MMQ_Y_Q5_0_AMPERE; const int nwarps = NWARPS_Q5_0_AMPERE; @@ -4544,19 +4222,6 @@ template static void vec_dot_q5_0_q8_1_mul_mat>( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); - -#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q5_0_PASCAL; - const int mmq_y = MMQ_Y_Q5_0_PASCAL; - const int nwarps = NWARPS_Q5_0_PASCAL; - allocate_tiles_q5_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); -#else - (void) vec_dot_q5_0_q8_1_mul_mat; - bad_arch(); -#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q5_1_RDNA2 64 @@ -4579,11 +4244,6 @@ template static void #define NWARPS_Q5_1_PASCAL 8 template static void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q5_1( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, @@ -4594,22 +4254,7 @@ mul_mat_q5_1( int * tile_x_qh = nullptr; int * tile_x_sc = nullptr; -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q5_1_RDNA2; - const int mmq_y = MMQ_Y_Q5_1_RDNA2; - const int nwarps = NWARPS_Q5_1_RDNA2; -#else - const int mmq_x = MMQ_X_Q5_1_RDNA1; - const int mmq_y = MMQ_Y_Q5_1_RDNA1; - const int nwarps = NWARPS_Q5_1_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - allocate_tiles_q5_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); - -#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA +//sycl_todo: change according to hardware const int mmq_x = MMQ_X_Q5_1_AMPERE; const int mmq_y = MMQ_Y_Q5_1_AMPERE; const int nwarps = NWARPS_Q5_1_AMPERE; @@ -4620,19 +4265,6 @@ mul_mat_q5_1( vec_dot_q5_1_q8_1_mul_mat>( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); - -#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q5_1_PASCAL; - const int mmq_y = MMQ_Y_Q5_1_PASCAL; - const int nwarps = NWARPS_Q5_1_PASCAL; - allocate_tiles_q5_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); -#else - (void) vec_dot_q5_1_q8_1_mul_mat; - bad_arch(); -#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q8_0_RDNA2 64 @@ -4655,11 +4287,6 @@ mul_mat_q5_1( #define NWARPS_Q8_0_PASCAL 8 template static void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q8_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, @@ -4670,22 +4297,7 @@ template static void int * tile_x_qh = nullptr; int * tile_x_sc = nullptr; -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q8_0_RDNA2; - const int mmq_y = MMQ_Y_Q8_0_RDNA2; - const int nwarps = NWARPS_Q8_0_RDNA2; -#else - const int mmq_x = MMQ_X_Q8_0_RDNA1; - const int mmq_y = MMQ_Y_Q8_0_RDNA1; - const int nwarps = NWARPS_Q8_0_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - allocate_tiles_q8_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); - -#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA +//sycl_todo: change according to hardware const int mmq_x = MMQ_X_Q8_0_AMPERE; const int mmq_y = MMQ_Y_Q8_0_AMPERE; const int nwarps = NWARPS_Q8_0_AMPERE; @@ -4696,19 +4308,6 @@ template static void vec_dot_q8_0_q8_1_mul_mat>( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); - -#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q8_0_PASCAL; - const int mmq_y = MMQ_Y_Q8_0_PASCAL; - const int nwarps = NWARPS_Q8_0_PASCAL; - allocate_tiles_q8_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); -#else - (void) vec_dot_q8_0_q8_1_mul_mat; - bad_arch(); -#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q2_K_RDNA2 64 @@ -4731,11 +4330,6 @@ template static void #define NWARPS_Q2_K_PASCAL 8 template static void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q2_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, @@ -4747,22 +4341,7 @@ mul_mat_q2_K( int * tile_x_qh = nullptr; int * tile_x_sc = nullptr; -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q2_K_RDNA2; - const int mmq_y = MMQ_Y_Q2_K_RDNA2; - const int nwarps = NWARPS_Q2_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q2_K_RDNA1; - const int mmq_y = MMQ_Y_Q2_K_RDNA1; - const int nwarps = NWARPS_Q2_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - allocate_tiles_q2_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); - -#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA +//sycl_todo: change according to hardware const int mmq_x = MMQ_X_Q2_K_AMPERE; const int mmq_y = MMQ_Y_Q2_K_AMPERE; const int nwarps = NWARPS_Q2_K_AMPERE; @@ -4773,19 +4352,6 @@ mul_mat_q2_K( vec_dot_q2_K_q8_1_mul_mat>( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); - -#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q2_K_PASCAL; - const int mmq_y = MMQ_Y_Q2_K_PASCAL; - const int nwarps = NWARPS_Q2_K_PASCAL; - allocate_tiles_q2_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); -#else - (void) vec_dot_q2_K_q8_1_mul_mat; - bad_arch(); -#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q3_K_RDNA2 128 @@ -4808,14 +4374,7 @@ mul_mat_q2_K( #define NWARPS_Q3_K_PASCAL 8 template static void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA - __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) -#endif // __CUDA_ARCH__ < CC_VOLTA - mul_mat_q3_K( +mul_mat_q3_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K, @@ -4826,22 +4385,7 @@ template static void int * tile_x_qh = nullptr; int * tile_x_sc = nullptr; -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q3_K_RDNA2; - const int mmq_y = MMQ_Y_Q3_K_RDNA2; - const int nwarps = NWARPS_Q3_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q3_K_RDNA1; - const int mmq_y = MMQ_Y_Q3_K_RDNA1; - const int nwarps = NWARPS_Q3_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - allocate_tiles_q3_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); - -#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA +//sycl_todo: change according to hardware const int mmq_x = MMQ_X_Q3_K_AMPERE; const int mmq_y = MMQ_Y_Q3_K_AMPERE; const int nwarps = NWARPS_Q3_K_AMPERE; @@ -4853,19 +4397,6 @@ template static void vec_dot_q3_K_q8_1_mul_mat>( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); - -#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q3_K_PASCAL; - const int mmq_y = MMQ_Y_Q3_K_PASCAL; - const int nwarps = NWARPS_Q3_K_PASCAL; - allocate_tiles_q3_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); -#else - (void) vec_dot_q3_K_q8_1_mul_mat; - bad_arch(); -#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q4_K_RDNA2 64 @@ -4888,13 +4419,6 @@ template static void #define NWARPS_Q4_K_PASCAL 8 template static void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA - __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) -#endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q4_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, @@ -4906,22 +4430,7 @@ template static void int * tile_x_qh = nullptr; int * tile_x_sc = nullptr; -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q4_K_RDNA2; - const int mmq_y = MMQ_Y_Q4_K_RDNA2; - const int nwarps = NWARPS_Q4_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q4_K_RDNA1; - const int mmq_y = MMQ_Y_Q4_K_RDNA1; - const int nwarps = NWARPS_Q4_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - allocate_tiles_q4_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); - -#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA +//sycl_todo: change according to hardware const int mmq_x = MMQ_X_Q4_K_AMPERE; const int mmq_y = MMQ_Y_Q4_K_AMPERE; const int nwarps = NWARPS_Q4_K_AMPERE; @@ -4932,19 +4441,6 @@ template static void vec_dot_q4_K_q8_1_mul_mat>( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); - -#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q4_K_PASCAL; - const int mmq_y = MMQ_Y_Q4_K_PASCAL; - const int nwarps = NWARPS_Q4_K_PASCAL; - allocate_tiles_q4_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); -#else - (void) vec_dot_q4_K_q8_1_mul_mat; - bad_arch(); -#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q5_K_RDNA2 64 @@ -4967,11 +4463,6 @@ template static void #define NWARPS_Q5_K_PASCAL 8 template static void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q5_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, @@ -4983,22 +4474,7 @@ mul_mat_q5_K( int * tile_x_qh = nullptr; int * tile_x_sc = nullptr; -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q5_K_RDNA2; - const int mmq_y = MMQ_Y_Q5_K_RDNA2; - const int nwarps = NWARPS_Q5_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q5_K_RDNA1; - const int mmq_y = MMQ_Y_Q5_K_RDNA1; - const int nwarps = NWARPS_Q5_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - allocate_tiles_q5_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); - -#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA +//sycl_todo: change according to hardware const int mmq_x = MMQ_X_Q5_K_AMPERE; const int mmq_y = MMQ_Y_Q5_K_AMPERE; const int nwarps = NWARPS_Q5_K_AMPERE; @@ -5009,19 +4485,6 @@ mul_mat_q5_K( vec_dot_q5_K_q8_1_mul_mat>( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); - -#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q5_K_PASCAL; - const int mmq_y = MMQ_Y_Q5_K_PASCAL; - const int nwarps = NWARPS_Q5_K_PASCAL; - allocate_tiles_q5_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); -#else - (void) vec_dot_q5_K_q8_1_mul_mat; - bad_arch(); -#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q6_K_RDNA2 64 @@ -5044,13 +4507,6 @@ mul_mat_q5_K( #define NWARPS_Q6_K_PASCAL 8 template static void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA - __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) -#endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q6_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, @@ -5061,22 +4517,7 @@ template static void int * tile_x_qh = nullptr; // int * tile_x_sc = nullptr; -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q6_K_RDNA2; - const int mmq_y = MMQ_Y_Q6_K_RDNA2; - const int nwarps = NWARPS_Q6_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q6_K_RDNA1; - const int mmq_y = MMQ_Y_Q6_K_RDNA1; - const int nwarps = NWARPS_Q6_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - allocate_tiles_q6_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); - -#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA +//sycl_todo: change according to hardware const int mmq_x = MMQ_X_Q6_K_AMPERE; const int mmq_y = MMQ_Y_Q6_K_AMPERE; const int nwarps = NWARPS_Q6_K_AMPERE; @@ -5087,19 +4528,6 @@ template static void vec_dot_q6_K_q8_1_mul_mat>( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); - -#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q6_K_PASCAL; - const int mmq_y = MMQ_Y_Q6_K_PASCAL; - const int nwarps = NWARPS_Q6_K_PASCAL; - allocate_tiles_q6_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - mul_mat_q, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); -#else - (void) vec_dot_q6_K_q8_1_mul_mat; - bad_arch(); -#endif // __CUDA_ARCH__ >= CC_VOLTA } template @@ -5167,11 +4595,11 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * const int y_offset = qr == 1 ? 1 : qk/2; // partial sum for each thread -#ifdef GGML_CUDA_F16 - half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#ifdef GGML_SYCL_F16 + sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics #else float tmp = 0.0f; -#endif // GGML_CUDA_F16 +#endif // GGML_SYCL_F16 for (int i = 0; i < ncols; i += iter_stride) { const int col = i + vals_per_iter*tid; @@ -5191,15 +4619,15 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * // matrix multiplication // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 -#ifdef GGML_CUDA_F16 - tmp += __hmul2(v, { - y[iybs + iqs + j/qr + 0], - y[iybs + iqs + j/qr + y_offset] - }); +#ifdef GGML_SYCL_F16 + dfloat2 t1{y[iybs + iqs + j / qr + 0], + y[iybs + iqs + j / qr + y_offset]}; + + tmp += v * t1; #else tmp += v.x() * y[iybs + iqs + j / qr + 0]; tmp += v.y() * y[iybs + iqs + j / qr + y_offset]; -#endif // GGML_CUDA_F16 +#endif // GGML_SYCL_F16 } } @@ -5211,11 +4639,11 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * } if (tid == 0) { -#ifdef GGML_CUDA_F16 - dst[row] = tmp.x + tmp.y; +#ifdef GGML_SYCL_F16 + dst[row] = tmp.x() + tmp.y(); #else dst[row] = tmp; -#endif // GGML_CUDA_F16 +#endif // GGML_SYCL_F16 } } @@ -8728,7 +8156,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -#if !defined(GGML_USE_HIPBLAS) // pool with virtual memory /* DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported. @@ -8801,10 +8228,7 @@ catch (sycl::exception const &exc) { << ", line:" << __LINE__ << std::endl; std::exit(1); } -#else -#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg -#define ggml_cuda_pool_free ggml_cuda_pool_free_leg -#endif // !defined(GGML_USE_HIPBLAS) + template struct cuda_pool_alloc { @@ -8924,16 +8348,13 @@ void ggml_init_cublas() try { g_tensor_split[id] = total_vram; total_vram += prop.get_global_mem_size(); -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; -#else /* DPCT1005:87: The SYCL device version is different from CUDA Compute Compatibility. You may need to rewrite this code. */ g_device_caps[id].cc = 100 * prop.get_major_version() + 10 * prop.get_minor_version(); -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + // g_device_caps[id].cc = 9000; printf("g_device_caps[%d].cc=%d\n", id, g_device_caps[id].cc); } @@ -9555,29 +8976,6 @@ static int64_t get_row_rounding(ggml_type type) { } } -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - switch(type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - return max_compute_capability >= CC_RDNA2 ? 128 : 64; - case GGML_TYPE_F16: - case GGML_TYPE_F32: - return 1; - case GGML_TYPE_Q2_K: - return max_compute_capability >= CC_RDNA2 ? 128 : 32; - case GGML_TYPE_Q3_K: - return min_compute_capability < CC_RDNA2 ? 128 : 64; - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - return max_compute_capability >= CC_RDNA2 ? 128 : 64; - default: - GGML_ASSERT(false); - } -#else switch(type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -9599,7 +8997,6 @@ static int64_t get_row_rounding(ggml_type type) { default: GGML_ASSERT(false); } -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } inline void ggml_cuda_op_mul_mat_vec_q( @@ -9668,9 +9065,9 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( const int64_t row_diff = row_high - row_low; // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics -#ifdef GGML_CUDA_F16 - cuda_pool_alloc src1_dfloat_a; - half * src1_dfloat = nullptr; // dfloat == half +#ifdef GGML_SYCL_F16 + cuda_pool_alloc src1_dfloat_a; + sycl::half *src1_dfloat = nullptr; // dfloat == half bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || @@ -9679,13 +9076,13 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( if (src1_convert_f16) { src1_dfloat = src1_dfloat_a.alloc(ne00); - ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00, - ne00, 1, sizeof(float), 0, 0, - ne00, 1, sizeof(half), 0, 0, stream); + ggml_cpy_f32_f16_cuda((const char *)src1_ddf_i, (char *)src1_dfloat, + ne00, ne00, 1, sizeof(float), 0, 0, ne00, 1, + sizeof(sycl::half), 0, 0, stream); } #else const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion -#endif // GGML_CUDA_F16 +#endif // GGML_SYCL_F16 switch (src0->type) { case GGML_TYPE_Q4_0: @@ -9761,9 +9158,11 @@ inline void ggml_cuda_op_mul_mat_cublas( const int compute_capability = g_device_caps[id].cc; + // if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { + // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 - GGML_SYCL_DEBUG("ggml_cuda_op_mul_mat_cublas - fp16 path\n"); + // GGML_SYCL_DEBUG("ggml_cuda_op_mul_mat_cublas - fp16 path\n"); cuda_pool_alloc src0_as_f16; if (src0->type != GGML_TYPE_F16) { const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type); @@ -9805,7 +9204,7 @@ inline void ggml_cuda_op_mul_mat_cublas( to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream); } else { - GGML_SYCL_DEBUG("ggml_cuda_op_mul_mat_cublas - fp32 path\n"); + // GGML_SYCL_DEBUG("ggml_cuda_op_mul_mat_cublas - fp32 path\n"); cuda_pool_alloc src0_ddq_as_f32; if (src0->type != GGML_TYPE_F32) { @@ -11023,7 +10422,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 if (use_mul_mat_vec_q) { // NOTE: this kernel does not support ggml_nrows(src1) > 1 - GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_vec_q path\n"); + // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_vec_q path\n"); ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); } else { // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_dequantize_mul_mat_vec path\n"); @@ -11037,12 +10436,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) { use_mul_mat_q = false; } - // use_mul_mat_q = false;//zjy + if (use_mul_mat_q) { - GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_q path\n"); + // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_q path\n"); ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); } else { - //GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_cublas path\n"); + // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_cublas path\n"); ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } } diff --git a/setup.sh b/setup.sh index c7e6d47be..b5d3809ca 100755 --- a/setup.sh +++ b/setup.sh @@ -3,5 +3,7 @@ cd build source /opt/intel/oneapi/setvars.sh #cmake .. -DLLAMA_CLBLAST=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + +#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake --build . --config Release -v