diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 174916bc9..52ee69fff 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -71,6 +71,62 @@
 #define GGML_CUDA_CC_QY1 210
 #define GGML_CUDA_CC_QY2 220
 
+#ifdef __CUDA_ARCH_LIST__
+constexpr bool ggml_cuda_has_arch_impl(int) {
+    return false;
+}
+
+template<class ... Archs>
+constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
+    return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
+}
+
+constexpr bool ggml_cuda_has_arch(const int arch) {
+    return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
+}
+
+static int ggml_cuda_highest_compiled_arch(const int arch) {
+    switch (arch) {
+        case 1200: if (ggml_cuda_has_arch(1200)) return 1200; [[fallthrough]];
+        case 1010: if (ggml_cuda_has_arch(1010)) return 1010; [[fallthrough]];
+        case 1000: if (ggml_cuda_has_arch(1000)) return 1000; [[fallthrough]];
+        case  900: if (ggml_cuda_has_arch( 900)) return  900; [[fallthrough]];
+        case  890: if (ggml_cuda_has_arch( 890)) return  890; [[fallthrough]];
+        case  870: if (ggml_cuda_has_arch( 870)) return  870; [[fallthrough]];
+        case  860: if (ggml_cuda_has_arch( 860)) return  860; [[fallthrough]];
+        case  800: if (ggml_cuda_has_arch( 800)) return  800; [[fallthrough]];
+        case  750: if (ggml_cuda_has_arch( 750)) return  750; [[fallthrough]];
+        case  720: if (ggml_cuda_has_arch( 720)) return  720; [[fallthrough]];
+        case  700: if (ggml_cuda_has_arch( 700)) return  700; [[fallthrough]];
+        case  620: if (ggml_cuda_has_arch( 620)) return  620; [[fallthrough]];
+        case  610: if (ggml_cuda_has_arch( 610)) return  610; [[fallthrough]];
+        case  600: if (ggml_cuda_has_arch( 600)) return  600; [[fallthrough]];
+        case  530: if (ggml_cuda_has_arch( 530)) return  530; [[fallthrough]];
+        case  520: if (ggml_cuda_has_arch( 520)) return  520; [[fallthrough]];
+        case  500: if (ggml_cuda_has_arch( 500)) return  500; [[fallthrough]];
+        case  370: if (ggml_cuda_has_arch( 370)) return  370; [[fallthrough]];
+        case  350: if (ggml_cuda_has_arch( 350)) return  350; [[fallthrough]];
+        case  320: if (ggml_cuda_has_arch( 320)) return  320; [[fallthrough]];
+        case  300: if (ggml_cuda_has_arch( 300)) return  300; [[fallthrough]];
+        case  210: if (ggml_cuda_has_arch( 210)) return  210; [[fallthrough]];
+        case  200: if (ggml_cuda_has_arch( 200)) return  200; [[fallthrough]];
+        case  130: if (ggml_cuda_has_arch( 130)) return  130; [[fallthrough]];
+        case  120: if (ggml_cuda_has_arch( 120)) return  120; [[fallthrough]];
+        case  110: if (ggml_cuda_has_arch( 110)) return  110; [[fallthrough]];
+        case  100: if (ggml_cuda_has_arch( 100)) return  100;
+            GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch);
+
+        default: GGML_ABORT("unknown CUDA arch: %d", arch);
+    }
+}
+#else
+static int ggml_cuda_highest_compiled_arch(const int arch) {
+    return arch;
+}
+#endif // __CUDA_ARCH_LIST__
+
+// ---------------------------------------------------------------------------------------------------------
+
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #if defined(_MSC_VER)
@@ -162,18 +218,18 @@ typedef float2 dfloat2;
 #define FLASH_ATTN_AVAILABLE
 #endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 
-static constexpr bool fast_fp16_available(const int cc) {
-    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
+static bool fast_fp16_available(const int cc) {
+    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL && cc != 610;
 }
 
 // Any FP16 tensor cores are available.
-static constexpr bool fp16_mma_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
+static bool fp16_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
 }
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
-static constexpr bool new_mma_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
+static bool new_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }
 
 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 45212f66c..f306a5709 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -136,7 +136,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return true;
     }
 
-    if (cc < GGML_CUDA_CC_DP4A) {
+    if (ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_DP4A) {
         return false;
     }
 
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 7a2c4d85b..1328fbd34 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -86,7 +86,7 @@ struct tile_x_sizes {
     int sc;
 };
 
-static constexpr int get_mmq_x_max_host(const int cc) {
+static int get_mmq_x_max_host(const int cc) {
     return new_mma_available(cc) ? 128 :
 #ifdef GGML_CUDA_FORCE_MMQ
         cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64;
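For context on the mechanism the patch relies on: nvcc predefines `__CUDA_ARCH_LIST__` as a comma-separated list of the real architectures a binary was compiled for (e.g. `520,610,700`), and the variadic `constexpr` overloads above turn a membership test against that list into a compile-time constant. Below is a minimal, host-compilable sketch of that trick, not part of the patch; `DEMO_ARCH_LIST` and the `demo_*` names are hypothetical stand-ins for `__CUDA_ARCH_LIST__` and the `ggml_cuda_*` helpers, so the sketch builds with a plain C++ compiler.

```cpp
#include <cstdio>

// Stand-in for __CUDA_ARCH_LIST__, which only nvcc defines.
#define DEMO_ARCH_LIST 520, 610, 700

// Base case: the list is exhausted without finding a match.
constexpr bool demo_has_arch_impl(int) {
    return false;
}

// Recursive case: compare against the first list entry, then recurse on the rest.
template <class... Archs>
constexpr bool demo_has_arch_impl(const int arch, const int first, Archs... rest) {
    return arch == first || demo_has_arch_impl(arch, rest...);
}

constexpr bool demo_has_arch(const int arch) {
    // The macro expands into the argument list: demo_has_arch_impl(arch, 520, 610, 700).
    return demo_has_arch_impl(arch, DEMO_ARCH_LIST);
}

// Both checks are resolved entirely at compile time.
static_assert( demo_has_arch(610), "sm_61 is in the demo list");
static_assert(!demo_has_arch(750), "sm_75 is not in the demo list");

int main() {
    // A cc 7.5 (Turing) device with this demo list would run the cc 7.0 kernels;
    // that fallback is what ggml_cuda_highest_compiled_arch's case ladder reports.
    printf("cc 750 compiled directly: %s\n", demo_has_arch(750) ? "yes" : "no");
    return 0;
}
```

The recursion terminates through overload resolution: once `rest...` is empty, the call `demo_has_arch_impl(arch)` selects the single-argument non-template overload and yields `false`. This is also why `fast_fp16_available` and friends lose `constexpr` in the patch: they now route through `ggml_cuda_highest_compiled_arch`, whose `GGML_ABORT` paths are runtime-only.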