CUDA: use arch list for feature availability check

This commit is contained in:
Johannes Gäßler 2025-02-09 19:30:15 +01:00
parent 19d3c8293b
commit 165edb3b2a
3 changed files with 64 additions and 8 deletions

View file

@@ -71,6 +71,62 @@
#define GGML_CUDA_CC_QY1 210
#define GGML_CUDA_CC_QY2 220
#ifdef __CUDA_ARCH_LIST__
// Terminal overload of the variadic arch lookup: with no candidate
// architectures left to compare against, the target arch was not found.
constexpr bool ggml_cuda_has_arch_impl(int) { return false; }
// Variadic arch lookup: true iff `arch` equals `first` or any element of the
// trailing pack. Implemented as a C++17 fold over `||` (short-circuits left
// to right, exactly like the recursive formulation it replaces).
template<class ... Archs>
constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
    return arch == first || ((arch == rest) || ...);
}
// True iff this translation unit was compiled with device code for exactly `arch`.
// __CUDA_ARCH_LIST__ is an nvcc-defined macro (CUDA 11.5+): a comma-separated
// list of the virtual architectures selected via -arch/-gencode; it expands
// directly into the argument pack of the variadic lookup helper.
constexpr bool ggml_cuda_has_arch(const int arch) {
    return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
}
// Returns the highest architecture in __CUDA_ARCH_LIST__ that is <= `arch`,
// i.e. the compiled device-code variant a GPU of compute capability `arch`
// would actually run. Aborts if `arch` is not a known CUDA arch, or if no
// compiled arch is usable on it.
static int ggml_cuda_highest_compiled_arch(const int arch) {
    // Every compute capability ggml knows about, in descending order.
    static const int known_archs[] = {
        1200, 1010, 1000, 900, 890, 870, 860, 800, 750, 720, 700,
         620,  610,  600, 530, 520, 500, 370, 350, 320, 300, 210,
         200,  130,  120, 110, 100,
    };
    bool arch_is_known = false;
    for (const int candidate : known_archs) {
        if (candidate == arch) {
            arch_is_known = true; // from here on every candidate is <= arch
        }
        if (arch_is_known && ggml_cuda_has_arch(candidate)) {
            return candidate;
        }
    }
    if (arch_is_known) {
        GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch);
    }
    GGML_ABORT("unknown CUDA arch: %d", arch);
}
#else
// Fallback when __CUDA_ARCH_LIST__ is not defined (compilers other than nvcc,
// presumably — TODO confirm which toolchains hit this path): assume device
// code for the GPU's own architecture is always available.
static int ggml_cuda_highest_compiled_arch(const int arch) {
    return arch;
}
#endif // __CUDA_ARCH_LIST__
// ---------------------------------------------------------------------------------------------------------
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
#if defined(_MSC_VER)
@@ -162,18 +218,18 @@ typedef float2 dfloat2;
#define FLASH_ATTN_AVAILABLE
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
// NOTE(review): the diff renderer dropped the +/- markers here, fusing the
// removed and added definitions into one invalid span; this is the
// post-change (added) definition. It is no longer constexpr because it now
// consults the runtime arch list via ggml_cuda_highest_compiled_arch.
//
// Fast FP16 math is available when device code for CC >= GGML_CUDA_CC_PASCAL
// was compiled for this GPU, except on CC 6.1 — presumably excluded for its
// poor FP16 throughput; confirm against the kernels branching on this.
static bool fast_fp16_available(const int cc) {
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL && cc != 610;
}
// Any FP16 tensor cores are available.
// NOTE(review): reconstructed post-change definition — the diff render fused
// the removed constexpr variant with the added one. Requires an NVIDIA device
// (cc below the AMD offset) and compiled device code for CC >= Volta.
static bool fp16_mma_available(const int cc) {
    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
}
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
// NOTE(review): reconstructed post-change definition — the diff render fused
// the removed constexpr variant with the added one. Requires an NVIDIA device
// (cc below the AMD offset) and compiled device code for CC >= Turing.
static bool new_mma_available(const int cc) {
    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
}
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {

View file

@@ -136,7 +136,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return true;
}
if (cc < GGML_CUDA_CC_DP4A) {
if (ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_DP4A) {
return false;
}

View file

@@ -86,7 +86,7 @@ struct tile_x_sizes {
int sc;
};
static constexpr int get_mmq_x_max_host(const int cc) {
static int get_mmq_x_max_host(const int cc) {
return new_mma_available(cc) ? 128 :
#ifdef GGML_CUDA_FORCE_MMQ
cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64;