Merge 165edb3b2a into d7b31a9d84
commit e50c074c70
3 changed files with 64 additions and 8 deletions
@@ -71,6 +71,62 @@
 #define GGML_CUDA_CC_QY1 210
 #define GGML_CUDA_CC_QY2 220
 
+#ifdef __CUDA_ARCH_LIST__
+constexpr bool ggml_cuda_has_arch_impl(int) {
+    return false;
+}
+
+template<class ... Archs>
+constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
+    return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
+}
+
+constexpr bool ggml_cuda_has_arch(const int arch) {
+    return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
+}
+
+static int ggml_cuda_highest_compiled_arch(const int arch) {
+    switch (arch) {
+        case 1200: if (ggml_cuda_has_arch(1200)) return 1200; [[fallthrough]];
+        case 1010: if (ggml_cuda_has_arch(1010)) return 1010; [[fallthrough]];
+        case 1000: if (ggml_cuda_has_arch(1000)) return 1000; [[fallthrough]];
+        case 900: if (ggml_cuda_has_arch( 900)) return 900; [[fallthrough]];
+        case 890: if (ggml_cuda_has_arch( 890)) return 890; [[fallthrough]];
+        case 870: if (ggml_cuda_has_arch( 870)) return 870; [[fallthrough]];
+        case 860: if (ggml_cuda_has_arch( 860)) return 860; [[fallthrough]];
+        case 800: if (ggml_cuda_has_arch( 800)) return 800; [[fallthrough]];
+        case 750: if (ggml_cuda_has_arch( 750)) return 750; [[fallthrough]];
+        case 720: if (ggml_cuda_has_arch( 720)) return 720; [[fallthrough]];
+        case 700: if (ggml_cuda_has_arch( 700)) return 700; [[fallthrough]];
+        case 620: if (ggml_cuda_has_arch( 620)) return 620; [[fallthrough]];
+        case 610: if (ggml_cuda_has_arch( 610)) return 610; [[fallthrough]];
+        case 600: if (ggml_cuda_has_arch( 600)) return 600; [[fallthrough]];
+        case 530: if (ggml_cuda_has_arch( 530)) return 530; [[fallthrough]];
+        case 520: if (ggml_cuda_has_arch( 520)) return 520; [[fallthrough]];
+        case 500: if (ggml_cuda_has_arch( 500)) return 500; [[fallthrough]];
+        case 370: if (ggml_cuda_has_arch( 370)) return 370; [[fallthrough]];
+        case 350: if (ggml_cuda_has_arch( 350)) return 350; [[fallthrough]];
+        case 320: if (ggml_cuda_has_arch( 320)) return 320; [[fallthrough]];
+        case 300: if (ggml_cuda_has_arch( 300)) return 300; [[fallthrough]];
+        case 210: if (ggml_cuda_has_arch( 210)) return 210; [[fallthrough]];
+        case 200: if (ggml_cuda_has_arch( 200)) return 200; [[fallthrough]];
+        case 130: if (ggml_cuda_has_arch( 130)) return 130; [[fallthrough]];
+        case 120: if (ggml_cuda_has_arch( 120)) return 120; [[fallthrough]];
+        case 110: if (ggml_cuda_has_arch( 110)) return 110; [[fallthrough]];
+        case 100: if (ggml_cuda_has_arch( 100)) return 100;
+            GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch);
+
+        default: GGML_ABORT("unknown CUDA arch: %d", arch);
+    }
+}
+#else
+static int ggml_cuda_highest_compiled_arch(const int arch) {
+    return arch;
+}
+#endif // __CUDA_ARCH_LIST__
+
+// ---------------------------------------------------------------------------------------------------------
+
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #if defined(_MSC_VER)
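Note (annotation, not part of the diff): recent nvcc versions define __CUDA_ARCH_LIST__ as the comma-separated list of virtual architectures the unit is being compiled for, so ggml_cuda_has_arch_impl can splat that list into a parameter pack and do a compile-time membership test; the single-int overload returning false is the recursion's base case, and ggml_cuda_highest_compiled_arch then walks down the known architectures to the best one actually present in the build. A minimal host-only sketch of the same recursion, with a hypothetical hardcoded list standing in for __CUDA_ARCH_LIST__:

// Standalone sketch, compilable as plain C++; "520, 610, 750" is a hypothetical
// stand-in for what __CUDA_ARCH_LIST__ would expand to in a real build.
#include <cstdio>

constexpr bool has_arch_impl(int) {
    return false;   // base case: the list was exhausted without a match
}

template<class ... Archs>
constexpr bool has_arch_impl(const int arch, const int first, Archs... rest) {
    return arch == first || has_arch_impl(arch, rest...);
}

constexpr bool has_arch(const int arch) {
    return has_arch_impl(arch, 520, 610, 750);
}

int main() {
    static_assert( has_arch(610), "610 is in the hardcoded list");
    static_assert(!has_arch(900), "900 is not in the hardcoded list");
    printf("has_arch(750) = %d\n", has_arch(750)); // prints 1
    return 0;
}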
@@ -162,18 +218,18 @@ typedef float2 dfloat2;
 #define FLASH_ATTN_AVAILABLE
 #endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 
-static constexpr bool fast_fp16_available(const int cc) {
-    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
+static bool fast_fp16_available(const int cc) {
+    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL && cc != 610;
 }
 
 // Any FP16 tensor cores are available.
-static constexpr bool fp16_mma_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
+static bool fp16_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
 }
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
-static constexpr bool new_mma_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
+static bool new_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }
 
 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
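Note (annotation, not part of the diff): dropping constexpr and routing these checks through ggml_cuda_highest_compiled_arch() changes their meaning from "the device supports this feature" to "the device supports it and the binary actually contains code for it". A hypothetical illustration with stand-in values (700 for Volta, 750 for Turing) and a build compiled only for arch 700:

// Standalone sketch; the constants and the single-arch build are hypothetical stand-ins.
#include <cstdio>

constexpr int CC_TURING     = 750;
constexpr int CC_OFFSET_AMD = 1000000;

// Pretend the binary was compiled for arch 700 only.
static int highest_compiled_arch(const int cc) {
    return cc >= 700 ? 700 : cc;
}

static bool new_mma_old(const int cc) {   // old check: raw device capability
    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}

static bool new_mma_new(const int cc) {   // new check: clamped to the compiled archs
    return cc < CC_OFFSET_AMD && highest_compiled_arch(cc) >= CC_TURING;
}

int main() {
    // Turing device (cc 750), but only sm_70 code in this hypothetical binary:
    printf("old=%d new=%d\n", new_mma_old(750), new_mma_new(750)); // old=1 new=0
    return 0;
}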
@@ -136,7 +136,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return true;
     }
 
-    if (cc < GGML_CUDA_CC_DP4A) {
+    if (ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_DP4A) {
         return false;
     }
 
@@ -86,7 +86,7 @@ struct tile_x_sizes {
     int sc;
 };
 
-static constexpr int get_mmq_x_max_host(const int cc) {
+static int get_mmq_x_max_host(const int cc) {
     return new_mma_available(cc) ? 128 :
 #ifdef GGML_CUDA_FORCE_MMQ
     cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64;