refactor fp16 logic, only consider used devices

This commit is contained in:
JohannesGaessler 2024-02-11 18:52:49 +01:00
parent 76a0128bec
commit 005de593ad

View file

@ -9914,13 +9914,22 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
int64_t min_compute_capability = INT_MAX;
bool any_pascal_with_slow_fp16 = false;
if (split) {
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
auto & tensor_split = buft_ctx->tensor_split;
for (int id = 0; id < g_device_count; ++id) {
if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
// skip devices that are not going to do any work:
if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
continue;
}
if (min_compute_capability > g_device_caps[id].cc) {
min_compute_capability = g_device_caps[id].cc;
}
if (g_device_caps[id].cc == 610) {
any_pascal_with_slow_fp16 = true;
}
}
} else {
min_compute_capability = g_device_caps[g_main_device].cc;
@ -9949,13 +9958,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
#else
// fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
bool fp16_performance_good = true;
for (int id = 0; id < g_device_count; ++id) {
if (g_device_caps[id].cc < CC_VOLTA && g_device_caps[id].cc != CC_PASCAL) {
fp16_performance_good = false;
break;
}
}
const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
// mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;