cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
This commit is contained in:
slaren 2023-12-26 21:23:59 +01:00 committed by GitHub
parent de8e496437
commit dc68f0054c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 243 additions and 246 deletions

3
ggml.c
View file

@ -4041,7 +4041,6 @@ static struct ggml_tensor * ggml_group_norm_impl(
result->op = GGML_OP_GROUP_NORM;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = NULL; // TODO: maybe store epsilon here?
return result;
}
@ -5541,7 +5540,6 @@ static struct ggml_tensor * ggml_upscale_impl(
result->op_params[0] = scale_factor;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = NULL;
return result;
}
@ -5846,7 +5844,6 @@ struct ggml_tensor * ggml_get_rel_pos(
result->op = GGML_OP_GET_REL_POS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = NULL;
return result;
}