llama : add basic support for offloading MoE with CUDA

This commit is contained in:
slaren 2023-12-09 13:21:09 +01:00
parent 2cbcba829f
commit 06dfde3e94
3 changed files with 61 additions and 19 deletions

1
ggml.c
View file

@ -4105,7 +4105,6 @@ struct ggml_tensor * ggml_mul_mat_id(
result->src[0] = ids;
result->src[1] = b;
// TODO: n_as is the selected experts, but it should be the total number of experts
for (int i = 0; i < n_as; i++) {
struct ggml_tensor * a = as[i];
GGML_ASSERT(ggml_are_same_shape(as[0], a));