llama : add basic support for offloading moe with CUDA
commit 06dfde3e94
parent 2cbcba829f

3 changed files with 61 additions and 19 deletions
ggml.c | 1 -
@@ -4105,7 +4105,6 @@ struct ggml_tensor * ggml_mul_mat_id(
     result->src[0] = ids;
     result->src[1] = b;

-    // TODO: n_as is the selected experts, but it should be the total number of experts
     for (int i = 0; i < n_as; i++) {
         struct ggml_tensor * a = as[i];
         GGML_ASSERT(ggml_are_same_shape(as[0], a));
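The hunk wires the expert-selection tensor and the input into the new node (result->src[0] = ids, result->src[1] = b); the loop that follows stores the per-expert weight tensors in the remaining source slots. As a rough sketch of what that layout looks like from a backend's point of view: the helper below is hypothetical (not part of this commit), and it assumes the expert tensors land at src[i + 2], which the loop suggests but whose assignment is cut off in this excerpt.

#include "ggml.h"

// Hypothetical helper, not from the diff: walk the operand layout of a
// GGML_OP_MUL_MAT_ID node as wired up in ggml_mul_mat_id above.
// Assumed layout:
//   src[0]    = ids   (expert selection tensor)
//   src[1]    = b     (input activations)
//   src[i+2]  = as[i] (weights of expert i) -- assumption, see lead-in
static void example_inspect_mul_mat_id(const struct ggml_tensor * dst, int n_as) {
    const struct ggml_tensor * ids = dst->src[0]; // expert selection ids
    const struct ggml_tensor * b   = dst->src[1]; // input activations

    for (int i = 0; i < n_as; i++) {
        const struct ggml_tensor * a = dst->src[i + 2]; // weights of expert i
        // a CUDA backend would pick the expert(s) indicated by ids and run
        // the corresponding mat-mul(s) on the device here
        (void) a;
    }
    (void) ids;
    (void) b;
}

This per-node source layout is what lets a backend such as the CUDA backend see all expert weights of an MoE layer at once, which is a prerequisite for offloading them.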