llama : disable moe offloading with SYCL

Author: slaren
Date:   2024-04-18 15:16:43 +02:00
Commit: ba5b5467d1 (parent bd17f27ce2)

2 changed files with 8 additions and 1 deletion

ggml-sycl.cpp

@@ -17752,7 +17752,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
 GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
     GGML_UNUSED(backend);
 }
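
The offload_op callback tells the ggml scheduler whether an operation on CPU-resident weights is worth copying to the device for a large batch (here, at least 32 rows in op->ne[1]). Excluding GGML_OP_MUL_MAT_ID stops SYCL from offloading the indirect matrix multiplications that MoE models use to route tokens through experts. A minimal sketch of the predicate's effect, using simplified stand-in types rather than the real ggml headers:

    /* Stand-in types, not the real ggml definitions. */
    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { GGML_OP_MUL_MAT, GGML_OP_GET_ROWS, GGML_OP_MUL_MAT_ID } ggml_op;
    typedef struct { ggml_op op; long ne[4]; } ggml_tensor;

    /* Same logic as the new ggml_backend_sycl_offload_op body above. */
    static bool sycl_offload_op(const ggml_tensor * op) {
        const int min_batch_size = 32;
        return op->ne[1] >= min_batch_size
            && op->op != GGML_OP_GET_ROWS
            && op->op != GGML_OP_MUL_MAT_ID;
    }

    int main(void) {
        ggml_tensor dense  = { GGML_OP_MUL_MAT,    {4096, 512, 1, 1} };
        ggml_tensor expert = { GGML_OP_MUL_MAT_ID, {4096, 512, 1, 1} };
        printf("dense  matmul offloaded: %d\n", sycl_offload_op(&dense));  /* 1 */
        printf("expert matmul offloaded: %d\n", sycl_offload_op(&expert)); /* 0 after this commit */
        return 0;
    }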

llama.cpp

@@ -4495,6 +4495,13 @@ static bool llm_load_tensors(
     auto & hparams = model.hparams;
 
+#ifdef GGML_USE_SYCL
+    // disable MoE with SYCL until mul_mat_id is updated
+    if (hparams.n_expert > 0) {
+        n_gpu_layers = 0;
+    }
+#endif
+
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
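
On the loader side, any model with experts (hparams.n_expert > 0) has its GPU layer count forced to zero when the build defines GGML_USE_SYCL, so MoE models run entirely on the CPU regardless of the layer count the user requested. A minimal sketch of that guard, factored into a hypothetical helper (effective_gpu_layers is not a llama.cpp function); compile with -DGGML_USE_SYCL to see the SYCL behavior:

    #include <stdint.h>
    #include <stdio.h>

    struct hparams_sketch { uint32_t n_expert; };

    static int effective_gpu_layers(const struct hparams_sketch * hp, int n_gpu_layers) {
    #ifdef GGML_USE_SYCL
        /* disable MoE with SYCL until mul_mat_id is updated */
        if (hp->n_expert > 0) {
            return 0;
        }
    #endif
        (void) hp; /* unused in non-SYCL builds */
        return n_gpu_layers;
    }

    int main(void) {
        struct hparams_sketch moe   = { .n_expert = 8 };  /* e.g. a Mixtral-style model */
        struct hparams_sketch dense = { .n_expert = 0 };
        printf("MoE   model GPU layers: %d\n", effective_gpu_layers(&moe,   33));
        printf("dense model GPU layers: %d\n", effective_gpu_layers(&dense, 33));
        return 0;
    }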