diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index f5bb7da86..a9b310243 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -17752,7 +17752,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
 GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;
 
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
 
     GGML_UNUSED(backend);
 }
diff --git a/llama.cpp b/llama.cpp
index 489b12028..8f889def3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4495,6 +4495,13 @@ static bool llm_load_tensors(
 
     auto & hparams = model.hparams;
 
+#ifdef GGML_USE_SYCL
+    // disable MoE with SYCL until mul_mat_id is updated
+    if (hparams.n_expert > 0) {
+        n_gpu_layers = 0;
+    }
+#endif
+
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
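
For context, here is a minimal stand-alone sketch of the predicate this patch changes: offloading is only considered worthwhile for batches of at least 32 rows, and after the patch GGML_OP_MUL_MAT_ID (the indirect matmul used for MoE expert routing) is excluded alongside GGML_OP_GET_ROWS. The stub types and the main() below are hypothetical scaffolding; only the predicate body mirrors the patched ggml_backend_sycl_offload_op.

// Hypothetical stand-alone model of the patched predicate; the real function
// takes a ggml_tensor and uses the ggml op enum, which are stubbed out here.
#include <cassert>
#include <cstdint>

enum op_kind { OP_MUL_MAT, OP_GET_ROWS, OP_MUL_MAT_ID };

struct tensor_stub {
    int64_t ne[2]; // ne[1] is the batch dimension the predicate checks
    op_kind op;
};

// Mirrors the patched ggml_backend_sycl_offload_op: offload only when the
// batch is large enough, and never for GET_ROWS or MUL_MAT_ID.
static bool sycl_offload_op_stub(const tensor_stub & t) {
    const int min_batch_size = 32;
    return t.ne[1] >= min_batch_size && t.op != OP_GET_ROWS && t.op != OP_MUL_MAT_ID;
}

int main() {
    assert( sycl_offload_op_stub({{4096, 64}, OP_MUL_MAT}));    // large batch: offload
    assert(!sycl_offload_op_stub({{4096,  8}, OP_MUL_MAT}));    // below min_batch_size
    assert(!sycl_offload_op_stub({{4096, 64}, OP_MUL_MAT_ID})); // MoE matmul: excluded
    return 0;
}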