llama : disable moe offloading with SYCL
This commit is contained in:
parent
bd17f27ce2
commit
ba5b5467d1
2 changed files with 8 additions and 1 deletion
|
@@ -17752,7 +17752,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|||
|
||||
// Decide whether an op is worth offloading to the SYCL backend.
//
// Offloading only pays off for large batches (at least `min_batch_size`
// rows in op->ne[1]). GGML_OP_GET_ROWS is always excluded, and
// GGML_OP_MUL_MAT_ID (used for MoE expert routing) is excluded until the
// SYCL mul_mat_id kernel is updated — see the matching n_expert guard in
// llm_load_tensors in this commit.
//
// backend: unused (required by the ggml backend interface signature).
// op:      the tensor operation being considered for offload.
// returns: true if the op should be offloaded to this backend.
GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
    const int min_batch_size = 32;

    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;

    GGML_UNUSED(backend);
}
|
||||
|
||||
|
|
|
@@ -4495,6 +4495,13 @@ static bool llm_load_tensors(
|
|||
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
// disable MoE with SYCL until mul_mat_id is updated
|
||||
if (hparams.n_expert > 0) {
|
||||
n_gpu_layers = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
model.split_mode = split_mode;
|
||||
model.main_gpu = main_gpu;
|
||||
model.n_gpu_layers = n_gpu_layers;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue