ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
  cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-04-18 15:18:48 +02:00 · 2024-04-18 15:18:48 +02:00 · 0d56246f4b
commit 0d56246f4b
parent 03c0946d73
12 changed files with 971 additions and 821 deletions
--- a/scripts/compare-commits.sh
+++ b/scripts/compare-commits.sh
@@ -12,19 +12,7 @@ bench_args="${@:3}"

 rm -f llama-bench.sqlite

-backend="cpu"
-
-if [[ "$OSTYPE" == "darwin"* ]]; then
-    backend="metal"
-elif command -v nvcc &> /dev/null; then
-    backend="cuda"
-fi
-
-make_opts=""
-
-if [[ "$backend" == "cuda" ]]; then
-    make_opts="LLAMA_CUDA=1"
-fi
+# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...)

 git checkout $1
 make clean && make -j32 $make_opts llama-bench