Added -no-cnv flag to force instruct models to continuously generate tokens

AbdulMuqeet Mohammed 2025-02-03 00:13:50 -05:00
parent 6eecde3cc8
commit ccfdca810e


@@ -6,22 +6,31 @@
 source /opt/intel/oneapi/setvars.sh
-#export GGML_SYCL_DEBUG=1
+# export GGML_SYCL_DEBUG=1
-#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+export ZES_ENABLE_SYSMAN=1
+# Enable this to allow llama.cpp to check the free memory of the GPU by using:
+# sycl::aspect::ext_intel_free_memory
+#
+# It's recommended to use this when using --split-mode=layer so that llama.cpp
+# can better optimize the distribution of layers across the CPU and GPU.
 INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
-MODEL_FILE=models/llama-2-7b.Q4_0.gguf
+MODEL_FILE="models/llama-2-7b.Q4_0.gguf"
 NGL=33
-CONEXT=8192
+CONTEXT=8192
 if [ $# -gt 0 ]; then
     GGML_SYCL_DEVICE=$1
-    echo "use $GGML_SYCL_DEVICE as main GPU"
-    #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} -mg $GGML_SYCL_DEVICE -sm none
+    echo "Using ${GGML_SYCL_DEVICE} as the main GPU"
+    # Use on a single GPU
+    EXTRA_ARGS="-mg ${GGML_SYCL_DEVICE} -sm none"
 else
-    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT}
+    # Use on multiple processors with the same max-compute units
+    EXTRA_ARGS=""
 fi
+./build/bin/llama-cli -m "${MODEL_FILE}" -p "${INPUT_PROMPT}" -n 400 -no-cnv -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${EXTRA_ARGS}
+# The "-no-cnv" flag is to force non-base "instruct" models to continue.
+# This way, we can automatically test this prompt without interference.
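A note on the new flag, with a minimal sketch (the chat-tuned model file name below is an assumption for illustration, not part of this commit): recent llama-cli builds switch into interactive conversation mode for models that ship a chat template, which would make a scripted run like this one stop and wait for user input; -no-cnv disables that so the model simply continues the prompt given with -p.

# Without -no-cnv, a chat/instruct model may drop into interactive conversation mode:
./build/bin/llama-cli -m models/llama-2-7b-chat.Q4_0.gguf -p "Hello" -n 64

# With -no-cnv, the prompt is completed non-interactively and the run exits after -n tokens:
./build/bin/llama-cli -m models/llama-2-7b-chat.Q4_0.gguf -p "Hello" -n 64 -no-cnv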
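A usage sketch for the updated script itself, assuming it is the SYCL run-llama2.sh example invoked from the repository root (the file path is not shown on this page, so it is an assumption):

# Default path: no argument, so EXTRA_ARGS stays empty and the model layers are
# split across the available SYCL GPUs.
./examples/sycl/run-llama2.sh

# Single-GPU path: pass a SYCL device index; the script expands it into
# "-mg <index> -sm none" so only that device is used as the main GPU.
./examples/sycl/run-llama2.sh 0

# Device indices can be checked beforehand with the oneAPI utility:
sycl-ls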