From f80ff4dc6a545f84e0ea949b0b70c38ffa166c40 Mon Sep 17 00:00:00 2001
From: crasm
Date: Tue, 19 Dec 2023 01:43:27 -0500
Subject: [PATCH] ci : get ci/run.sh working with test-model-load-cancel

---
 ci/run.sh                        | 247 +++++++++++++++++--------------
 requirements.txt                 |   8 +-
 tests/test-model-load-cancel.cpp |  36 +++--
 3 files changed, 167 insertions(+), 124 deletions(-)

diff --git a/ci/run.sh b/ci/run.sh
index 2fb214299..a1b978a0f 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -1,4 +1,4 @@
-#/bin/bash
+#!/bin/bash
 #
 # sample usage:
 #
@@ -11,6 +11,8 @@
 # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
 
+set -u # Fail on unset variables
+
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
     exit 1
@@ -30,8 +32,20 @@
 sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`
 
+# Read-only array of quantization types for iteration.
+# Use ${quants[@]:1} to skip f16.
+declare -ra quants=( f16 q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k )
+
 ## helpers
 
+# Print an error message to stderr and exit with an error.
+# usage: die <format> [args...]
+function die {
+    local format="$1"; shift
+    >&2 printf "$format" "$@"
+    exit 1
+}
+
 # download a file if it does not exist or if it is outdated
 function gg_wget {
     local out=$1
@@ -77,7 +91,9 @@ function gg_run {
 function gg_run_ctest_debug {
     cd ${SRC}
 
-    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
+    rm -rf build-ci-debug
+    mkdir build-ci-debug
+    cd build-ci-debug
 
     set -e
 
@@ -105,14 +121,16 @@ function gg_sum_ctest_debug {
 function gg_run_ctest_release {
     cd ${SRC}
 
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+    rm -rf build-ci-release
+    mkdir build-ci-release
+    cd build-ci-release
 
     set -e
 
     (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
 
-    if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    if [[ -z ${GG_BUILD_LOW_PERF+x} ]]; then
         (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
     else
         (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -133,6 +151,7 @@ function gg_sum_ctest_release {
 function gg_run_ctest_with_model {
     cd ${SRC}
+    cd build-ci-release
     set -e
     (time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest_with_model.log
     set +e
@@ -151,81 +170,70 @@ function gg_sum_ctest_with_model {
 # open_llama_3b_v2
 
 function gg_run_open_llama_3b_v2 {
-    cd ${SRC}
+    # We use absolute paths here to not have to track CWD as much
+    local models_mnt="$(realpath "${SRC}/models-mnt")"
+    local path_models="${models_mnt}/open-llama/3B-v2"
+    local path_wiki="${models_mnt}/wikitext"
+    local path_wiki_raw="${path_wiki}/wikitext-2-raw"
 
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
+    mkdir -p "${path_models}" "${path_wiki}"
 
-    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+    gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
+    gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
+    gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
+    gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
+    gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
+    gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
 
-    path_models="../models-mnt/open-llama/3B-v2"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+    gg_wget "${path_wiki}" https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    unzip -o "${path_wiki}/wikitext-2-raw-v1.zip" -d "${path_wiki}"
+    head -n 60 "${path_wiki_raw}/wiki.test.raw" > "${path_wiki_raw}/wiki.test-60.raw"
 
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+    rm -rf "${SRC}/build-ci-release"
+    mkdir "${SRC}/build-ci-release"
+    cd "${SRC}/build-ci-release"
 
     set -e
 
-    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a "${OUT}/${ci}-cmake.log"
+    (time make -j ) 2>&1 | tee -a "${OUT}/${ci}-make.log"
 
-    python3 ../convert.py ${path_models}
+    python3 "${SRC}/convert.py" "${path_models}"
 
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+    # Get the model path for a quantization
+    # usage: model_for <quant>
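+    # e.g. "$(model_for q8_0)" expands to "${path_models}/ggml-model-q8_0.gguf"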
+    function model_for {
+        if (( $# != 1 )); then
+            die 'model_for takes a single quantization, such as q8_0'
+        fi
+        echo -n "${path_models}/ggml-model-$1.gguf"
+    }
 
-    wiki_test_60="${path_wiki}/wiki.test-60.raw"
+    wiki_test_60="${path_wiki_raw}/wiki.test-60.raw"
 
-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+    # Quantize q8_0 through q6_k
+    for q in "${quants[@]:1}"; do
+        ./bin/quantize "$(model_for f16)" "$(model_for "${q}")" "${q}"
+    done
 
-    (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    # Run basic inference for all quants
+    for q in "${quants[@]}"; do
+        ( time \
+            ./bin/main --model "$(model_for "${q}")" -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is"
+        ) 2>&1 | tee -a "${OUT}/${ci}-tg-${q}.log"
+    done
 
-    (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    # Run perplexity with wiki_test_60, appending to the same per-quant logs
+    # used above; check_ppl later reads the "[1]" line from each log
+    for q in "${quants[@]}"; do
+        ( time \
+            ./bin/perplexity --model "$(model_for "${q}")" -f "${wiki_test_60}" -c 128 -b 128 --chunks 2
+        ) 2>&1 | tee -a "${OUT}/${ci}-tg-${q}.log"
+    done
 
-    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    # Run examples/save-load-state with q4_0
+    ( time \
+        ./bin/save-load-state --model "$(model_for q4_0)"
+    ) 2>&1 | tee -a "${OUT}/${ci}-save-load-state.log"
 
     function check_ppl {
         qnt="$1"
@@ -240,17 +248,11 @@ function gg_run_open_llama_3b_v2 {
         return 0
     }
 
-    check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    # Check perplexity results for all quants (each quant's own log, not
+    # just the f16 log)
+    for q in "${quants[@]}"; do
+        check_ppl "$q" "$(cat "${OUT}/${ci}-tg-${q}.log" | grep "^\[1\]")" \
+            | tee -a "${OUT}/${ci}-ppl.log"
+    done
 
     # lora
     function compare_ppl {
@@ -267,32 +269,42 @@ function gg_run_open_llama_3b_v2 {
         return 0
     }
 
-    path_lora="../models-mnt/open-llama/3B-v2/lora"
-    path_shakespeare="../models-mnt/shakespeare"
+    local path_lora="${path_models}/lora"
+    local path_shakespeare="${models_mnt}/shakespeare"
 
-    shakespeare="${path_shakespeare}/shakespeare.txt"
-    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
+    local shakespeare="${path_shakespeare}/shakespeare.txt"
+    local lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
 
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
-    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
+    gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
+    gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+    gg_wget "${path_shakespeare}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
 
-    python3 ../convert-lora-to-ggml.py ${path_lora}
+    python3 "${SRC}/convert-lora-to-ggml.py" "${path_lora}"
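+
+    # The compare_ppl checks below expect perplexity not to get worse once the
+    # LoRA adapter is applied, since the adapter was trained on this same text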
"^\[1\]")" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-f16.log" | grep "^\[1\]")" \ + | tee -a "${OUT}/${ci}-lora-ppl.log" # q8_0 - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log - compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-q8_0.log" + (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-lora-q8_0.log" + compare_ppl "q8_0 shakespeare" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0.log" | grep "^\[1\]")" \ + | tee -a "${OUT}/${ci}-lora-ppl.log" # q8_0 + f16 lora-base - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log - compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - + ( time \ + ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" --lora-base "$(model_for f16)" -c 128 -b 128 --chunks 2 + ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" + compare_ppl "q8_0 / f16 base shakespeare" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" | grep "^\[1\]")" \ + | tee -a "${OUT}/${ci}-lora-ppl.log" set +e } @@ -502,31 +514,42 @@ function gg_sum_open_llama_7b_v2 { ## main -if [ -z ${GG_BUILD_LOW_PERF} ]; then - rm -rf ${SRC}/models-mnt - - mnt_models=${MNT}/models - mkdir -p ${mnt_models} - ln -sfn ${mnt_models} ${SRC}/models-mnt - - python3 -m pip install -r ${SRC}/requirements.txt - python3 -m pip install --editable gguf-py -fi - ret=0 +# This is necessary to test if a variable is set while `set -u` is enabled. +# see: https://stackoverflow.com/a/13864829 +# [[ -z ${var+x} ]] evaluates to false if var is set +# [[ ! -z ${var+x} ]] evaluates to true if var is set +if [[ ! 
+if [[ ! -z ${GG_BUILD_LOW_PERF+x} ]]; then
+    test "${ret}" -eq 0 && gg_run ctest_debug
+    test "${ret}" -eq 0 && gg_run ctest_release
+    exit "${ret}"
+fi # Otherwise, do extended testing
+
+rm -rf "${SRC}/models-mnt"
+
+mnt_models="${MNT}/models"
+mkdir -p "${mnt_models}"
+ln -sfn "${mnt_models}" "${SRC}/models-mnt"
+
+# Create a fresh python3 venv and enter it
+python3 -m venv "${MNT}/venv"
+source "${MNT}/venv/bin/activate"
+
+pip install --disable-pip-version-check -r "${SRC}/requirements.txt"
+pip install --disable-pip-version-check --editable gguf-py
+
 test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release
 
-if [ -z ${GG_BUILD_LOW_PERF} ]; then
-    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-        if [ -z ${GG_BUILD_CUDA} ]; then
-            test $ret -eq 0 && gg_run open_llama_3b_v2
-        else
-            test $ret -eq 0 && gg_run open_llama_7b_v2
-        fi
-        test $ret -eq 0 && gg_run ctest_with_model
+# Run tests with open_llama
+if [[ -z ${GG_BUILD_VRAM_GB+x} ]] || (( GG_BUILD_VRAM_GB >= 8 )); then
+    if [[ ! -z ${GG_BUILD_CUDA+x} ]]; then
+        test $ret -eq 0 && gg_run open_llama_7b_v2
+    else
+        test $ret -eq 0 && gg_run open_llama_3b_v2
     fi
+    test $ret -eq 0 && gg_run ctest_with_model
 fi
 
 exit $ret
diff --git a/requirements.txt b/requirements.txt
index badfec3be..35713223f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,6 @@
-numpy==1.24.4
-sentencepiece==0.1.98
-transformers>=4.34.0
 gguf>=0.1.0
+numpy==1.24.4
+protobuf==4.25.1
+sentencepiece==0.1.98
+torch==2.0.1
+transformers>=4.34.0
diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp
index 926a305da..509f3e8e0 100644
--- a/tests/test-model-load-cancel.cpp
+++ b/tests/test-model-load-cancel.cpp
@@ -2,27 +2,45 @@
 #include <cstdio>
 #include <cstdlib>
+#include <cstddef> // size_t
 
 int main(void) {
-    const auto * model_path = "models/7B/ggml-model-f16.gguf";
-    auto * file = fopen(model_path, "r");
+    const char * models_to_try[] = {
+        // Same default as example/main for local use
+        "./models/7B/ggml-model-f16.gguf",
+        // Models for ./ci/run.sh
+        "./models-mnt/open-llama/3B-v2/ggml-model-q2_k.gguf",
+        "./models-mnt/open-llama/7B-v2/ggml-model-q2_k.gguf",
+    };
 
-    if (file == nullptr) {
-        fprintf(stderr, "no model at '%s' found\n", model_path);
-        return EXIT_FAILURE;
+    // Initialize to nullptr so the "no model found" check below is well-defined
+    const char * chosen_model = nullptr;
+    for (size_t i = 0; i < sizeof(models_to_try) / sizeof(models_to_try[0]); i++) {
+        const auto * model = models_to_try[i];
+
+        auto * file = fopen(model, "r");
+        if (file == nullptr) {
+            continue;
+        }
+
+        chosen_model = model;
+        fprintf(stderr, "using '%s'\n", model);
+        fclose(file);
+        break; // use the first model that exists
     }
 
-    fprintf(stderr, "using '%s'\n", model_path);
-    fclose(file);
+    if (chosen_model == nullptr) {
+        fprintf(stderr, "no model found\n");
+        return EXIT_FAILURE;
+    }
 
     llama_backend_init(false);
     auto params = llama_model_params{};
     params.use_mmap = false;
+    // Returning false from the progress callback cancels the load;
+    // with this threshold the load is aborted almost immediately
     params.progress_callback = [](float progress, void * ctx){
         (void) ctx;
-        return progress > 0.50;
+        return progress > 0.05;
     };
+
     auto * model = llama_load_model_from_file(chosen_model, params);
     llama_backend_free();
+
+    // The cancelled load must not return a model
     return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE;
 }