From f607e5325214a2c10f8db772061f521f4e7ac7ee Mon Sep 17 00:00:00 2001 From: crasm Date: Fri, 22 Dec 2023 00:58:32 -0500 Subject: [PATCH] reset to upstream/master --- .github/workflows/build.yml | 12 +- .../workflows/python-check-requirements.yml | 27 -- .gitignore | 16 + Makefile | 4 - check-requirements.sh | 156 ---------- ci/run.sh | 276 ++++++++---------- convert-persimmon-to-gguf.py | 1 - llama.cpp | 46 +-- llama.h | 6 +- requirements-convert-llama-ggml-to-gguf.txt | 1 - requirements-convert-lora-to-ggml.txt | 2 - requirements-convert-persimmon-to-gguf.txt | 2 - requirements-convert.txt | 5 - ...to-gguf.txt => requirements-hf-to-gguf.txt | 2 +- requirements.txt | 16 +- tests/.gitignore | 2 - tests/CMakeLists.txt | 8 - tests/test-model-load-cancel.cpp | 46 --- 18 files changed, 160 insertions(+), 468 deletions(-) delete mode 100644 .github/workflows/python-check-requirements.yml delete mode 100755 check-requirements.sh mode change 100755 => 100644 convert-persimmon-to-gguf.py delete mode 100644 requirements-convert-llama-ggml-to-gguf.txt delete mode 100644 requirements-convert-lora-to-ggml.txt delete mode 100644 requirements-convert-persimmon-to-gguf.txt delete mode 100644 requirements-convert.txt rename requirements-convert-hf-to-gguf.txt => requirements-hf-to-gguf.txt (54%) delete mode 100644 tests/.gitignore delete mode 100644 tests/test-model-load-cancel.cpp diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1d8741940..a5090e398 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -72,7 +72,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest --verbose --timeout 900 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -107,7 +107,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest --verbose --timeout 900 ubuntu-latest-cmake-mpi: runs-on: ubuntu-latest @@ -141,7 +141,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose + ctest --verbose # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. 
@@ -202,7 +202,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest --verbose --timeout 900 macOS-latest-cmake-ios: runs-on: macos-latest @@ -394,7 +394,7 @@ jobs: if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512 run: | cd build - ctest -L main -C Release --verbose --timeout 900 + ctest -C Release --verbose --timeout 900 - name: Test (Intel SDE) id: cmake_test_sde @@ -406,7 +406,7 @@ jobs: 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) cd build - & $sde -future -- ctest -L main -C Release --verbose --timeout 900 + & $sde -future -- ctest -C Release --verbose --timeout 900 - name: Determine tag name id: tag diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml deleted file mode 100644 index cc97ee810..000000000 --- a/.github/workflows/python-check-requirements.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: Python check requirements.txt - -on: - push: - paths: - - 'check-requirements.sh' - - 'convert*.py' - - 'requirements*.txt' - pull_request: - paths: - - 'check-requirements.sh' - - 'convert*.py' - - 'requirements*.txt' - -jobs: - python-check-requirements: - runs-on: ubuntu-latest - name: check-requirements - steps: - - name: Check out source repository - uses: actions/checkout@v3 - - name: Set up Python environment - uses: actions/setup-python@v4 - with: - python-version: "3.11" - - name: Run check-requirements.sh script - run: bash check-requirements.sh nocleanup diff --git a/.gitignore b/.gitignore index 7b1a9f9e3..76b3d2861 100644 --- a/.gitignore +++ b/.gitignore @@ -86,3 +86,19 @@ examples/jeopardy/results.txt poetry.lock poetry.toml + +# Test binaries +/tests/test-grammar-parser +/tests/test-llama-grammar +/tests/test-double-float +/tests/test-grad0 +/tests/test-opt +/tests/test-quantize-fns +/tests/test-quantize-perf +/tests/test-sampling +/tests/test-tokenizer-0-llama +/tests/test-tokenizer-0-falcon +/tests/test-tokenizer-1-llama +/tests/test-tokenizer-1-bpe +/tests/test-rope +/tests/test-backend-ops diff --git a/Makefile b/Makefile index b5ce2e2da..68df7702a 100644 --- a/Makefile +++ b/Makefile @@ -10,8 +10,6 @@ TEST_TARGETS = \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ tests/test-backend-ops -# # TODO(crasm): determine how to run tests that depend on openllama model files with make - # tests/test-model-load-cancel # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -732,5 +730,3 @@ tests/test-c.o: tests/test-c.c llama.h tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) - -tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o $(OBJS) diff --git a/check-requirements.sh b/check-requirements.sh deleted file mode 100755 index ac6beb604..000000000 --- a/check-requirements.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/bash -# -# check-requirements.sh checks all requirements files for each top-level -# convert*.py script. -# -# WARNING: This is quite IO intensive, because a fresh venv is set up for every -# python script. 
-# -# usage: ./check-requirements.sh [<working_directory>] -# ./check-requirements.sh 'nocleanup' [<working_directory>] -# -# where: -# - <working_directory> is a directory that can be used as the base for -# setting up the venvs. Defaults to `/tmp`. -# - 'nocleanup' as the first argument will disable automatic cleanup -# of the files created by this script. -# -# requires: -# - bash >= 3.2.57 -# - shellcheck -# -# For each script, it creates a fresh venv, `pip install -r` the -# requirements, and finally executes the python script with no arguments to -# check for a `ModuleNotFoundError`. -# - -log() { - local level="$1"; shift - local format="$1"; shift - # shellcheck disable=SC2059 - >&2 printf "$level: $format\n" "$@" -} - -info() { - log 'INFO' "$@" -} - -fatal() { - log 'FATAL' "$@" - exit 1 -} - -cleanup() { - if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then - info "Removing $workdir" - ( - count=0 - rm -rfv "$workdir" | while read -r; do - if (( count++ > 750 )); then - printf '.' - count=0 - fi - done - printf '\n' - )& - wait $! - info "Removed '$workdir'" - fi -} - -abort() { - cleanup - exit 1 -} - -if [[ $1 == nocleanup ]]; then - shift # discard nocleanup arg -else - trap abort SIGINT SIGTERM SIGQUIT SIGABRT - trap cleanup EXIT -fi - -set -eu -o pipefail -this="$(realpath "$0")" -readonly this -cd "$(dirname "$this")" - -shellcheck "$this" - -workdir= -if [[ -n ${1+x} ]]; then - arg_dir="$(realpath "$1")" - if [[ ! ( -d $arg_dir && -w $arg_dir ) ]]; then - fatal "$arg_dir is not a valid directory" - fi - workdir="$(mktemp -d "$arg_dir/check-requirements.XXXX")" -else - workdir="$(mktemp -d "/tmp/check-requirements.XXXX")" -fi -readonly workdir - -info "Working directory: $workdir" - -assert_arg_count() { - local argcount="$1"; shift - if (( $# != argcount )); then - fatal "${FUNCNAME[1]}: incorrect number of args" - fi -} - -check_requirements() { - assert_arg_count 2 "$@" - local venv="$1" - local reqs="$2" - - info "$reqs: beginning check" - ( - # shellcheck source=/dev/null - source "$venv/bin/activate" - pip --disable-pip-version-check install -q -r "$reqs" - ) - info "$reqs: OK" -} - -check_convert_script() { - assert_arg_count 1 "$@" - local py="$1" - local pyname="${py%.py}" - - info "$py: beginning check" - - local reqs="requirements-$pyname.txt" - if [[ ! -r "$reqs" ]]; then - fatal "$py missing requirements. Expected: $reqs" - fi - - local venv="$workdir/$pyname-venv" - python3 -m venv "$venv" - - check_requirements "$venv" "$reqs" - set +e - ( - # shellcheck source=/dev/null - source "$venv/bin/activate" - py_err="$workdir/$pyname.out" - python "$py" 2> "$py_err" - >&2 cat "$py_err" - grep -e 'ModuleNotFoundError' "$py_err" - ) - set -e - # shellcheck disable=SC2181 - (( $? )) && fatal "$py: some imports not declared in $reqs" - info "$py: imports OK" -} - -# Check requirements.txt -all_venv="$workdir/all-venv" -python3 -m venv "$all_venv" -check_requirements "$all_venv" 'requirements.txt' - -check_convert_script 'convert.py' -for py in convert-*.py; do - check_convert_script "$py" -done - -info "Done! No issues found."
diff --git a/ci/run.sh b/ci/run.sh index 9c2b4b3cf..2e3343831 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#/bin/bash # # sample usage: # @@ -11,8 +11,6 @@ # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # -set -u # Fail on unset variables - if [ -z "$2" ]; then echo "usage: $0 <output-dir> <mnt-dir>" exit 1 @@ -24,28 +22,16 @@ mkdir -p "$2" OUT=$(realpath "$1") MNT=$(realpath "$2") -rm -fv $OUT/*.log -rm -fv $OUT/*.exit -rm -fv $OUT/*.md +rm -v $OUT/*.log +rm -v $OUT/*.exit +rm -v $OUT/*.md sd=`dirname $0` cd $sd/../ SRC=`pwd` -# Read-only array of quantization types for iteration. -# Use ${quants[@]:1} to skip f16. -declare -ra quants=( f16 q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k ) - ## helpers -# Print an error message to stderr and exit with an error. -# usage: die <format> [<args>...] -function die { - local format="$1"; shift - >&2 printf "$format" "$@" - exit 1 -} - # download a file if it does not exist or if it is outdated function gg_wget { local out=$1 @@ -91,16 +77,14 @@ function gg_run { function gg_run_ctest_debug { cd ${SRC} - rm -rf build-ci-debug - mkdir build-ci-debug - cd build-ci-debug + rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug set -e (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e } @@ -121,19 +105,17 @@ function gg_sum_ctest_debug { function gg_run_ctest_release { cd ${SRC} - rm -rf build-ci-release - mkdir build-ci-release - cd build-ci-release + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release set -e (time cmake -DCMAKE_BUILD_TYPE=Release ..
) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - if [[ -z ${GG_BUILD_LOW_PERF+x} ]]; then - (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log + if [ -z ${GG_BUILD_LOW_PERF} ]; then + (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log else - (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log fi set +e @@ -149,91 +131,84 @@ function gg_sum_ctest_release { gg_printf '```\n' } -function gg_run_ctest_with_model { - cd ${SRC} - cd build-ci-release - set -e - (time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest_with_model.log - set +e -} - -function gg_sum_ctest_with_model { - gg_printf '### %s\n\n' "${ci}" - - gg_printf 'Runs ctest with model files\n' - gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" - gg_printf '```\n' - gg_printf '%s\n' "$(cat $OUT/${ci}-ctest_with_model.log)" - gg_printf '```\n' -} - # open_llama_3b_v2 function gg_run_open_llama_3b_v2 { - # We use absolute paths here to not have to track CWD as much - local models_mnt="$(realpath "${SRC}/models-mnt")" - local path_models="${models_mnt}/open-llama/3B-v2" - local path_wiki="${models_mnt}/wikitext" - local path_wiki_raw="${path_wiki}/wikitext-2-raw" + cd ${SRC} - mkdir -p "${path_models}" "${path_wiki}" + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json + gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip + unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ + head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw - gg_wget "${path_wiki}" https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip - unzip -o "${path_wiki}/wikitext-2-raw-v1.zip" -d "${path_wiki}" - head -n 60 "${path_wiki_raw}/wiki.test.raw" > "${path_wiki_raw}/wiki.test-60.raw" + path_models="../models-mnt/open-llama/3B-v2" + path_wiki="../models-mnt/wikitext/wikitext-2-raw" - rm -rf "${SRC}/build-ci-release" - mkdir 
"${SRC}/build-ci-release" - cd "${SRC}/build-ci-release" + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release set -e - (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a "${OUT}/${ci}-cmake.log" - (time make -j ) 2>&1 | tee -a "${OUT}/${ci}-make.log" + (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - python3 "${SRC}/convert.py" "${path_models}" + python3 ../convert.py ${path_models} - # Get the model path for a quantization - # usage: model_for - function model_for { - if (( $# != 1 )); then - die 'model_for takes a single quantization, such as q8_0' - fi - echo -n "${path_models}/ggml-model-$1.gguf" - } + model_f16="${path_models}/ggml-model-f16.gguf" + model_q8_0="${path_models}/ggml-model-q8_0.gguf" + model_q4_0="${path_models}/ggml-model-q4_0.gguf" + model_q4_1="${path_models}/ggml-model-q4_1.gguf" + model_q5_0="${path_models}/ggml-model-q5_0.gguf" + model_q5_1="${path_models}/ggml-model-q5_1.gguf" + model_q2_k="${path_models}/ggml-model-q2_k.gguf" + model_q3_k="${path_models}/ggml-model-q3_k.gguf" + model_q4_k="${path_models}/ggml-model-q4_k.gguf" + model_q5_k="${path_models}/ggml-model-q5_k.gguf" + model_q6_k="${path_models}/ggml-model-q6_k.gguf" - wiki_test_60="${path_wiki_raw}/wiki.test-60.raw" + wiki_test_60="${path_wiki}/wiki.test-60.raw" - # Quantize q8_0 through q6_k - for q in "${quants[@]:1}"; do - ./bin/quantize "$(model_for f16)" "$(model_for "${q}")" "${q}" - done + ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/quantize ${model_f16} ${model_q2_k} q2_k + ./bin/quantize ${model_f16} ${model_q3_k} q3_k + ./bin/quantize ${model_f16} ${model_q4_k} q4_k + ./bin/quantize ${model_f16} ${model_q5_k} q5_k + ./bin/quantize ${model_f16} ${model_q6_k} q6_k - # Run basic inference for all quants - for q in "${quants[@]}"; do - ( time \ - ./bin/main --model "$(model_for "${q}")" -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" - ) 2>&1 | tee -a "${OUT}/${ci}-tg-${q}.log" - done + (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | 
tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - # Run perplexity with wiki_test_60 - for q in "${quants[@]}"; do - ( time \ - ./bin/perplexity --model "$(model_for $q)" -f "${wiki_test_60}" -c 128 -b 128 --chunks 2 - ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - done + (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - # Run examples/save-load-state with q4_0 - ( time \ - ./bin/save-load-state --model "$(model_for q4_0)" - ) 2>&1 | tee -a "${OUT}/${ci}-save-load-state.log" + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -248,11 +223,17 @@ function gg_run_open_llama_3b_v2 { return 0 } - # Check perplexity results for all quants - for q in "${quants[@]}"; do - check_ppl "$q" "$(cat "${OUT}/${ci}-tg-f16.log" | grep "^\[1\]")" \ - | tee -a "${OUT}/${ci}-ppl.log" - done + check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # lora 
function compare_ppl { @@ -269,42 +250,32 @@ function gg_run_open_llama_3b_v2 { return 0 } - local path_lora="${path_models}/lora" - local path_shakespeare="${models_mnt}/shakespeare" + path_lora="../models-mnt/open-llama/3B-v2/lora" + path_shakespeare="../models-mnt/shakespeare" - local shakespeare="${path_shakespeare}/shakespeare.txt" - local lora_shakespeare="${path_lora}/ggml-adapter-model.bin" + shakespeare="${path_shakespeare}/shakespeare.txt" + lora_shakespeare="${path_lora}/ggml-adapter-model.bin" - gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json - gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin - gg_wget "${path_shakespeare}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin + gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt - python3 "${SRC}/convert-lora-to-ggml.py" "${path_lora}" + python3 ../convert-lora-to-ggml.py ${path_lora} # f16 - (time ./bin/perplexity --model "$(model_for f16)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-f16.log" - (time ./bin/perplexity --model "$(model_for f16)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-f16.log" - compare_ppl "f16 shakespeare" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-f16.log" | grep "^\[1\]")" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-f16.log" | grep "^\[1\]")" \ - | tee -a "${OUT}/${ci}-lora-ppl.log" + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log + compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log # q8_0 - (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-q8_0.log" - (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-lora-q8_0.log" - compare_ppl "q8_0 shakespeare" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0.log" | grep "^\[1\]")" \ - | tee -a "${OUT}/${ci}-lora-ppl.log" + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log + compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log # q8_0 + f16 lora-base - ( time \ - ./bin/perplexity --model "$(model_for 
q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" --lora-base "$(model_for f16)" -c 128 -b 128 --chunks 2 - ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" - compare_ppl "q8_0 / f16 base shakespeare" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" | grep "^\[1\]")" \ - | tee -a "${OUT}/${ci}-lora-ppl.log" + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log + compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + set +e } @@ -514,43 +485,30 @@ function gg_sum_open_llama_7b_v2 { ## main +if [ -z ${GG_BUILD_LOW_PERF} ]; then + rm -rf ${SRC}/models-mnt + + mnt_models=${MNT}/models + mkdir -p ${mnt_models} + ln -sfn ${mnt_models} ${SRC}/models-mnt + + python3 -m pip install -r ${SRC}/requirements.txt + python3 -m pip install --editable gguf-py +fi + ret=0 -# This is necessary to test if a variable is set while `set -u` is enabled. -# see: https://stackoverflow.com/a/13864829 -# [[ -z ${var+x} ]] evaluates to false if var is set -# [[ ! -z ${var+x} ]] evaluates to true if var is set -if [[ ! -z ${GG_BUILD_LOW_PERF+x} ]]; then - test "${ret}" -eq 0 && gg_run ctest_debug - test "${ret}" -eq 0 && gg_run ctest_release - exit "${ret}" -fi # Otherwise, do extended testing - -rm -rf ${SRC}/models-mnt - -mnt_models=${MNT}/models -mkdir -p ${mnt_models} -ln -sfn ${mnt_models} ${SRC}/models-mnt - -# Create a fresh python3 venv and enter it -rm -rf "${MNT}/venv" -python3 -m venv "${MNT}/venv" -source "${MNT}/venv/bin/activate" - -pip install --disable-pip-version-check -r ${SRC}/requirements.txt -pip install --disable-pip-version-check --editable gguf-py - test $ret -eq 0 && gg_run ctest_debug test $ret -eq 0 && gg_run ctest_release -# Run tests with open_llama -if [[ -z ${GG_BUILD_VRAM_GB+x} ]] || (( GG_BUILD_VRAM_GB >= 8 )); then - if [[ ! 
-z ${GG_BUILD_CUDA+x} ]]; then - test $ret -eq 0 && gg_run open_llama_7b_v2 - else - test $ret -eq 0 && gg_run open_llama_3b_v2 +if [ -z ${GG_BUILD_LOW_PERF} ]; then + if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then + if [ -z ${GG_BUILD_CUDA} ]; then + test $ret -eq 0 && gg_run open_llama_3b_v2 + else + test $ret -eq 0 && gg_run open_llama_7b_v2 + fi fi - test $ret -eq 0 && gg_run ctest_with_model fi exit $ret diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py old mode 100755 new mode 100644 index 1ba5864dc..206b7d5ff --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 import torch import os from pprint import pprint diff --git a/llama.cpp b/llama.cpp index cb0546c95..d6c192441 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2372,8 +2372,7 @@ struct llama_model_loader { } } - // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { size_t size_data = 0; for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { @@ -2405,9 +2404,7 @@ struct llama_model_loader { GGML_ASSERT(cur); // unused tensors should have been caught by load_data already if (progress_callback) { - if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { - return false; - } + progress_callback((float) size_done / size_data, progress_callback_user_data); } const size_t offs = file_offset(ggml_get_name(cur)); @@ -2469,11 +2466,8 @@ struct llama_model_loader { } if (progress_callback) { - // Even though the model is done loading, we still honor - // cancellation since we need to free allocations. - return progress_callback(1.0f, progress_callback_user_data); + progress_callback(1.0f, progress_callback_user_data); } - return true; } }; @@ -3050,8 +3044,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } -// Returns false if cancelled by progress_callback -static bool llm_load_tensors( +static void llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, @@ -3729,20 +3722,16 @@ static bool llm_load_tensors( model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) { - return false; - } + ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? 
&model.mlock_mmap : NULL); model.mapping = std::move(ml.mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = ggml_time_us() - model.t_start_us; - return true; } -// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { +static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); @@ -3760,21 +3749,19 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return 0; + return true; } - if (!llm_load_tensors( + llm_load_tensors( ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data - )) { - return -2; - } + ); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); - return -1; + return false; } - return 0; + return true; } // @@ -9154,18 +9141,11 @@ struct llama_model * llama_load_model_from_file( LLAMA_LOG_INFO("\n"); } } - return true; }; } - int status = llama_model_load(path_model, *model, params); - GGML_ASSERT(status <= 0); - if (status < 0) { - if (status == -1) { - LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); - } else if (status == -2) { - LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); - } + if (!llama_model_load(path_model, *model, params)) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; return nullptr; } diff --git a/llama.h b/llama.h index af76bae2d..0be4b1337 100644 --- a/llama.h +++ b/llama.h @@ -127,7 +127,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef bool (*llama_progress_callback)(float progress, void *ctx); + typedef void (*llama_progress_callback)(float progress, void *ctx); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences @@ -180,9 +180,7 @@ extern "C" { int32_t main_gpu; // the GPU that is used for scratch and small tensors const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) - // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. - // If the provided progress_callback returns true, model loading continues. - // If it returns false, model loading is immediately aborted. 
+ // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; // context pointer passed to the progress callback diff --git a/requirements-convert-llama-ggml-to-gguf.txt b/requirements-convert-llama-ggml-to-gguf.txt deleted file mode 100644 index 8a5377762..000000000 --- a/requirements-convert-llama-ggml-to-gguf.txt +++ /dev/null @@ -1 +0,0 @@ --r requirements-convert.txt diff --git a/requirements-convert-lora-to-ggml.txt b/requirements-convert-lora-to-ggml.txt deleted file mode 100644 index 30827c896..000000000 --- a/requirements-convert-lora-to-ggml.txt +++ /dev/null @@ -1,2 +0,0 @@ --r requirements-convert.txt -torch==2.1.1 diff --git a/requirements-convert-persimmon-to-gguf.txt b/requirements-convert-persimmon-to-gguf.txt deleted file mode 100644 index 30827c896..000000000 --- a/requirements-convert-persimmon-to-gguf.txt +++ /dev/null @@ -1,2 +0,0 @@ --r requirements-convert.txt -torch==2.1.1 diff --git a/requirements-convert.txt b/requirements-convert.txt deleted file mode 100644 index 1a1162566..000000000 --- a/requirements-convert.txt +++ /dev/null @@ -1,5 +0,0 @@ -numpy==1.24.4 -sentencepiece==0.1.98 -transformers>=4.34.0 -gguf>=0.1.0 -protobuf>=4.21.0 diff --git a/requirements-convert-hf-to-gguf.txt b/requirements-hf-to-gguf.txt similarity index 54% rename from requirements-convert-hf-to-gguf.txt rename to requirements-hf-to-gguf.txt index 4d00b1966..f4600539e 100644 --- a/requirements-convert-hf-to-gguf.txt +++ b/requirements-hf-to-gguf.txt @@ -1,3 +1,3 @@ --r requirements-convert.txt +-r requirements.txt torch==2.1.1 transformers==4.35.2 diff --git a/requirements.txt b/requirements.txt index da4f3f9a8..1a1162566 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,5 @@ -# These requirements include all dependencies for all top-level python scripts -# for llama.cpp. Avoid adding packages here directly. -# -# Package versions must stay compatible across all top-level python scripts. 
-# - --r requirements-convert.txt - --r requirements-convert-hf-to-gguf.txt --r requirements-convert-lora-to-ggml.txt --r requirements-convert-persimmon-to-gguf.txt +numpy==1.24.4 +sentencepiece==0.1.98 +transformers>=4.34.0 +gguf>=0.1.0 +protobuf>=4.21.0 diff --git a/tests/.gitignore b/tests/.gitignore deleted file mode 100644 index 59be43b99..000000000 --- a/tests/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!*.* diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 81a02dae9..e42237c7a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,20 +8,14 @@ endfunction() function(llama_test_executable name source) get_filename_component(TEST_TARGET ${source} NAME_WE) add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN}) - set_property(TEST ${name} PROPERTY LABELS "main") endfunction() function(llama_build_and_test_executable source) - llama_build_and_test_executable_with_label(${source} "main") -endfunction() - -function(llama_build_and_test_executable_with_label source label) get_filename_component(TEST_TARGET ${source} NAME_WE) add_executable(${TEST_TARGET} ${source}) install(TARGETS ${TEST_TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE llama common) add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN}) - set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label}) endfunction() # llama_build_and_test_executable(test-double-float.cpp) # SLOW @@ -57,8 +51,6 @@ llama_build_and_test_executable(test-backend-ops.cpp) llama_build_and_test_executable(test-rope.cpp) -llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model") - # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp deleted file mode 100644 index 509f3e8e0..000000000 --- a/tests/test-model-load-cancel.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "llama.h" - -#include <cstddef> -#include <cstdio> -#include <cstdlib> - -int main(void) { - const char * models_to_try[] = { - // Same default as example/main for local use - "./models/7B/ggml-model-f16.gguf", - // Models for ./ci/run.sh - "./models-mnt/open-llama/3B-v2/ggml-model-q2_k.gguf", - "./models-mnt/open-llama/7B-v2/ggml-model-q2_k.gguf", - }; - - const char * chosen_model; - for (size_t i = 0; i < sizeof(models_to_try) / sizeof(models_to_try[0]); i++) { - const auto * model = models_to_try[i]; - - auto * file = fopen(model, "r"); - if (file == nullptr) { - continue; - } - - chosen_model = model; - fprintf(stderr, "using '%s'\n", model); - fclose(file); - } - - if (chosen_model == nullptr) { - fprintf(stderr, "no model found\n"); - return EXIT_FAILURE; - } - - llama_backend_init(false); - auto params = llama_model_params{}; - params.use_mmap = false; - params.progress_callback = [](float progress, void * ctx){ - (void) ctx; - return progress > 0.05; - }; - - auto * model = llama_load_model_from_file(chosen_model, params); - llama_backend_free(); - return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE; -}
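
Note: for reference only (not part of the patch), below is a minimal sketch of how a caller could use the boolean progress callback that this revert removes. It assumes the pre-revert API shown in the "-" lines of the llama.h/llama.cpp hunks above, where a callback returning false aborts the load and llama_load_model_from_file() then returns NULL. llama_model_default_params() is the stock llama.h helper (not shown in this patch), and the model path is only a placeholder taken from the deleted test. After the revert the callback returns void, so loading can no longer be cancelled this way.

    // Sketch against the pre-revert llama.h: llama_progress_callback returns bool,
    // and returning false cancels the load in progress.
    #include "llama.h"

    #include <cstdio>

    int main(void) {
        llama_backend_init(false);

        llama_model_params params = llama_model_default_params();
        params.use_mmap = false;
        params.progress_callback = [](float progress, void * ctx) {
            (void) ctx;
            fprintf(stderr, "loading: %3.0f%%\n", progress * 100.0f);
            return progress <= 0.50f; // past 50%, return false -> load is aborted
        };

        // placeholder path for illustration
        llama_model * model = llama_load_model_from_file("./models/7B/ggml-model-f16.gguf", params);
        if (model == nullptr) {
            fprintf(stderr, "model load cancelled (or failed)\n");
        }

        llama_backend_free();
        return 0;
    }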