reset to upstream/master

parent e4382571ca
commit f607e53252

18 changed files with 160 additions and 468 deletions
.github/workflows/build.yml (vendored, 12 changes)

@@ -72,7 +72,7 @@ jobs:
       id: cmake_test
       run: |
         cd build
-        ctest -L main --verbose --timeout 900
+        ctest --verbose --timeout 900

   ubuntu-latest-cmake-sanitizer:
     runs-on: ubuntu-latest
@@ -107,7 +107,7 @@ jobs:
       id: cmake_test
       run: |
         cd build
-        ctest -L main --verbose --timeout 900
+        ctest --verbose --timeout 900

   ubuntu-latest-cmake-mpi:
     runs-on: ubuntu-latest
@@ -141,7 +141,7 @@ jobs:
       id: cmake_test
       run: |
         cd build
-        ctest -L main --verbose
+        ctest --verbose

 # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
 # how to debug it.
@@ -202,7 +202,7 @@ jobs:
       id: cmake_test
       run: |
         cd build
-        ctest -L main --verbose --timeout 900
+        ctest --verbose --timeout 900

   macOS-latest-cmake-ios:
     runs-on: macos-latest
@@ -394,7 +394,7 @@ jobs:
       if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
       run: |
         cd build
-        ctest -L main -C Release --verbose --timeout 900
+        ctest -C Release --verbose --timeout 900

     - name: Test (Intel SDE)
       id: cmake_test_sde
@@ -406,7 +406,7 @@ jobs:
         7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
         $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
         cd build
-        & $sde -future -- ctest -L main -C Release --verbose --timeout 900
+        & $sde -future -- ctest -C Release --verbose --timeout 900

     - name: Determine tag name
       id: tag
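Note: the `-L main` dropped in each job above is ctest's label filter. The reverted branch had tagged the fast tests with a "main" label (via set_property in the tests/CMakeLists.txt change below) so jobs without model files could skip model-dependent tests. A minimal sketch of the distinction, assuming a configured build tree in ./build:

    cd build
    ctest -L main  --verbose --timeout 900   # branch behavior: only tests labeled "main"
    ctest -L model --verbose                 # only model-dependent tests (see ci/run.sh below)
    ctest --verbose --timeout 900            # upstream behavior: every registered test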
.github/workflows/python-check-requirements.yml (vendored, 27 deletions)

@@ -1,27 +0,0 @@
-name: Python check requirements.txt
-
-on:
-  push:
-    paths:
-      - 'check-requirements.sh'
-      - 'convert*.py'
-      - 'requirements*.txt'
-  pull_request:
-    paths:
-      - 'check-requirements.sh'
-      - 'convert*.py'
-      - 'requirements*.txt'
-
-jobs:
-  python-check-requirements:
-    runs-on: ubuntu-latest
-    name: check-requirements
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v3
-      - name: Set up Python environment
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.11"
-      - name: Run check-requirements.sh script
-        run: bash check-requirements.sh nocleanup
.gitignore (vendored, 16 additions)

@@ -86,3 +86,19 @@ examples/jeopardy/results.txt

 poetry.lock
 poetry.toml
+
+# Test binaries
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-sampling
+/tests/test-tokenizer-0-llama
+/tests/test-tokenizer-0-falcon
+/tests/test-tokenizer-1-llama
+/tests/test-tokenizer-1-bpe
+/tests/test-rope
+/tests/test-backend-ops
Makefile (4 changes)

@@ -10,8 +10,6 @@ TEST_TARGETS = \
	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
	tests/test-backend-ops
-# # TODO(crasm): determine how to run tests that depend on openllama model files with make
-# tests/test-model-load-cancel

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -732,5 +730,3 @@ tests/test-c.o: tests/test-c.c llama.h

 tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o $(OBJS)
check-requirements.sh (156 deletions)

@@ -1,156 +0,0 @@
-#!/bin/bash
-#
-# check-requirements.sh checks all requirements files for each top-level
-# convert*.py script.
-#
-# WARNING: This is quite IO intensive, because a fresh venv is set up for every
-# python script.
-#
-# usage:    ./check-requirements.sh [<working_dir>]
-#           ./check-requirements.sh 'nocleanup' [<working_dir>]
-#
-# where:
-#   - <working_dir> is a directory that can be used as the base for
-#     setting up the venvs. Defaults to `/tmp`.
-#   - 'nocleanup' as the first argument will disable automatic cleanup
-#     of the files created by this script.
-#
-# requires:
-#   - bash >= 3.2.57
-#   - shellcheck
-#
-# For each script, it creates a fresh venv, `pip install -r` the
-# requirements, and finally executes the python script with no arguments to
-# check for a `ModuleNotFoundError`.
-#
-
-log() {
-    local level="$1"; shift
-    local format="$1"; shift
-    # shellcheck disable=SC2059
-    >&2 printf "$level: $format\n" "$@"
-}
-
-info() {
-    log 'INFO' "$@"
-}
-
-fatal() {
-    log 'FATAL' "$@"
-    exit 1
-}
-
-cleanup() {
-    if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then
-        info "Removing $workdir"
-        (
-            count=0
-            rm -rfv "$workdir" | while read -r; do
-                if (( count++ > 750 )); then
-                    printf '.'
-                    count=0
-                fi
-            done
-            printf '\n'
-        )&
-        wait $!
-        info "Removed '$workdir'"
-    fi
-}
-
-abort() {
-    cleanup
-    exit 1
-}
-
-if [[ $1 == nocleanup ]]; then
-    shift # discard nocleanup arg
-else
-    trap abort SIGINT SIGTERM SIGQUIT SIGABRT
-    trap cleanup EXIT
-fi
-
-set -eu -o pipefail
-this="$(realpath "$0")"
-readonly this
-cd "$(dirname "$this")"
-
-shellcheck "$this"
-
-workdir=
-if [[ -n ${1+x} ]]; then
-    arg_dir="$(realpath "$1")"
-    if [[ ! ( -d $arg_dir && -w $arg_dir ) ]]; then
-        fatal "$arg_dir is not a valid directory"
-    fi
-    workdir="$(mktemp -d "$arg_dir/check-requirements.XXXX")"
-else
-    workdir="$(mktemp -d "/tmp/check-requirements.XXXX")"
-fi
-readonly workdir
-
-info "Working directory: $workdir"
-
-assert_arg_count() {
-    local argcount="$1"; shift
-    if (( $# != argcount )); then
-        fatal "${FUNCNAME[1]}: incorrect number of args"
-    fi
-}
-
-check_requirements() {
-    assert_arg_count 2 "$@"
-    local venv="$1"
-    local reqs="$2"
-
-    info "$reqs: beginning check"
-    (
-        # shellcheck source=/dev/null
-        source "$venv/bin/activate"
-        pip --disable-pip-version-check install -q -r "$reqs"
-    )
-    info "$reqs: OK"
-}
-
-check_convert_script() {
-    assert_arg_count 1 "$@"
-    local py="$1"
-    local pyname="${py%.py}"
-
-    info "$py: beginning check"
-
-    local reqs="requirements-$pyname.txt"
-    if [[ ! -r "$reqs" ]]; then
-        fatal "$py missing requirements. Expected: $reqs"
-    fi
-
-    local venv="$workdir/$pyname-venv"
-    python3 -m venv "$venv"
-
-    check_requirements "$venv" "$reqs"
-    set +e
-    (
-        # shellcheck source=/dev/null
-        source "$venv/bin/activate"
-        py_err="$workdir/$pyname.out"
-        python "$py" 2> "$py_err"
-        >&2 cat "$py_err"
-        grep -e 'ModuleNotFoundError' "$py_err"
-    )
-    set -e
-    # shellcheck disable=SC2181
-    (( $? )) && fatal "$py: some imports not declared in $reqs"
-    info "$py: imports OK"
-}
-
-# Check requirements.txt
-all_venv="$workdir/all-venv"
-python3 -m venv "$all_venv"
-check_requirements "$all_venv" 'requirements.txt'
-
-check_convert_script 'convert.py'
-for py in convert-*.py; do
-    check_convert_script "$py"
-done
-
-info "Done! No issues found."
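The deleted script documents its own interface in its header comments; for reference, a typical local invocation under those docs would have been (scratch path illustrative):

    # keep the generated venvs for inspection, using a custom working directory
    ./check-requirements.sh nocleanup /some/scratch/dir

    # or let it clean up after itself, with venvs under /tmp
    ./check-requirements.sh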
ci/run.sh (276 changes)

@@ -1,4 +1,4 @@
-#!/bin/bash
+#/bin/bash
 #
 # sample usage:
 #
@@ -11,8 +11,6 @@
 #   GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #

-set -u # Fail on unset variables
-
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
     exit 1
@@ -24,28 +22,16 @@ mkdir -p "$2"
 OUT=$(realpath "$1")
 MNT=$(realpath "$2")

-rm -fv $OUT/*.log
-rm -fv $OUT/*.exit
-rm -fv $OUT/*.md
+rm -v $OUT/*.log
+rm -v $OUT/*.exit
+rm -v $OUT/*.md

 sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-# Read-only array of quantization types for iteration.
-# Use ${quants[@]:1} to skip f16.
-declare -ra quants=( f16 q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k )
-
 ## helpers

-# Print an error message to stderr and exit with an error.
-# usage: die <format-string> <format-args>
-function die {
-    local format="$1"; shift
-    >&2 printf "$format" "$@"
-    exit 1
-}
-
 # download a file if it does not exist or if it is outdated
 function gg_wget {
     local out=$1
@@ -91,16 +77,14 @@ function gg_run {
 function gg_run_ctest_debug {
     cd ${SRC}

-    rm -rf build-ci-debug
-    mkdir build-ci-debug
-    cd build-ci-debug
+    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug

     set -e

     (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-    (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

     set +e
 }
@@ -121,19 +105,17 @@ function gg_sum_ctest_debug {
 function gg_run_ctest_release {
     cd ${SRC}

-    rm -rf build-ci-release
-    mkdir build-ci-release
-    cd build-ci-release
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

     set -e

     (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-    if [[ -z ${GG_BUILD_LOW_PERF+x} ]]; then
-        (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    if [ -z ${GG_BUILD_LOW_PERF} ]; then
+        (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
     else
-        (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
     fi

     set +e
@@ -149,91 +131,84 @@ function gg_sum_ctest_release {
     gg_printf '```\n'
 }

-function gg_run_ctest_with_model {
-    cd ${SRC}
-    cd build-ci-release
-    set -e
-    (time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest_with_model.log
-    set +e
-}
-
-function gg_sum_ctest_with_model {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs ctest with model files\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest_with_model.log)"
-    gg_printf '```\n'
-}
-
 # open_llama_3b_v2

 function gg_run_open_llama_3b_v2 {
-    # We use absolute paths here to not have to track CWD as much
-    local models_mnt="$(realpath "${SRC}/models-mnt")"
-    local path_models="${models_mnt}/open-llama/3B-v2"
-    local path_wiki="${models_mnt}/wikitext"
-    local path_wiki_raw="${path_wiki}/wikitext-2-raw"
-
-    mkdir -p "${path_models}" "${path_wiki}"
-
-    gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
-    gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
-    gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
-    gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
-    gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
-    gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
-
-    gg_wget "${path_wiki}" https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
-    unzip -o "${path_wiki}/wikitext-2-raw-v1.zip" -d "${path_wiki}"
-    head -n 60 "${path_wiki_raw}/wiki.test.raw" > "${path_wiki_raw}/wiki.test-60.raw"
-
-    rm -rf "${SRC}/build-ci-release"
-    mkdir "${SRC}/build-ci-release"
-    cd "${SRC}/build-ci-release"
+    cd ${SRC}
+
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
+
+    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+
+    path_models="../models-mnt/open-llama/3B-v2"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

     set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a "${OUT}/${ci}-cmake.log"
-    (time make -j ) 2>&1 | tee -a "${OUT}/${ci}-make.log"
+    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 "${SRC}/convert.py" "${path_models}"
+    python3 ../convert.py ${path_models}

-    # Get the model path for a quantization
-    # usage: model_for <quant>
-    function model_for {
-        if (( $# != 1 )); then
-            die 'model_for takes a single quantization, such as q8_0'
-        fi
-        echo -n "${path_models}/ggml-model-$1.gguf"
-    }
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

-    wiki_test_60="${path_wiki_raw}/wiki.test-60.raw"
+    wiki_test_60="${path_wiki}/wiki.test-60.raw"

-    # Quantize q8_0 through q6_k
-    for q in "${quants[@]:1}"; do
-        ./bin/quantize "$(model_for f16)" "$(model_for "${q}")" "${q}"
-    done
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/quantize ${model_f16} ${model_q6_k} q6_k

-    # Run basic inference for all quants
-    for q in "${quants[@]}"; do
-        ( time \
-            ./bin/main --model "$(model_for "${q}")" -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is"
-        ) 2>&1 | tee -a "${OUT}/${ci}-tg-${q}.log"
-    done
+    (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    # Run perplexity with wiki_test_60
-    for q in "${quants[@]}"; do
-        ( time \
-            ./bin/perplexity --model "$(model_for $q)" -f "${wiki_test_60}" -c 128 -b 128 --chunks 2
-        ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    done
+    (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    # Run examples/save-load-state with q4_0
-    ( time \
-        ./bin/save-load-state --model "$(model_for q4_0)"
-    ) 2>&1 | tee -a "${OUT}/${ci}-save-load-state.log"
+    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

     function check_ppl {
         qnt="$1"
@@ -248,11 +223,17 @@ function gg_run_open_llama_3b_v2 {
         return 0
     }

-    # Check perplexity results for all quants
-    for q in "${quants[@]}"; do
-        check_ppl "$q" "$(cat "${OUT}/${ci}-tg-f16.log" | grep "^\[1\]")" \
-            | tee -a "${OUT}/${ci}-ppl.log"
-    done
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log

     # lora
     function compare_ppl {
@@ -269,42 +250,32 @@ function gg_run_open_llama_3b_v2 {
         return 0
     }

-    local path_lora="${path_models}/lora"
-    local path_shakespeare="${models_mnt}/shakespeare"
+    path_lora="../models-mnt/open-llama/3B-v2/lora"
+    path_shakespeare="../models-mnt/shakespeare"

-    local shakespeare="${path_shakespeare}/shakespeare.txt"
-    local lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
+    shakespeare="${path_shakespeare}/shakespeare.txt"
+    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"

-    gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
-    gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
-    gg_wget "${path_shakespeare}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt

-    python3 "${SRC}/convert-lora-to-ggml.py" "${path_lora}"
+    python3 ../convert-lora-to-ggml.py ${path_lora}

     # f16
-    (time ./bin/perplexity --model "$(model_for f16)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-f16.log"
-    (time ./bin/perplexity --model "$(model_for f16)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-f16.log"
-    compare_ppl "f16 shakespeare" \
-        "$(cat "${OUT}/${ci}-ppl-shakespeare-f16.log" | grep "^\[1\]")" \
-        "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-f16.log" | grep "^\[1\]")" \
-        | tee -a "${OUT}/${ci}-lora-ppl.log"
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

     # q8_0
-    (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-q8_0.log"
-    (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-lora-q8_0.log"
-    compare_ppl "q8_0 shakespeare" \
-        "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \
-        "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0.log" | grep "^\[1\]")" \
-        | tee -a "${OUT}/${ci}-lora-ppl.log"
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

     # q8_0 + f16 lora-base
-    ( time \
-        ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" --lora-base "$(model_for f16)" -c 128 -b 128 --chunks 2
-    ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log"
-    compare_ppl "q8_0 / f16 base shakespeare" \
-        "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \
-        "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" | grep "^\[1\]")" \
-        | tee -a "${OUT}/${ci}-lora-ppl.log"
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+    compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

     set +e
 }
@@ -514,43 +485,30 @@ function gg_sum_open_llama_7b_v2 {

 ## main

+if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    rm -rf ${SRC}/models-mnt
+
+    mnt_models=${MNT}/models
+    mkdir -p ${mnt_models}
+    ln -sfn ${mnt_models} ${SRC}/models-mnt
+
+    python3 -m pip install -r ${SRC}/requirements.txt
+    python3 -m pip install --editable gguf-py
+fi
+
 ret=0

-# This is necessary to test if a variable is set while `set -u` is enabled.
-# see: https://stackoverflow.com/a/13864829
-# [[ -z ${var+x} ]] evaluates to false if var is set
-# [[ ! -z ${var+x} ]] evaluates to true if var is set
-if [[ ! -z ${GG_BUILD_LOW_PERF+x} ]]; then
-    test "${ret}" -eq 0 && gg_run ctest_debug
-    test "${ret}" -eq 0 && gg_run ctest_release
-    exit "${ret}"
-fi # Otherwise, do extended testing
-
-rm -rf ${SRC}/models-mnt
-
-mnt_models=${MNT}/models
-mkdir -p ${mnt_models}
-ln -sfn ${mnt_models} ${SRC}/models-mnt
-
-# Create a fresh python3 venv and enter it
-rm -rf "${MNT}/venv"
-python3 -m venv "${MNT}/venv"
-source "${MNT}/venv/bin/activate"
-
-pip install --disable-pip-version-check -r ${SRC}/requirements.txt
-pip install --disable-pip-version-check --editable gguf-py
-
 test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release

-# Run tests with open_llama
-if [[ -z ${GG_BUILD_VRAM_GB+x} ]] || (( GG_BUILD_VRAM_GB >= 8 )); then
-    if [[ ! -z ${GG_BUILD_CUDA+x} ]]; then
-        test $ret -eq 0 && gg_run open_llama_7b_v2
-    else
-        test $ret -eq 0 && gg_run open_llama_3b_v2
+if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
+        if [ -z ${GG_BUILD_CUDA} ]; then
+            test $ret -eq 0 && gg_run open_llama_3b_v2
+        else
+            test $ret -eq 0 && gg_run open_llama_7b_v2
+        fi
     fi
 fi
-test $ret -eq 0 && gg_run ctest_with_model

 exit $ret
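One detail of the reverted script worth spelling out: the branch tested environment variables with the `${var+x}` expansion because it ran under `set -u` (also reverted above), where expanding an unset variable aborts the script; upstream's plain `[ -z ${GG_BUILD_LOW_PERF} ]` is only safe because `set -u` is gone too. A small standalone sketch of the idiom from the deleted comments:

    set -u
    # ${GG_BUILD_LOW_PERF+x} expands to "x" if the variable is set and to nothing
    # if it is unset, so the test never expands an unset variable.
    if [[ -z ${GG_BUILD_LOW_PERF+x} ]]; then
        echo "GG_BUILD_LOW_PERF is unset: run the full suite"
    fi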
convert-persimmon-to-gguf.py (1 change; Executable file changed to Normal file)

@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 import torch
 import os
 from pprint import pprint
llama.cpp (46 changes)

@@ -2372,8 +2372,7 @@ struct llama_model_loader {
        }
    }

-    // Returns false if cancelled by progress_callback
-    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
+    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
        size_t size_data = 0;

        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
@@ -2405,9 +2404,7 @@ struct llama_model_loader {
            GGML_ASSERT(cur); // unused tensors should have been caught by load_data already

            if (progress_callback) {
-                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
-                    return false;
-                }
+                progress_callback((float) size_done / size_data, progress_callback_user_data);
            }

            const size_t offs = file_offset(ggml_get_name(cur));
@@ -2469,11 +2466,8 @@ struct llama_model_loader {
        }

        if (progress_callback) {
-            // Even though the model is done loading, we still honor
-            // cancellation since we need to free allocations.
-            return progress_callback(1.0f, progress_callback_user_data);
+            progress_callback(1.0f, progress_callback_user_data);
        }
-        return true;
    }
 };

@@ -3050,8 +3044,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }

-// Returns false if cancelled by progress_callback
-static bool llm_load_tensors(
+static void llm_load_tensors(
    llama_model_loader & ml,
    llama_model & model,
    int n_gpu_layers,
@@ -3729,20 +3722,16 @@ static bool llm_load_tensors(
        model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
    }

-    if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
-        return false;
-    }
+    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);

    model.mapping = std::move(ml.mapping);

    // loading time will be recalculate after the first eval, so
    // we take page faults deferred by mmap() into consideration
    model.t_load_us = ggml_time_us() - model.t_start_us;
-    return true;
 }

-// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
    try {
        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);

@@ -3760,21 +3749,19 @@ static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {

        if (params.vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return 0;
+            return true;
        }

-        if (!llm_load_tensors(
+        llm_load_tensors(
            ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
            params.progress_callback, params.progress_callback_user_data
-        )) {
-            return -2;
-        }
+        );
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
-        return -1;
+        return false;
    }

-    return 0;
+    return true;
 }

 //
@@ -9154,18 +9141,11 @@ struct llama_model * llama_load_model_from_file(
                LLAMA_LOG_INFO("\n");
            }
        }
-        return true;
        };
    }

-    int status = llama_model_load(path_model, *model, params);
-    GGML_ASSERT(status <= 0);
-    if (status < 0) {
-        if (status == -1) {
-            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
-        } else if (status == -2) {
-            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
-        }
+    if (!llama_model_load(path_model, *model, params)) {
+        LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
        delete model;
        return nullptr;
    }
llama.h (6 changes)

@@ -127,7 +127,7 @@ extern "C" {
        bool sorted;
    } llama_token_data_array;

-    typedef bool (*llama_progress_callback)(float progress, void *ctx);
+    typedef void (*llama_progress_callback)(float progress, void *ctx);

    // Input data for llama_decode
    // A llama_batch object can contain input about one or many sequences
@@ -180,9 +180,7 @@ extern "C" {
        int32_t main_gpu; // the GPU that is used for scratch and small tensors
        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

-        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
-        // If the provided progress_callback returns true, model loading continues.
-        // If it returns false, model loading is immediately aborted.
+        // called with a progress value between 0 and 1, pass NULL to disable
        llama_progress_callback progress_callback;

        // context pointer passed to the progress callback
@@ -1 +0,0 @@
--r requirements-convert.txt

@@ -1,2 +0,0 @@
--r requirements-convert.txt
-torch==2.1.1

@@ -1,2 +0,0 @@
--r requirements-convert.txt
-torch==2.1.1

@@ -1,5 +0,0 @@
-numpy==1.24.4
-sentencepiece==0.1.98
-transformers>=4.34.0
-gguf>=0.1.0
-protobuf>=4.21.0

@@ -1,3 +1,3 @@
--r requirements-convert.txt
+-r requirements.txt
 torch==2.1.1
 transformers==4.35.2

@@ -1,11 +1,5 @@
-# These requirements include all dependencies for all top-level python scripts
-# for llama.cpp. Avoid adding packages here directly.
-#
-# Package versions must stay compatible across all top-level python scripts.
-#
-
--r requirements-convert.txt
-
--r requirements-convert-hf-to-gguf.txt
--r requirements-convert-lora-to-ggml.txt
--r requirements-convert-persimmon-to-gguf.txt
+numpy==1.24.4
+sentencepiece==0.1.98
+transformers>=4.34.0
+gguf>=0.1.0
+protobuf>=4.21.0
tests/.gitignore (vendored, 2 deletions)

@@ -1,2 +0,0 @@
-*
-!*.*
tests/CMakeLists.txt

@@ -8,20 +8,14 @@ endfunction()
 function(llama_test_executable name source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
-    set_property(TEST ${name} PROPERTY LABELS "main")
 endfunction()

 function(llama_build_and_test_executable source)
-    llama_build_and_test_executable_with_label(${source} "main")
-endfunction()
-
-function(llama_build_and_test_executable_with_label source label)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_executable(${TEST_TARGET} ${source})
     install(TARGETS ${TEST_TARGET} RUNTIME)
     target_link_libraries(${TEST_TARGET} PRIVATE llama common)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
-    set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label})
 endfunction()

 # llama_build_and_test_executable(test-double-float.cpp) # SLOW
@@ -57,8 +51,6 @@ llama_build_and_test_executable(test-backend-ops.cpp)

 llama_build_and_test_executable(test-rope.cpp)

-llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model")
-
 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)
 add_executable(${TEST_TARGET} test-c.c)
tests/test-model-load-cancel.cpp (46 deletions)

@@ -1,46 +0,0 @@
-#include "llama.h"
-
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-
-int main(void) {
-    const char * models_to_try[] = {
-        // Same default as example/main for local use
-        "./models/7B/ggml-model-f16.gguf",
-        // Models for ./ci/run.sh
-        "./models-mnt/open-llama/3B-v2/ggml-model-q2_k.gguf",
-        "./models-mnt/open-llama/7B-v2/ggml-model-q2_k.gguf",
-    };
-
-    const char * chosen_model;
-    for (size_t i = 0; i < sizeof(models_to_try) / sizeof(models_to_try[0]); i++) {
-        const auto * model = models_to_try[i];
-
-        auto * file = fopen(model, "r");
-        if (file == nullptr) {
-            continue;
-        }
-
-        chosen_model = model;
-        fprintf(stderr, "using '%s'\n", model);
-        fclose(file);
-    }
-
-    if (chosen_model == nullptr) {
-        fprintf(stderr, "no model found\n");
-        return EXIT_FAILURE;
-    }
-
-    llama_backend_init(false);
-    auto params = llama_model_params{};
-    params.use_mmap = false;
-    params.progress_callback = [](float progress, void * ctx){
-        (void) ctx;
-        return progress > 0.05;
-    };
-
-    auto * model = llama_load_model_from_file(chosen_model, params);
-    llama_backend_free();
-    return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE;
-}