Merge branch 'master' into testtarget-removal

This commit is contained in:
Georgi Gerganov 2023-07-21 13:08:59 +03:00 committed by GitHub
commit 647cef8bbd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
36 changed files with 1009 additions and 625 deletions

5
.gitignore vendored
View file

@ -16,6 +16,8 @@ build/
build-em/ build-em/
build-debug/ build-debug/
build-release/ build-release/
build-ci-debug/
build-ci-release/
build-static/ build-static/
build-cublas/ build-cublas/
build-opencl/ build-opencl/
@ -25,9 +27,10 @@ build-no-accel/
build-sanitize-addr/ build-sanitize-addr/
build-sanitize-thread/ build-sanitize-thread/
out/ out/
tmp/
models/* models/*
*.bin models-mnt
/main /main
/quantize /quantize

View file

@ -186,7 +186,16 @@ if (LLAMA_BLAS)
pkg_check_modules(DepBLAS REQUIRED flexiblas_api) pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel") elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
# all Intel* libraries share the same include path # all Intel* libraries share the same include path
pkg_check_modules(DepBLAS REQUIRED mkl-sdl) pkg_check_modules(DepBLAS mkl-sdl)
if (NOT DepBLAS)
if (BUILD_SHARED_LIBS)
set(LINK_METHOD dynamic)
else()
set(LINK_METHOD static)
endif()
string(REGEX REPLACE ".*_" "" DATA_TYPE_MODEL ${LLAMA_BLAS_VENDOR})
pkg_check_modules(DepBLAS REQUIRED mkl-${LINK_METHOD}-${DATA_TYPE_MODEL}-iomp)
endif()
elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC") elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
# this doesn't provide pkg-config # this doesn't provide pkg-config
# suggest to assign BLAS_INCLUDE_DIRS on your own # suggest to assign BLAS_INCLUDE_DIRS on your own
@ -512,6 +521,7 @@ if (BUILD_SHARED_LIBS)
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>) add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
install(TARGETS ggml_shared LIBRARY)
endif() endif()
add_library(llama add_library(llama
@ -533,8 +543,32 @@ if (BUILD_SHARED_LIBS)
if (LLAMA_METAL) if (LLAMA_METAL)
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
endif() endif()
install(TARGETS llama LIBRARY)
endif() endif()
include(GNUInstallDirs)
install(
FILES convert.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
install(
FILES convert-lora-to-ggml.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
# #
# programs, examples and tests # programs, examples and tests

View file

@ -1,5 +1,5 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
# Binaries only useful for tests # Binaries only useful for tests
TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0 TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@ -93,6 +93,28 @@ ifeq ($(UNAME_S),Haiku)
CXXFLAGS += -pthread CXXFLAGS += -pthread
endif endif
# detect Windows
ifneq ($(findstring _NT,$(UNAME_S)),)
_WIN32 := 1
endif
# library name prefix
ifneq ($(_WIN32),1)
LIB_PRE := lib
endif
# Dynamic Shared Object extension
ifneq ($(_WIN32),1)
DSO_EXT := .so
else
DSO_EXT := .dll
endif
# Windows Sockets 2 (Winsock) for network-capable apps
ifeq ($(_WIN32),1)
LWINSOCK2 := -lws2_32
endif
ifdef LLAMA_GPROF ifdef LLAMA_GPROF
CFLAGS += -pg CFLAGS += -pg
CXXFLAGS += -pg CXXFLAGS += -pg
@ -297,7 +319,7 @@ libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean: clean:
rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS) rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
# #
# Examples # Examples
@ -328,14 +350,14 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

View file

@ -360,7 +360,7 @@ Building the program with BLAS support may lead to some performance improvements
```bash ```bash
mkdir build mkdir build
cd build cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_lp64 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . --config Release cmake --build . --config Release
``` ```
@ -640,7 +640,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t
```bash ```bash
# run the verification script # run the verification script
python3 .\scripts\verify-checksum-models.py ./scripts/verify-checksum-models.py
``` ```
- On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory: - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:

20
ci/README.md Normal file
View file

@ -0,0 +1,20 @@
# CI
In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
https://github.com/ggml-org/ci
It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances.
Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
Only the branches of this repo are monitored for this keyword.
It is a good practice, before publishing changes to execute the full CI locally on your machine:
```bash
mkdir tmp
bash ./ci/run.sh ./tmp/results ./tmp/mnt
```

262
ci/run.sh Normal file
View file

@ -0,0 +1,262 @@
#/bin/bash
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
exit 1
fi
mkdir -p "$1"
mkdir -p "$2"
OUT=$(realpath "$1")
MNT=$(realpath "$2")
rm -v $OUT/*.log
rm -v $OUT/*.exit
rm -v $OUT/*.md
sd=`dirname $0`
cd $sd/../
SRC=`pwd`
## helpers
# download a file if it does not exist or if it is outdated
function gg_wget {
local out=$1
local url=$2
local cwd=`pwd`
mkdir -p $out
cd $out
# should not re-download if file is the same
wget -nv -N $url
cd $cwd
}
function gg_printf {
printf -- "$@" >> $OUT/README.md
}
function gg_run {
ci=$1
set -o pipefail
set -x
gg_run_$ci | tee $OUT/$ci.log
cur=$?
echo "$cur" > $OUT/$ci.exit
set +x
set +o pipefail
gg_sum_$ci
ret=$((ret | cur))
}
## ci
# ctest_debug
function gg_run_ctest_debug {
cd ${SRC}
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
set -e
(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
}
function gg_sum_ctest_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
gg_printf '\n'
}
# ctest_release
function gg_run_ctest_release {
cd ${SRC}
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z $GG_BUILD_LOW_PERF ]; then
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
else
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
fi
set +e
}
function gg_sum_ctest_release {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest in release mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
}
# open_llama_3b_v2
function gg_run_open_llama_3b_v2 {
cd ${SRC}
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
path_models="../models-mnt/open-llama/3B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.bin"
model_q8_0="${path_models}/ggml-model-q8_0.bin"
model_q4_0="${path_models}/ggml-model-q4_0.bin"
model_q4_1="${path_models}/ggml-model-q4_1.bin"
model_q5_0="${path_models}/ggml-model-q5_0.bin"
model_q5_1="${path_models}/ggml-model-q5_1.bin"
model_q3_k="${path_models}/ggml-model-q3_k.bin"
model_q4_k="${path_models}/ggml-model-q4_k.bin"
model_q5_k="${path_models}/ggml-model-q5_k.bin"
model_q6_k="${path_models}/ggml-model-q6_k.bin"
wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
set +e
}
function gg_sum_open_llama_3b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 3B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
}
## main
if [ -z $GG_BUILD_LOW_PERF ]; then
rm -rf ${SRC}/models-mnt
mnt_models=$(realpath ${MNT}/models)
mkdir -p ${mnt_models}
ln -sfn ${mnt_models} ${SRC}/models-mnt
python3 -m pip install -r ${SRC}/requirements.txt
fi
ret=0
#test $ret -eq 0 && gg_run ctest_debug
#test $ret -eq 0 && gg_run ctest_release
if [ -z $GG_BUILD_LOW_PERF ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2
fi
exit $ret

1
convert-lora-to-ggml.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python
import json import json
import os import os
import re import re

1
convert.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python
import argparse import argparse
import concurrent.futures import concurrent.futures
import copy import copy

View file

@ -2,21 +2,21 @@
set -e set -e
AI_NAME="${AI_NAME:-Miku}" AI_NAME="${AI_NAME:-Miku}"
MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}" MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
USER_NAME="${USER_NAME:-Anon}" USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use. # Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}" #N_THREAD="${N_THREAD:-4}"
CTX_SIZE="${CTX_SIZE:-4096}"
N_PREDICTS="${N_PREDICTS:-4096}" N_PREDICTS="${N_PREDICTS:-4096}"
GEN_OPTIONS=(--batch_size 1024 GEN_OPTIONS=(--batch_size 1024
--ctx_size 2048 --ctx_size "$CTX_SIZE"
--keep -1 --keep -1
--repeat_last_n 256 --repeat_last_n 256
--repeat_penalty 1.17647 --repeat_penalty 1.17647
--temp 0.7 --temp 0.6
--top_k 40 --mirostat 2)
--top_p 0.5)
if [ -n "$N_THREAD" ]; then if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD") GEN_OPTIONS+=(--threads "$N_THREAD")
@ -24,16 +24,17 @@ fi
./main "${GEN_OPTIONS[@]}" \ ./main "${GEN_OPTIONS[@]}" \
--model "$MODEL" \ --model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \
--n_predict "$N_PREDICTS" \ --n_predict "$N_PREDICTS" \
--color --interactive \ --color --interactive \
--reverse-prompt "${USER_NAME}:" \ --reverse-prompt "${USER_NAME}:" \
--prompt " --prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help. ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad. ${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her. ${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
The conversation is only between ${USER_NAME} and ${AI_NAME} The conversation is only between ${USER_NAME} and ${AI_NAME}.
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice. The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
${AI_NAME} can only communicate through text, so she can't send images or videos. ${AI_NAME} can only communicate through text, so she can't send images or videos.

View file

@ -1,4 +1,5 @@
set(TARGET baby-llama) set(TARGET baby-llama)
add_executable(${TARGET} baby-llama.cpp) add_executable(${TARGET} baby-llama.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,5 +1,6 @@
set(TARGET benchmark) set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp) add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)

View file

@ -279,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.n_keep = std::stoi(argv[i]); params.n_keep = std::stoi(argv[i]);
} else if (arg == "--chunks") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_chunks = std::stoi(argv[i]);
} else if (arg == "-m" || arg == "--model") { } else if (arg == "-m" || arg == "--model") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -515,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
if (llama_mlock_supported()) { if (llama_mlock_supported()) {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
} }

View file

@ -28,6 +28,7 @@ struct gpt_params {
int32_t n_ctx = 512; // context size int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = 0; // number of layers to store in VRAM int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs

View file

@ -1,5 +1,6 @@
set(TARGET embdinput) set(TARGET embdinput)
add_library(${TARGET} embd-input-lib.cpp embd-input.h) add_library(${TARGET} embd-input-lib.cpp embd-input.h)
install(TARGETS ${TARGET} LIBRARY)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)
@ -8,6 +9,7 @@ endif()
set(TARGET embd-input-test) set(TARGET embd-input-test)
add_executable(${TARGET} embd-input-test.cpp) add_executable(${TARGET} embd-input-test.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)

View file

@ -1,5 +1,6 @@
set(TARGET embedding) set(TARGET embedding)
add_executable(${TARGET} embedding.cpp) add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)

View file

@ -1,5 +1,6 @@
set(TARGET main) set(TARGET main)
add_executable(${TARGET} main.cpp) add_executable(${TARGET} main.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)

View file

@ -1,3 +1,4 @@
set(TEST_TARGET metal) set(TEST_TARGET metal)
add_executable(${TEST_TARGET} metal.cpp) add_executable(${TEST_TARGET} metal.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE ggml) target_link_libraries(${TEST_TARGET} PRIVATE ggml)

View file

@ -1,5 +1,6 @@
set(TARGET perplexity) set(TARGET perplexity)
add_executable(${TARGET} perplexity.cpp) add_executable(${TARGET} perplexity.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)

View file

@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
// BOS tokens will be added for each chunk before eval // BOS tokens will be added for each chunk before eval
auto tokens = ::llama_tokenize(ctx, params.prompt, true); auto tokens = ::llama_tokenize(ctx, params.prompt, true);
int count = 0; const int n_chunk_max = tokens.size() / params.n_ctx;
const int n_chunk = tokens.size() / params.n_ctx; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(ctx); const int n_vocab = llama_n_vocab(ctx);
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
int count = 0;
double nll = 0.0; double nll = 0.0;
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {

View file

@ -1,4 +1,5 @@
set(TARGET quantize-stats) set(TARGET quantize-stats)
add_executable(${TARGET} quantize-stats.cpp) add_executable(${TARGET} quantize-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,5 +1,6 @@
set(TARGET quantize) set(TARGET quantize)
add_executable(${TARGET} quantize.cpp) add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)

View file

@ -14,103 +14,27 @@ struct quant_option {
}; };
static const std::vector<struct quant_option> QUANT_OPTIONS = { static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.50G, +0.2499 ppl @ 7B", },
"Q4_0", { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1846 ppl @ 7B", },
LLAMA_FTYPE_MOSTLY_Q4_0, { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.30G, +0.0796 ppl @ 7B", },
" 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M", { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0415 ppl @ 7B", },
},
{
"Q4_1",
LLAMA_FTYPE_MOSTLY_Q4_1,
" 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
},
{
"Q5_0",
LLAMA_FTYPE_MOSTLY_Q5_0,
" 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
},
{
"Q5_1",
LLAMA_FTYPE_MOSTLY_Q5_1,
" 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
},
#ifdef GGML_USE_K_QUANTS #ifdef GGML_USE_K_QUANTS
{ { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.67G, +0.8698 ppl @ 7B", },
"Q2_K", { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
LLAMA_FTYPE_MOSTLY_Q2_K, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
" 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended", { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
}, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
{ { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
"Q3_K", { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
LLAMA_FTYPE_MOSTLY_Q3_K_M, { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
"alias for Q3_K_M" { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
}, { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
{ { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
"Q3_K_S", { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0044 ppl @ 7B", },
LLAMA_FTYPE_MOSTLY_Q3_K_S,
" 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
},
{
"Q3_K_M",
LLAMA_FTYPE_MOSTLY_Q3_K_M,
" 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
},
{
"Q3_K_L",
LLAMA_FTYPE_MOSTLY_Q3_K_L,
" 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
},
{
"Q4_K",
LLAMA_FTYPE_MOSTLY_Q4_K_M,
"alias for Q4_K_M",
},
{
"Q4_K_S",
LLAMA_FTYPE_MOSTLY_Q4_K_S,
" 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
},
{
"Q4_K_M",
LLAMA_FTYPE_MOSTLY_Q4_K_M,
" 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
},
{
"Q5_K",
LLAMA_FTYPE_MOSTLY_Q5_K_M,
"alias for Q5_K_M",
},
{
"Q5_K_S",
LLAMA_FTYPE_MOSTLY_Q5_K_S,
" 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
},
{
"Q5_K_M",
LLAMA_FTYPE_MOSTLY_Q5_K_M,
" 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
},
{
"Q6_K",
LLAMA_FTYPE_MOSTLY_Q6_K,
" 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
},
#endif #endif
{ { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ 7B", },
"Q8_0", { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
LLAMA_FTYPE_MOSTLY_Q8_0, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
" 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
},
{
"F16",
LLAMA_FTYPE_MOSTLY_F16,
"13.00G @ 7B - extremely large, virtually no quality loss - not recommended",
},
{
"F32",
LLAMA_FTYPE_ALL_F32,
"26.00G @ 7B - absolutely huge, lossless - not recommended",
},
}; };

View file

@ -1,5 +1,6 @@
set(TARGET save-load-state) set(TARGET save-load-state)
add_executable(${TARGET} save-load-state.cpp) add_executable(${TARGET} save-load-state.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)

View file

@ -2,10 +2,14 @@ set(TARGET server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp json.hpp httplib.h) add_executable(${TARGET} server.cpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
) )
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO)

View file

@ -1,5 +1,6 @@
set(TARGET simple) set(TARGET simple)
add_executable(${TARGET} simple.cpp) add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)

View file

@ -1,4 +1,5 @@
set(TARGET train-text-from-scratch) set(TARGET train-text-from-scratch)
add_executable(${TARGET} train-text-from-scratch.cpp) add_executable(${TARGET} train-text-from-scratch.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -6,24 +6,27 @@
outputs = { self, nixpkgs, flake-utils }: outputs = { self, nixpkgs, flake-utils }:
flake-utils.lib.eachDefaultSystem (system: flake-utils.lib.eachDefaultSystem (system:
let let
inherit (pkgs.stdenv) isAarch64 isDarwin; inherit (pkgs.stdenv) isAarch32 isAarch64 isx86_32 isx86_64 isDarwin;
inherit (pkgs.lib) optionals; osSpecific = with pkgs; [ openmpi ] ++
isM1 = isAarch64 && isDarwin; (
osSpecific = if isM1 then if isAarch64 && isDarwin then
with pkgs.darwin.apple_sdk_11_0.frameworks; [ with pkgs.darwin.apple_sdk_11_0.frameworks; [
Accelerate Accelerate
MetalKit MetalKit
MetalPerformanceShaders MetalPerformanceShaders
MetalPerformanceShadersGraph MetalPerformanceShadersGraph
] ]
else if isDarwin then else if isAarch32 && isDarwin then
with pkgs.darwin.apple_sdk.frameworks; [ with pkgs.darwin.apple_sdk.frameworks; [
Accelerate Accelerate
CoreGraphics CoreGraphics
CoreVideo CoreVideo
] ]
else else if isx86_32 || isx86_64 then
[ ]; with pkgs; [ mkl ]
else
with pkgs; [ openblas ]
);
pkgs = import nixpkgs { inherit system; }; pkgs = import nixpkgs { inherit system; };
llama-python = llama-python =
pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]); pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
@ -31,22 +34,28 @@
packages.default = pkgs.stdenv.mkDerivation { packages.default = pkgs.stdenv.mkDerivation {
name = "llama.cpp"; name = "llama.cpp";
src = ./.; src = ./.;
postPatch = if isM1 then '' postPatch = ''
substituteInPlace ./ggml-metal.m \ substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
'' else '';
""; nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
nativeBuildInputs = with pkgs; [ cmake ];
buildInputs = osSpecific; buildInputs = osSpecific;
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [ cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" ++ (if isAarch64 && isDarwin then [
"-DLLAMA_METAL=ON" "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
"-DLLAMA_METAL=ON"
] else if isx86_32 || isx86_64 then [
"-DLLAMA_BLAS=ON"
"-DLLAMA_BLAS_VENDOR=Intel10_lp64"
] else [
"-DLLAMA_BLAS=ON"
"-DLLAMA_BLAS_VENDOR=OpenBLAS"
]); ]);
installPhase = '' installPhase = ''
runHook preInstall runHook preInstall
mkdir -p $out/bin install -D bin/* -t $out/bin
mv bin/* $out/bin/ install -Dm644 lib*.so -t $out/lib
mv $out/bin/main $out/bin/llama mv $out/bin/main $out/bin/llama
mv $out/bin/server $out/bin/llama-server mv $out/bin/server $out/bin/llama-server

View file

@ -3537,6 +3537,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
(void) dst; (void) dst;
} }
void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_cpy(src0, dst, nullptr);
(void) src1;
}
void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@ -3670,7 +3675,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
// recursively assign CUDA buffers until a compute tensor is found // recursively assign CUDA buffers until a compute tensor is found
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
const ggml_op src0_op = tensor->src[0]->op; const ggml_op src0_op = tensor->src[0]->op;
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) { if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace); ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
} }
} }
@ -3776,6 +3781,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
switch (tensor->op) { switch (tensor->op) {
case GGML_OP_DUP:
if (!any_on_device) {
return false;
}
func = ggml_cuda_dup;
break;
case GGML_OP_ADD: case GGML_OP_ADD:
if (!any_on_device) { if (!any_on_device) {
return false; return false;
@ -3830,6 +3841,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
} }
func = ggml_cuda_cpy; func = ggml_cuda_cpy;
break; break;
case GGML_OP_CONT:
if (!any_on_device) {
return false;
}
func = ggml_cuda_dup;
break;
case GGML_OP_RESHAPE: case GGML_OP_RESHAPE:
case GGML_OP_VIEW: case GGML_OP_VIEW:
case GGML_OP_PERMUTE: case GGML_OP_PERMUTE:

View file

@ -676,8 +676,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1); GGML_ASSERT(ne12 == 1);
nth0 = 4; nth0 = 2;
nth1 = 16; nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
} break; } break;
case GGML_TYPE_Q3_K: case GGML_TYPE_Q3_K:
@ -694,8 +694,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1); GGML_ASSERT(ne12 == 1);
nth0 = 4; nth0 = 2;
nth1 = 16; nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
} break; } break;
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
@ -703,8 +703,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1); GGML_ASSERT(ne12 == 1);
nth0 = 4; nth0 = 2;
nth1 = 16; nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
} break; } break;
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
@ -712,8 +712,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1); GGML_ASSERT(ne12 == 1);
nth0 = 4; nth0 = 2;
nth1 = 16; nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
} break; } break;
default: default:
@ -739,14 +739,17 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q2_K || else if (src0t == GGML_TYPE_Q5_K) {
src0t == GGML_TYPE_Q3_K || [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
src0t == GGML_TYPE_Q4_K || }
src0t == GGML_TYPE_Q5_K || else if (src0t == GGML_TYPE_Q6_K) {
src0t == GGML_TYPE_Q6_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q3_K) {
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
@ -792,7 +795,7 @@ void ggml_metal_graph_compute(
const float eps = 1e-6f; const float eps = 1e-6f;
const int nth = 256; const int nth = 512;
[encoder setComputePipelineState:ctx->pipeline_rms_norm]; [encoder setComputePipelineState:ctx->pipeline_rms_norm];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@ -800,7 +803,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setBytes:&eps length:sizeof( float) atIndex:4]; [encoder setBytes:&eps length:sizeof( float) atIndex:4];
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
const int64_t nrows = ggml_nrows(src0); const int64_t nrows = ggml_nrows(src0);

File diff suppressed because it is too large Load diff

15
ggml.c
View file

@ -4412,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) {
if (&g_state.contexts[i].context == ctx) { if (&g_state.contexts[i].context == ctx) {
g_state.contexts[i].used = false; g_state.contexts[i].used = false;
GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
__func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); __func__, i, ggml_used_mem(ctx));
if (ctx->mem_buffer_owned) { if (ctx->mem_buffer_owned) {
GGML_ALIGNED_FREE(ctx->mem_buffer); GGML_ALIGNED_FREE(ctx->mem_buffer);
@ -16317,8 +16317,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (GGML_OP_HAS_FINALIZE[node->op]) { if (GGML_OP_HAS_FINALIZE[node->op]) {
params.nth = n_tasks_arr[node_n]; params.nth = n_tasks_arr[node_n];
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
} }
ggml_graph_compute_perf_stats_node(node, state->shared);
} }
// distribute new work or execute it direct if 1T // distribute new work or execute it direct if 1T
@ -16348,8 +16348,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (GGML_OP_HAS_FINALIZE[node->op]) { if (GGML_OP_HAS_FINALIZE[node->op]) {
params.type = GGML_TASK_FINALIZE; params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
} }
ggml_graph_compute_perf_stats_node(node, state->shared);
} else { } else {
break; break;
} }
@ -16891,9 +16892,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
} }
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
//assert(cgraph->work == NULL);
//assert(cgraph->work_size == 0);
uint64_t size_eval = 0; uint64_t size_eval = 0;
// compute size of intermediate results // compute size of intermediate results
@ -17332,9 +17330,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT("=== GRAPH ===\n"); GGML_PRINT("=== GRAPH ===\n");
GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; i++) { for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i]; struct ggml_tensor * node = cgraph->nodes[i];

View file

@ -555,7 +555,9 @@ struct llama_file_loader {
} }
// skip to the next multiple of 32 bytes // skip to the next multiple of 32 bytes
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR); if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
}
tensor.file_off = file.tell(); tensor.file_off = file.tell();
tensor.name = name; tensor.name = name;
@ -875,6 +877,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
return result; return result;
} }
int llama_max_devices() {
return LLAMA_MAX_DEVICES;
}
bool llama_mmap_supported() { bool llama_mmap_supported() {
return llama_mmap::SUPPORTED; return llama_mmap::SUPPORTED;
} }
@ -2024,9 +2030,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
} }
// Normalize the second derivatives // Normalize the second derivatives
float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f); {
for (float & value : second_derivatives) { const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
value /= second_derivatives_sum;
if (second_derivatives_sum > 1e-6f) {
for (float & value : second_derivatives) {
value /= second_derivatives_sum;
}
} else {
for (float & value : second_derivatives) {
value = 1.0f / second_derivatives.size();
}
}
} }
float cum_sum = 0.0f; float cum_sum = 0.0f;
@ -2205,7 +2220,7 @@ void llama_sample_classifier_free_guidance(
struct llama_context * guidance_ctx, struct llama_context * guidance_ctx,
float scale, float scale,
float smooth_factor) { float smooth_factor) {
int64_t t_start_sample_us = t_start_sample_us = ggml_time_us(); int64_t t_start_sample_us = ggml_time_us();
assert(ctx); assert(ctx);
auto n_vocab = llama_n_vocab(ctx); auto n_vocab = llama_n_vocab(ctx);

View file

@ -153,6 +153,8 @@ extern "C" {
int32_t n_eval; int32_t n_eval;
}; };
LLAMA_API int llama_max_devices();
LLAMA_API struct llama_context_params llama_context_default_params(); LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

2
scripts/verify-checksum-models.py Normal file → Executable file
View file

@ -1,3 +1,5 @@
#!/bin/env python3
import os import os
import hashlib import hashlib

View file

@ -1,6 +1,7 @@
function(llama_add_test source) function(llama_add_test source)
get_filename_component(TEST_TARGET ${source} NAME_WE) get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source}) add_executable(${TEST_TARGET} ${source})
install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE llama) target_link_libraries(${TEST_TARGET} PRIVATE llama)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN}) add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction() endfunction()

View file

@ -200,4 +200,6 @@ int main(void) {
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f); test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
printf("OK\n"); printf("OK\n");
return 0;
} }