Merge branch 'master' into testtarget-removal

commit 647cef8bbd

36 changed files with 1009 additions and 625 deletions
.gitignore (vendored, 5 changes)

@@ -16,6 +16,8 @@ build/
 build-em/
 build-debug/
 build-release/
+build-ci-debug/
+build-ci-release/
 build-static/
 build-cublas/
 build-opencl/
@@ -25,9 +27,10 @@ build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
 out/
+tmp/
 
 models/*
-*.bin
+models-mnt
 
 /main
 /quantize
CMakeLists.txt

@@ -186,7 +186,16 @@ if (LLAMA_BLAS)
             pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
         elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
             # all Intel* libraries share the same include path
-            pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
+            pkg_check_modules(DepBLAS mkl-sdl)
+            if (NOT DepBLAS)
+                if (BUILD_SHARED_LIBS)
+                    set(LINK_METHOD dynamic)
+                else()
+                    set(LINK_METHOD static)
+                endif()
+                string(REGEX REPLACE ".*_" "" DATA_TYPE_MODEL ${LLAMA_BLAS_VENDOR})
+                pkg_check_modules(DepBLAS REQUIRED mkl-${LINK_METHOD}-${DATA_TYPE_MODEL}-iomp)
+            endif()
         elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
             # this doesn't provide pkg-config
             # suggest to assign BLAS_INCLUDE_DIRS on your own
@@ -512,6 +521,7 @@ if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
     add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
     target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
+    install(TARGETS ggml_shared LIBRARY)
 endif()
 
 add_library(llama
@@ -533,8 +543,32 @@ if (BUILD_SHARED_LIBS)
     if (LLAMA_METAL)
         set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
     endif()
+    install(TARGETS llama LIBRARY)
 endif()
 
+include(GNUInstallDirs)
+install(
+    FILES convert.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
+install(
+    FILES convert-lora-to-ggml.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
 
 #
 # programs, examples and tests
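The new Intel branch first probes the MKL single-dynamic-library module (`mkl-sdl`) without `REQUIRED`, then falls back to a module name assembled from the link method and the data-type model parsed out of `LLAMA_BLAS_VENDOR`. A sketch of that name derivation, mirrored in POSIX shell rather than CMake's `string(REGEX REPLACE ...)` (the vendor value is just an example taken from the README):

```bash
# mirrors: string(REGEX REPLACE ".*_" "" DATA_TYPE_MODEL ${LLAMA_BLAS_VENDOR})
vendor="Intel10_lp64"              # example LLAMA_BLAS_VENDOR value
link_method="dynamic"              # "dynamic" when BUILD_SHARED_LIBS is ON, else "static"
data_type_model="${vendor##*_}"    # strip everything up to the last '_' -> "lp64"
echo "mkl-${link_method}-${data_type_model}-iomp"   # -> mkl-dynamic-lp64-iomp
```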
Makefile (34 changes)

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
 
 # Binaries only useful for tests
 TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@@ -93,6 +93,28 @@ ifeq ($(UNAME_S),Haiku)
 	CXXFLAGS += -pthread
 endif
 
+# detect Windows
+ifneq ($(findstring _NT,$(UNAME_S)),)
+	_WIN32 := 1
+endif
+
+# library name prefix
+ifneq ($(_WIN32),1)
+	LIB_PRE := lib
+endif
+
+# Dynamic Shared Object extension
+ifneq ($(_WIN32),1)
+	DSO_EXT := .so
+else
+	DSO_EXT := .dll
+endif
+
+# Windows Sockets 2 (Winsock) for network-capable apps
+ifeq ($(_WIN32),1)
+	LWINSOCK2 := -lws2_32
+endif
+
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
@@ -297,7 +319,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
 
 #
 # Examples
@@ -328,14 +350,14 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
-libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
 
-embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
+embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
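The Windows detection keys off `_NT` in the output of `uname -s`, which is how MSYS2/MinGW environments report themselves, and the new `LIB_PRE`/`DSO_EXT` pair then makes the embd-input library build as `embdinput.dll` there and `libembdinput.so` elsewhere. A small sketch of the same check in plain shell (the `uname` output shown is a typical example, not taken from the diff):

```bash
# On an MSYS2/MinGW shell, `uname -s` looks like "MINGW64_NT-10.0-19045",
# so $(findstring _NT,$(UNAME_S)) is non-empty and _WIN32 gets set.
case "$(uname -s)" in
    *_NT*) echo "shared lib will be named embdinput.dll"    ;; # LIB_PRE empty, DSO_EXT=.dll
    *)     echo "shared lib will be named libembdinput.so"  ;; # LIB_PRE=lib,  DSO_EXT=.so
esac
```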
README.md

@@ -360,7 +360,7 @@ Building the program with BLAS support may lead to some performance improvements
 ```bash
 mkdir build
 cd build
-cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_lp64 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 cmake --build . --config Release
 ```
@@ -640,7 +640,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t
 
 ```bash
 # run the verification script
-python3 .\scripts\verify-checksum-models.py
+./scripts/verify-checksum-models.py
 ```
 
 - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:
ci/README.md (new file, 20 additions)

@@ -0,0 +1,20 @@
+# CI
+
+In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions), `llama.cpp` uses a custom CI framework:
+
+https://github.com/ggml-org/ci
+
+It monitors the `master` branch for new commits and runs the
+[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
+to execute heavier workloads than Github Actions supports. Over time, the cloud instances will be scaled
+to cover various hardware architectures, including GPU and Apple Silicon instances.
+
+Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
+Only the branches of this repo are monitored for this keyword.
+
+Before publishing changes, it is good practice to execute the full CI locally on your machine:
+
+```bash
+mkdir tmp
+bash ./ci/run.sh ./tmp/results ./tmp/mnt
+```
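The run script also checks a `GG_BUILD_LOW_PERF` environment variable (see ci/run.sh below): when it is set, the OpenLLaMA download/quantize/perplexity stage is skipped and the release ctest run excludes the slow `test-opt` target. A sketch of a low-powered local run, assuming the same output and mount directories as above:

```bash
# GG_BUILD_LOW_PERF=1 skips the model stage and trims the release ctest run
mkdir tmp
GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```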
ci/run.sh (new file, 262 additions; the shebang was missing its `!` in the original and is corrected here)

@@ -0,0 +1,262 @@
+#!/bin/bash
+
+if [ -z "$2" ]; then
+    echo "usage: $0 <output-dir> <mnt-dir>"
+    exit 1
+fi
+
+mkdir -p "$1"
+mkdir -p "$2"
+
+OUT=$(realpath "$1")
+MNT=$(realpath "$2")
+
+rm -v $OUT/*.log
+rm -v $OUT/*.exit
+rm -v $OUT/*.md
+
+sd=`dirname $0`
+cd $sd/../
+SRC=`pwd`
+
+## helpers
+
+# download a file if it does not exist or if it is outdated
+function gg_wget {
+    local out=$1
+    local url=$2
+
+    local cwd=`pwd`
+
+    mkdir -p $out
+    cd $out
+
+    # should not re-download if file is the same
+    wget -nv -N $url
+
+    cd $cwd
+}
+
+function gg_printf {
+    printf -- "$@" >> $OUT/README.md
+}
+
+function gg_run {
+    ci=$1
+
+    set -o pipefail
+    set -x
+
+    gg_run_$ci | tee $OUT/$ci.log
+    cur=$?
+    echo "$cur" > $OUT/$ci.exit
+
+    set +x
+    set +o pipefail
+
+    gg_sum_$ci
+
+    ret=$((ret | cur))
+}
+
+## ci
+
+# ctest_debug
+
+function gg_run_ctest_debug {
+    cd ${SRC}
+
+    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                           ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+
+    set +e
+}
+
+function gg_sum_ctest_debug {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs ctest in debug mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
+# ctest_release
+
+function gg_run_ctest_release {
+    cd ${SRC}
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                             ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    if [ -z $GG_BUILD_LOW_PERF ]; then
+        (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    else
+        (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    fi
+
+    set +e
+}
+
+function gg_sum_ctest_release {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs ctest in release mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+    gg_printf '```\n'
+}
+
+# open_llama_3b_v2
+
+function gg_run_open_llama_3b_v2 {
+    cd ${SRC}
+
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
+
+    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+
+    path_models="../models-mnt/open-llama/3B-v2"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                              ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.bin"
+    model_q8_0="${path_models}/ggml-model-q8_0.bin"
+    model_q4_0="${path_models}/ggml-model-q4_0.bin"
+    model_q4_1="${path_models}/ggml-model-q4_1.bin"
+    model_q5_0="${path_models}/ggml-model-q5_0.bin"
+    model_q5_1="${path_models}/ggml-model-q5_1.bin"
+    model_q3_k="${path_models}/ggml-model-q3_k.bin"
+    model_q4_k="${path_models}/ggml-model-q4_k.bin"
+    model_q5_k="${path_models}/ggml-model-q5_k.bin"
+    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+
+    wiki_test_60="${path_wiki}/wiki.test-60.raw"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    function check_ppl {
+        qnt="$1"
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+            return 20
+        fi
+
+        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    set +e
+}
+
+function gg_sum_open_llama_3b_v2 {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'OpenLLaMA 3B-v2:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+}
+
+## main
+
+if [ -z $GG_BUILD_LOW_PERF ]; then
+    rm -rf ${SRC}/models-mnt
+
+    mnt_models=$(realpath ${MNT}/models)
+    mkdir -p ${mnt_models}
+    ln -sfn ${mnt_models} ${SRC}/models-mnt
+
+    python3 -m pip install -r ${SRC}/requirements.txt
+fi
+
+ret=0
+
+#test $ret -eq 0 && gg_run ctest_debug
+#test $ret -eq 0 && gg_run ctest_release
+
+if [ -z $GG_BUILD_LOW_PERF ]; then
+    test $ret -eq 0 && gg_run open_llama_3b_v2
+fi
+
+exit $ret
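`gg_run` folds each stage's exit code into the global `ret` with a bitwise OR, so one failing stage marks the whole run as failed while later stages still execute. A standalone sketch of that aggregation (the stage codes here are made up for illustration):

```bash
ret=0
for cur in 0 20 0; do         # pretend three stages exited with these codes
    ret=$((ret | cur))        # any non-zero code makes ret non-zero and sticks
done
echo "final exit code: $ret"  # -> 20
```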
convert-lora-to-ggml.py (1 change, normal file → executable file)

@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import json
 import os
 import re
convert.py (1 change, normal file → executable file)

@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import argparse
 import concurrent.futures
 import copy
examples/Miku.sh

@@ -2,21 +2,21 @@
 set -e
 
 AI_NAME="${AI_NAME:-Miku}"
-MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
+MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
 USER_NAME="${USER_NAME:-Anon}"
 
 # Uncomment and adjust to the number of CPU cores you want to use.
 #N_THREAD="${N_THREAD:-4}"
+CTX_SIZE="${CTX_SIZE:-4096}"
 N_PREDICTS="${N_PREDICTS:-4096}"
 
 GEN_OPTIONS=(--batch_size 1024
--ctx_size 2048
+-ctx_size "$CTX_SIZE"
 --keep -1
 --repeat_last_n 256
 --repeat_penalty 1.17647
--temp 0.7
--top_k 40
--top_p 0.5)
+-temp 0.6
+-mirostat 2)
 
 if [ -n "$N_THREAD" ]; then
     GEN_OPTIONS+=(--threads "$N_THREAD")
@@ -24,16 +24,17 @@ fi
 
 ./main "${GEN_OPTIONS[@]}" \
     --model "$MODEL" \
+    --in-prefix " " \
+    --in-suffix "${AI_NAME}:" \
     --n_predict "$N_PREDICTS" \
     --color --interactive \
     --reverse-prompt "${USER_NAME}:" \
-    --prompt "
-This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
+    --prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
 ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
 ${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
 ${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
-The conversation is only between ${USER_NAME} and ${AI_NAME}
+The conversation is only between ${USER_NAME} and ${AI_NAME}.
 The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
 ${AI_NAME} can only communicate through text, so she can't send images or videos.
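Because the model path and the new context size are read from environment variables with defaults, a run can override them inline without editing the script. A sketch (the model path is just an example):

```bash
# override the script's defaults for a single run
MODEL=./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin \
CTX_SIZE=2048 \
USER_NAME=Anon \
./examples/Miku.sh
```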
examples/baby-llama/CMakeLists.txt

@@ -1,4 +1,5 @@
 set(TARGET baby-llama)
 add_executable(${TARGET} baby-llama.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/benchmark/CMakeLists.txt

@@ -1,5 +1,6 @@
 set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
examples/common.cpp

@@ -279,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_keep = std::stoi(argv[i]);
+        } else if (arg == "--chunks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_chunks = std::stoi(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -515,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stderr, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
         fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
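Together with the new `n_chunks` field in `gpt_params` (next diff), the `--chunks` flag lets a perplexity run stop after a fixed number of context-sized chunks instead of consuming the whole file; the CI script above uses it exactly this way. A usage sketch (model and data paths are placeholders):

```bash
# evaluate perplexity on just the first 3 chunks of the test set
./bin/perplexity --model ./models/ggml-model-q4_0.bin -f wiki.test-60.raw -c 128 -b 128 --chunks 3
```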
examples/common.h

@@ -28,6 +28,7 @@ struct gpt_params {
     int32_t n_ctx        = 512;  // context size
     int32_t n_batch      = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep       = 0;    // number of tokens to keep from initial prompt
+    int32_t n_chunks     = -1;   // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers = 0;    // number of layers to store in VRAM
     int32_t main_gpu     = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
examples/embd-input/CMakeLists.txt

@@ -1,5 +1,6 @@
 set(TARGET embdinput)
 add_library(${TARGET} embd-input-lib.cpp embd-input.h)
+install(TARGETS ${TARGET} LIBRARY)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
@@ -8,6 +9,7 @@ endif()
 
 set(TARGET embd-input-test)
 add_executable(${TARGET} embd-input-test.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
examples/embedding/CMakeLists.txt

@@ -1,5 +1,6 @@
 set(TARGET embedding)
 add_executable(${TARGET} embedding.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
examples/main/CMakeLists.txt

@@ -1,5 +1,6 @@
 set(TARGET main)
 add_executable(${TARGET} main.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
examples/metal/CMakeLists.txt

@@ -1,3 +1,4 @@
 set(TEST_TARGET metal)
 add_executable(${TEST_TARGET} metal.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml)
examples/perplexity/CMakeLists.txt

@@ -1,5 +1,6 @@
 set(TARGET perplexity)
 add_executable(${TARGET} perplexity.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
examples/perplexity/perplexity.cpp

@@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     // BOS tokens will be added for each chunk before eval
     auto tokens = ::llama_tokenize(ctx, params.prompt, true);
 
-    int count = 0;
+    const int n_chunk_max = tokens.size() / params.n_ctx;
 
-    const int n_chunk = tokens.size() / params.n_ctx;
+    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
     const int n_vocab = llama_n_vocab(ctx);
     const int n_batch = params.n_batch;
 
+    int count = 0;
     double nll = 0.0;
 
     fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
 
     for (int i = 0; i < n_chunk; ++i) {
examples/quantize-stats/CMakeLists.txt

@@ -1,4 +1,5 @@
 set(TARGET quantize-stats)
 add_executable(${TARGET} quantize-stats.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/quantize/CMakeLists.txt

@@ -1,5 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
examples/quantize/quantize.cpp

@@ -14,103 +14,27 @@ struct quant_option {
 };
 
 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    {
-        "Q4_0",
-        LLAMA_FTYPE_MOSTLY_Q4_0,
-        " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
-    },
-    {
-        "Q4_1",
-        LLAMA_FTYPE_MOSTLY_Q4_1,
-        " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
-    },
-    {
-        "Q5_0",
-        LLAMA_FTYPE_MOSTLY_Q5_0,
-        " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
-    },
-    {
-        "Q5_1",
-        LLAMA_FTYPE_MOSTLY_Q5_1,
-        " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
-    },
+    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.50G, +0.2499 ppl @ 7B", },
+    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1846 ppl @ 7B", },
+    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.30G, +0.0796 ppl @ 7B", },
+    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0415 ppl @ 7B", },
 #ifdef GGML_USE_K_QUANTS
-    {
-        "Q2_K",
-        LLAMA_FTYPE_MOSTLY_Q2_K,
-        " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
-    },
-    {
-        "Q3_K",
-        LLAMA_FTYPE_MOSTLY_Q3_K_M,
-        "alias for Q3_K_M"
-    },
-    {
-        "Q3_K_S",
-        LLAMA_FTYPE_MOSTLY_Q3_K_S,
-        " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
-    },
-    {
-        "Q3_K_M",
-        LLAMA_FTYPE_MOSTLY_Q3_K_M,
-        " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
-    },
-    {
-        "Q3_K_L",
-        LLAMA_FTYPE_MOSTLY_Q3_K_L,
-        " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
-    },
-    {
-        "Q4_K",
-        LLAMA_FTYPE_MOSTLY_Q4_K_M,
-        "alias for Q4_K_M",
-    },
-    {
-        "Q4_K_S",
-        LLAMA_FTYPE_MOSTLY_Q4_K_S,
-        " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
-    },
-    {
-        "Q4_K_M",
-        LLAMA_FTYPE_MOSTLY_Q4_K_M,
-        " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
-    },
-    {
-        "Q5_K",
-        LLAMA_FTYPE_MOSTLY_Q5_K_M,
-        "alias for Q5_K_M",
-    },
-    {
-        "Q5_K_S",
-        LLAMA_FTYPE_MOSTLY_Q5_K_S,
-        " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
-    },
-    {
-        "Q5_K_M",
-        LLAMA_FTYPE_MOSTLY_Q5_K_M,
-        " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
-    },
-    {
-        "Q6_K",
-        LLAMA_FTYPE_MOSTLY_Q6_K,
-        " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
-    },
+    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.67G, +0.8698 ppl @ 7B", },
+    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
+    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
+    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
+    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0044 ppl @ 7B", },
 #endif
-    {
-        "Q8_0",
-        LLAMA_FTYPE_MOSTLY_Q8_0,
-        " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
-    },
-    {
-        "F16",
-        LLAMA_FTYPE_MOSTLY_F16,
-        "13.00G @ 7B - extremely large, virtually no quality loss - not recommended",
-    },
-    {
-        "F32",
-        LLAMA_FTYPE_ALL_F32,
-        "26.00G @ 7B - absolutely huge, lossless - not recommended",
-    },
+    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ 7B", },
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G @ 7B", },
+    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G @ 7B", },
 };
 
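Each `QUANT_OPTIONS` entry maps a type name accepted on the `quantize` command line to an ftype plus a size/perplexity note; the CI script above exercises several of them. A usage sketch (file paths are placeholders):

```bash
# quantize an f16 GGML model to q4_0; the third argument is matched against QUANT_OPTIONS
./bin/quantize ./models/ggml-model-f16.bin ./models/ggml-model-q4_0.bin q4_0
```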
examples/save-load-state/CMakeLists.txt

@@ -1,5 +1,6 @@
 set(TARGET save-load-state)
 add_executable(${TARGET} save-load-state.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
examples/server/CMakeLists.txt

@@ -2,10 +2,14 @@ set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp json.hpp httplib.h)
+install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+if (WIN32)
+    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
     add_dependencies(${TARGET} BUILD_INFO)
examples/simple/CMakeLists.txt

@@ -1,5 +1,6 @@
 set(TARGET simple)
 add_executable(${TARGET} simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
examples/train-text-from-scratch/CMakeLists.txt

@@ -1,4 +1,5 @@
 set(TARGET train-text-from-scratch)
 add_executable(${TARGET} train-text-from-scratch.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
flake.nix (35 changes)

@@ -6,24 +6,27 @@
   outputs = { self, nixpkgs, flake-utils }:
     flake-utils.lib.eachDefaultSystem (system:
       let
-        inherit (pkgs.stdenv) isAarch64 isDarwin;
-        inherit (pkgs.lib) optionals;
-        isM1 = isAarch64 && isDarwin;
-        osSpecific = if isM1 then
+        inherit (pkgs.stdenv) isAarch32 isAarch64 isx86_32 isx86_64 isDarwin;
+        osSpecific = with pkgs; [ openmpi ] ++
+        (
+          if isAarch64 && isDarwin then
           with pkgs.darwin.apple_sdk_11_0.frameworks; [
             Accelerate
             MetalKit
             MetalPerformanceShaders
             MetalPerformanceShadersGraph
           ]
-        else if isDarwin then
+          else if isAarch32 && isDarwin then
           with pkgs.darwin.apple_sdk.frameworks; [
             Accelerate
             CoreGraphics
             CoreVideo
           ]
+          else if isx86_32 || isx86_64 then
+            with pkgs; [ mkl ]
           else
-          [ ];
+            with pkgs; [ openblas ]
+        );
         pkgs = import nixpkgs { inherit system; };
         llama-python =
           pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
@@ -31,22 +34,28 @@
         packages.default = pkgs.stdenv.mkDerivation {
           name = "llama.cpp";
           src = ./.;
-          postPatch = if isM1 then ''
+          postPatch = ''
             substituteInPlace ./ggml-metal.m \
               --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-          '' else
-            "";
-          nativeBuildInputs = with pkgs; [ cmake ];
+          '';
+          nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
           buildInputs = osSpecific;
-          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
+          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]
+            ++ (if isAarch64 && isDarwin then [
             "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
             "-DLLAMA_METAL=ON"
+          ] else if isx86_32 || isx86_64 then [
+            "-DLLAMA_BLAS=ON"
+            "-DLLAMA_BLAS_VENDOR=Intel10_lp64"
+          ] else [
+            "-DLLAMA_BLAS=ON"
+            "-DLLAMA_BLAS_VENDOR=OpenBLAS"
           ]);
           installPhase = ''
             runHook preInstall
 
-            mkdir -p $out/bin
-            mv bin/* $out/bin/
+            install -D bin/* -t $out/bin
+            install -Dm644 lib*.so -t $out/lib
             mv $out/bin/main $out/bin/llama
             mv $out/bin/server $out/bin/llama-server
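With the flake now producing shared libraries and per-architecture BLAS flags, a local build through it looks roughly like this, a sketch using the standard flakes CLI (requires the flakes feature to be enabled):

```bash
# build the default package; installPhase renames the main binaries
nix build
./result/bin/llama --help         # formerly `main`
./result/bin/llama-server --help  # formerly `server`
```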
ggml-cuda.cu (19 changes)

@@ -3537,6 +3537,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     (void) dst;
 }
 
+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3670,7 +3675,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     // recursively assign CUDA buffers until a compute tensor is found
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
             ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
@@ -3776,6 +3781,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
     switch (tensor->op) {
+        case GGML_OP_DUP:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_ADD:
             if (!any_on_device) {
                 return false;
@@ -3830,6 +3841,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_cpy;
             break;
+        case GGML_OP_CONT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
ggml-metal.m (35 changes)

@@ -676,8 +676,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);
 
-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
                     } break;
                 case GGML_TYPE_Q3_K:
@@ -694,8 +694,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);
 
-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                     } break;
                 case GGML_TYPE_Q5_K:
@@ -703,8 +703,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);
 
-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
                     } break;
                 case GGML_TYPE_Q6_K:
@@ -712,8 +712,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);
 
-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
                     } break;
                 default:
@@ -739,14 +739,17 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
                 [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
 
-                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                    src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 }
-                else if (src0t == GGML_TYPE_Q2_K ||
-                         src0t == GGML_TYPE_Q3_K ||
-                         src0t == GGML_TYPE_Q4_K ||
-                         src0t == GGML_TYPE_Q5_K ||
-                         src0t == GGML_TYPE_Q6_K) {
+                else if (src0t == GGML_TYPE_Q5_K) {
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                }
+                else if (src0t == GGML_TYPE_Q6_K) {
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                }
+                else if (src0t == GGML_TYPE_Q3_K) {
                     [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 } else {
@@ -792,7 +795,7 @@ void ggml_metal_graph_compute(
 
                 const float eps = 1e-6f;
 
-                const int nth = 256;
+                const int nth = 512;
 
                 [encoder setComputePipelineState:ctx->pipeline_rms_norm];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -800,7 +803,7 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                 [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                 [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
 
                 const int64_t nrows = ggml_nrows(src0);
 
ggml-metal.metal (929 changes)

(file diff suppressed because it is too large)
ggml.c (15 changes)

@@ -4412,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) {
         if (&g_state.contexts[i].context == ctx) {
             g_state.contexts[i].used = false;
 
-            GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
-                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
+            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
+                    __func__, i, ggml_used_mem(ctx));
 
             if (ctx->mem_buffer_owned) {
                 GGML_ALIGNED_FREE(ctx->mem_buffer);
@@ -16317,8 +16317,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
                     params.nth = n_tasks_arr[node_n];
                     ggml_compute_forward(&params, node);
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
                 }
+                ggml_graph_compute_perf_stats_node(node, state->shared);
             }
 
             // distribute new work or execute it direct if 1T
@@ -16348,8 +16348,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             if (GGML_OP_HAS_FINALIZE[node->op]) {
                 params.type = GGML_TASK_FINALIZE;
                 ggml_compute_forward(&params, node);
-                ggml_graph_compute_perf_stats_node(node, state->shared);
             }
+
+            ggml_graph_compute_perf_stats_node(node, state->shared);
         } else {
             break;
         }
@@ -16891,9 +16892,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
 }
 
 void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-    //assert(cgraph->work == NULL);
-    //assert(cgraph->work_size == 0);
-
     uint64_t size_eval = 0;
 
     // compute size of intermediate results
@@ -17332,9 +17330,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
 
     GGML_PRINT("=== GRAPH ===\n");
 
-    GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
-    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
-
     GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
llama.cpp (19 changes)

@@ -555,7 +555,9 @@ struct llama_file_loader {
         }
 
         // skip to the next multiple of 32 bytes
-        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+        if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+        }
 
         tensor.file_off = file.tell();
         tensor.name = name;
@@ -875,6 +877,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
 
+int llama_max_devices() {
+    return LLAMA_MAX_DEVICES;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
@@ -2024,10 +2030,19 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
 
     // Normalize the second derivatives
-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-    for (float & value : second_derivatives) {
-        value /= second_derivatives_sum;
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }
 
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -2205,7 +2220,7 @@ void llama_sample_classifier_free_guidance(
           struct llama_context * guidance_ctx,
                            float   scale,
                            float   smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+    int64_t t_start_sample_us = ggml_time_us();
 
     assert(ctx);
     auto n_vocab = llama_n_vocab(ctx);
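The tail-free sampling hunk guards the second-derivative normalization against a vanishing denominator, falling back to a uniform distribution. Written out (notation mine, matching the code above, with d_i the second-derivative magnitudes and n the candidate count):

```latex
w_i =
\begin{cases}
  d_i \big/ \sum_j d_j & \text{if } \sum_j d_j > 10^{-6} \\
  1/n                  & \text{otherwise (uniform over the } n \text{ candidates)}
\end{cases}
```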
llama.h (2 changes)

@@ -153,6 +153,8 @@ extern "C" {
         int32_t n_eval;
     };
 
+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
scripts/verify-checksum-models.py (2 changes, normal file → executable file)

@@ -1,3 +1,5 @@
+#!/bin/env python3
+
 import os
 import hashlib
 
tests/CMakeLists.txt

@@ -1,6 +1,7 @@
 function(llama_add_test source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_executable(${TEST_TARGET} ${source})
+    install(TARGETS ${TEST_TARGET} RUNTIME)
     target_link_libraries(${TEST_TARGET} PRIVATE llama)
    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()
tests/test-sampling.cpp

@@ -200,4 +200,6 @@ int main(void) {
     test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
 
     printf("OK\n");
+
+    return 0;
 }