Merge upstream changes, fix conflicts, adapt per-layer kv
This commit is contained in:
commit
2f5529e2dc
89 changed files with 9395 additions and 2417 deletions
|
@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
||||||
./quantize "$@"
|
./quantize "$@"
|
||||||
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
||||||
./main "$@"
|
./main "$@"
|
||||||
|
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
|
||||||
|
./finetune "$@"
|
||||||
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
||||||
echo "Converting PTH to GGML..."
|
echo "Converting PTH to GGML..."
|
||||||
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
||||||
|
@ -34,6 +36,8 @@ else
|
||||||
echo " ex: --outtype f16 \"/models/7B/\" "
|
echo " ex: --outtype f16 \"/models/7B/\" "
|
||||||
echo " --quantize (-q): Optimize with quantization process ggml"
|
echo " --quantize (-q): Optimize with quantization process ggml"
|
||||||
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
|
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
|
||||||
|
echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
|
||||||
|
echo " See documentation for finetune for command-line parameters"
|
||||||
echo " --all-in-one (-a): Execute --convert & --quantize"
|
echo " --all-in-one (-a): Execute --convert & --quantize"
|
||||||
echo " ex: \"/models/\" 7B"
|
echo " ex: \"/models/\" 7B"
|
||||||
echo " --server (-s): Run a model on the server"
|
echo " --server (-s): Run a model on the server"
|
||||||
|
|
26
.github/workflows/build.yml
vendored
26
.github/workflows/build.yml
vendored
|
@ -143,6 +143,9 @@ jobs:
|
||||||
cd build
|
cd build
|
||||||
ctest --verbose
|
ctest --verbose
|
||||||
|
|
||||||
|
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
||||||
|
# how to debug it.
|
||||||
|
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
|
||||||
macOS-latest-make:
|
macOS-latest-make:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
|
|
||||||
|
@ -160,14 +163,18 @@ jobs:
|
||||||
- name: Build
|
- name: Build
|
||||||
id: make_build
|
id: make_build
|
||||||
run: |
|
run: |
|
||||||
make -j $(sysctl -n hw.logicalcpu)
|
LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
id: make_test
|
id: make_test
|
||||||
run: |
|
run: |
|
||||||
make tests -j $(sysctl -n hw.logicalcpu)
|
LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
|
||||||
make test -j $(sysctl -n hw.logicalcpu)
|
LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
|
# TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
||||||
|
# how to debug it.
|
||||||
|
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
|
||||||
|
# would be great if we fix these
|
||||||
macOS-latest-cmake:
|
macOS-latest-cmake:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
|
|
||||||
|
@ -188,7 +195,7 @@ jobs:
|
||||||
sysctl -a
|
sysctl -a
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake ..
|
cmake -DLLAMA_METAL=OFF ..
|
||||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
|
@ -498,6 +505,17 @@ jobs:
|
||||||
path: |
|
path: |
|
||||||
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||||
|
|
||||||
|
ios-xcode-build:
|
||||||
|
runs-on: macos-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Build Xcode project
|
||||||
|
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||||
|
|
||||||
|
|
||||||
# freeBSD-latest:
|
# freeBSD-latest:
|
||||||
# runs-on: macos-12
|
# runs-on: macos-12
|
||||||
# steps:
|
# steps:
|
||||||
|
|
20
.github/workflows/python-lint.yml
vendored
Normal file
20
.github/workflows/python-lint.yml
vendored
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
name: flake8 Lint
|
||||||
|
|
||||||
|
on: [push, pull_request]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
flake8-lint:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
name: Lint
|
||||||
|
steps:
|
||||||
|
- name: Check out source repository
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
- name: Set up Python environment
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: "3.11"
|
||||||
|
- name: flake8 Lint
|
||||||
|
uses: py-actions/flake8@v2
|
||||||
|
with:
|
||||||
|
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
|
||||||
|
exclude: "examples/*,examples/*/**,*/**/__init__.py"
|
28
.gitignore
vendored
28
.gitignore
vendored
|
@ -47,6 +47,7 @@ models-mnt
|
||||||
/libllama.so
|
/libllama.so
|
||||||
/llama-bench
|
/llama-bench
|
||||||
/llava-cli
|
/llava-cli
|
||||||
|
/lookahead
|
||||||
/main
|
/main
|
||||||
/metal
|
/metal
|
||||||
/perplexity
|
/perplexity
|
||||||
|
@ -64,6 +65,7 @@ models-mnt
|
||||||
/speculative
|
/speculative
|
||||||
/parallel
|
/parallel
|
||||||
/train-text-from-scratch
|
/train-text-from-scratch
|
||||||
|
/tokenize
|
||||||
/vdot
|
/vdot
|
||||||
/common/build-info.cpp
|
/common/build-info.cpp
|
||||||
arm_neon.h
|
arm_neon.h
|
||||||
|
@ -86,15 +88,17 @@ poetry.lock
|
||||||
poetry.toml
|
poetry.toml
|
||||||
|
|
||||||
# Test binaries
|
# Test binaries
|
||||||
tests/test-grammar-parser
|
/tests/test-grammar-parser
|
||||||
tests/test-llama-grammar
|
/tests/test-llama-grammar
|
||||||
tests/test-double-float
|
/tests/test-double-float
|
||||||
tests/test-grad0
|
/tests/test-grad0
|
||||||
tests/test-opt
|
/tests/test-opt
|
||||||
tests/test-quantize-fns
|
/tests/test-quantize-fns
|
||||||
tests/test-quantize-perf
|
/tests/test-quantize-perf
|
||||||
tests/test-sampling
|
/tests/test-sampling
|
||||||
tests/test-tokenizer-0-llama
|
/tests/test-tokenizer-0-llama
|
||||||
tests/test-tokenizer-0-falcon
|
/tests/test-tokenizer-0-falcon
|
||||||
tests/test-tokenizer-1-llama
|
/tests/test-tokenizer-1-llama
|
||||||
tests/test-tokenizer-1-bpe
|
/tests/test-tokenizer-1-bpe
|
||||||
|
/tests/test-rope
|
||||||
|
/tests/test-backend-ops
|
||||||
|
|
|
@ -43,6 +43,7 @@ else()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# general
|
# general
|
||||||
|
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
|
||||||
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
||||||
option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
|
option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
|
||||||
option(LLAMA_LTO "llama: enable link time optimization" OFF)
|
option(LLAMA_LTO "llama: enable link time optimization" OFF)
|
||||||
|
@ -97,9 +98,12 @@ option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging"
|
||||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||||
|
|
||||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||||
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
|
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
|
||||||
|
|
||||||
|
# Required for relocatable CMake package
|
||||||
|
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Compile flags
|
# Compile flags
|
||||||
|
@ -113,6 +117,11 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||||
find_package(Threads REQUIRED)
|
find_package(Threads REQUIRED)
|
||||||
include(CheckCXXCompilerFlag)
|
include(CheckCXXCompilerFlag)
|
||||||
|
|
||||||
|
# enable libstdc++ assertions for debug builds
|
||||||
|
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||||
|
add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
|
||||||
|
endif()
|
||||||
|
|
||||||
if (NOT MSVC)
|
if (NOT MSVC)
|
||||||
if (LLAMA_SANITIZE_THREAD)
|
if (LLAMA_SANITIZE_THREAD)
|
||||||
add_compile_options(-fsanitize=thread)
|
add_compile_options(-fsanitize=thread)
|
||||||
|
@ -162,7 +171,7 @@ if (LLAMA_METAL)
|
||||||
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
|
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
|
||||||
|
|
||||||
# copy ggml-metal.metal to bin directory
|
# copy ggml-metal.metal to bin directory
|
||||||
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
|
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
||||||
|
|
||||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
|
||||||
${FOUNDATION_LIBRARY}
|
${FOUNDATION_LIBRARY}
|
||||||
|
@ -670,11 +679,11 @@ add_library(ggml OBJECT
|
||||||
ggml-backend.h
|
ggml-backend.h
|
||||||
ggml-quants.c
|
ggml-quants.c
|
||||||
ggml-quants.h
|
ggml-quants.h
|
||||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||||
)
|
)
|
||||||
|
|
||||||
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
||||||
|
|
41
Makefile
41
Makefile
|
@ -2,13 +2,14 @@
|
||||||
BUILD_TARGETS = \
|
BUILD_TARGETS = \
|
||||||
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||||
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
||||||
speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
|
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead tests/test-c.o
|
||||||
|
|
||||||
# Binaries only useful for tests
|
# Binaries only useful for tests
|
||||||
TEST_TARGETS = \
|
TEST_TARGETS = \
|
||||||
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
|
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
|
||||||
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
||||||
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
|
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
|
||||||
|
tests/test-backend-ops
|
||||||
|
|
||||||
# Code coverage output files
|
# Code coverage output files
|
||||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||||
|
@ -30,7 +31,7 @@ ifeq '' '$(findstring clang,$(shell $(CC) --version))'
|
||||||
CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
|
CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
|
||||||
else
|
else
|
||||||
CC_IS_CLANG=1
|
CC_IS_CLANG=1
|
||||||
ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))'
|
ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
|
||||||
CC_IS_LLVM_CLANG=1
|
CC_IS_LLVM_CLANG=1
|
||||||
else
|
else
|
||||||
CC_IS_APPLE_CLANG=1
|
CC_IS_APPLE_CLANG=1
|
||||||
|
@ -174,6 +175,10 @@ ifdef LLAMA_DEBUG
|
||||||
MK_CFLAGS += -O0 -g
|
MK_CFLAGS += -O0 -g
|
||||||
MK_CXXFLAGS += -O0 -g
|
MK_CXXFLAGS += -O0 -g
|
||||||
MK_LDFLAGS += -g
|
MK_LDFLAGS += -g
|
||||||
|
|
||||||
|
ifeq ($(UNAME_S),Linux)
|
||||||
|
MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
MK_CPPFLAGS += -DNDEBUG
|
MK_CPPFLAGS += -DNDEBUG
|
||||||
endif
|
endif
|
||||||
|
@ -609,6 +614,9 @@ infill: examples/infill/infill.cpp ggml.o llama.o $(C
|
||||||
simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
@ -660,7 +668,7 @@ beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS)
|
||||||
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||||
|
@ -669,6 +677,9 @@ speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS)
|
||||||
parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
ifdef LLAMA_METAL
|
ifdef LLAMA_METAL
|
||||||
metal: examples/metal/metal.cpp ggml.o $(OBJS)
|
metal: examples/metal/metal.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
@ -710,28 +721,28 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||||
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
|
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
@ -746,5 +757,11 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
|
||||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-c.o: tests/test-c.c llama.h
|
tests/test-c.o: tests/test-c.c llama.h
|
||||||
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
|
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
|
||||||
|
|
||||||
|
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
|
@ -2,33 +2,14 @@
|
||||||
|
|
||||||
import PackageDescription
|
import PackageDescription
|
||||||
|
|
||||||
#if arch(arm) || arch(arm64)
|
|
||||||
let platforms: [SupportedPlatform]? = [
|
|
||||||
.macOS(.v12),
|
|
||||||
.iOS(.v14),
|
|
||||||
.watchOS(.v4),
|
|
||||||
.tvOS(.v14)
|
|
||||||
]
|
|
||||||
let exclude: [String] = []
|
|
||||||
let resources: [Resource] = [
|
|
||||||
.process("ggml-metal.metal")
|
|
||||||
]
|
|
||||||
let additionalSources: [String] = ["ggml-metal.m"]
|
|
||||||
let additionalSettings: [CSetting] = [
|
|
||||||
.unsafeFlags(["-fno-objc-arc"]),
|
|
||||||
.define("GGML_USE_METAL")
|
|
||||||
]
|
|
||||||
#else
|
|
||||||
let platforms: [SupportedPlatform]? = nil
|
|
||||||
let exclude: [String] = ["ggml-metal.metal"]
|
|
||||||
let resources: [Resource] = []
|
|
||||||
let additionalSources: [String] = []
|
|
||||||
let additionalSettings: [CSetting] = []
|
|
||||||
#endif
|
|
||||||
|
|
||||||
let package = Package(
|
let package = Package(
|
||||||
name: "llama",
|
name: "llama",
|
||||||
platforms: platforms,
|
platforms: [
|
||||||
|
.macOS(.v12),
|
||||||
|
.iOS(.v14),
|
||||||
|
.watchOS(.v4),
|
||||||
|
.tvOS(.v14)
|
||||||
|
],
|
||||||
products: [
|
products: [
|
||||||
.library(name: "llama", targets: ["llama"]),
|
.library(name: "llama", targets: ["llama"]),
|
||||||
],
|
],
|
||||||
|
@ -36,25 +17,30 @@ let package = Package(
|
||||||
.target(
|
.target(
|
||||||
name: "llama",
|
name: "llama",
|
||||||
path: ".",
|
path: ".",
|
||||||
exclude: exclude,
|
exclude: [],
|
||||||
sources: [
|
sources: [
|
||||||
"ggml.c",
|
"ggml.c",
|
||||||
"llama.cpp",
|
"llama.cpp",
|
||||||
"ggml-alloc.c",
|
"ggml-alloc.c",
|
||||||
"ggml-backend.c",
|
"ggml-backend.c",
|
||||||
"ggml-quants.c",
|
"ggml-quants.c",
|
||||||
] + additionalSources,
|
"ggml-metal.m",
|
||||||
resources: resources,
|
],
|
||||||
|
resources: [
|
||||||
|
.process("ggml-metal.metal")
|
||||||
|
],
|
||||||
publicHeadersPath: "spm-headers",
|
publicHeadersPath: "spm-headers",
|
||||||
cSettings: [
|
cSettings: [
|
||||||
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
|
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
|
||||||
.define("GGML_USE_ACCELERATE")
|
.define("GGML_USE_ACCELERATE"),
|
||||||
|
.unsafeFlags(["-fno-objc-arc"]),
|
||||||
|
.define("GGML_USE_METAL"),
|
||||||
// NOTE: NEW_LAPACK will required iOS version 16.4+
|
// NOTE: NEW_LAPACK will required iOS version 16.4+
|
||||||
// We should consider add this in the future when we drop support for iOS 14
|
// We should consider add this in the future when we drop support for iOS 14
|
||||||
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
|
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
|
||||||
// .define("ACCELERATE_NEW_LAPACK"),
|
// .define("ACCELERATE_NEW_LAPACK"),
|
||||||
// .define("ACCELERATE_LAPACK_ILP64")
|
// .define("ACCELERATE_LAPACK_ILP64")
|
||||||
] + additionalSettings,
|
],
|
||||||
linkerSettings: [
|
linkerSettings: [
|
||||||
.linkedFramework("Accelerate")
|
.linkedFramework("Accelerate")
|
||||||
]
|
]
|
||||||
|
|
24
README.md
24
README.md
|
@ -10,7 +10,10 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||||
|
|
||||||
### Hot topics
|
### Hot topics
|
||||||
|
|
||||||
- *No hot topics atm. Open to suggestions about what is hot today*
|
- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
|
||||||
|
- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
|
||||||
|
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
|
||||||
|
- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
|
||||||
|
|
||||||
----
|
----
|
||||||
|
|
||||||
|
@ -114,6 +117,8 @@ as the main playground for developing new features for the [ggml](https://github
|
||||||
- [nat/openplayground](https://github.com/nat/openplayground)
|
- [nat/openplayground](https://github.com/nat/openplayground)
|
||||||
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
|
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
|
||||||
- [withcatai/catai](https://github.com/withcatai/catai)
|
- [withcatai/catai](https://github.com/withcatai/catai)
|
||||||
|
- [semperai/amica](https://github.com/semperai/amica)
|
||||||
|
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
@ -320,7 +325,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
|
||||||
|
|
||||||
### BLAS Build
|
### BLAS Build
|
||||||
|
|
||||||
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
|
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
|
||||||
|
|
||||||
- #### Accelerate Framework:
|
- #### Accelerate Framework:
|
||||||
|
|
||||||
|
@ -410,19 +415,28 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
This provides BLAS acceleration on HIP-supported AMD GPUs.
|
This provides BLAS acceleration on HIP-supported AMD GPUs.
|
||||||
Make sure to have ROCm installed.
|
Make sure to have ROCm installed.
|
||||||
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
|
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
|
||||||
Windows support is coming soon...
|
|
||||||
|
|
||||||
- Using `make`:
|
- Using `make`:
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_HIPBLAS=1
|
make LLAMA_HIPBLAS=1
|
||||||
```
|
```
|
||||||
- Using `CMake`:
|
- Using `CMake` for Linux:
|
||||||
```bash
|
```bash
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
|
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
|
||||||
cmake --build .
|
cmake --build .
|
||||||
```
|
```
|
||||||
|
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS):
|
||||||
|
```bash
|
||||||
|
set PATH=%HIP_PATH%\bin;%PATH%
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
|
||||||
|
cmake --build .
|
||||||
|
```
|
||||||
|
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
|
||||||
|
|
||||||
|
|
||||||
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
|
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
|
||||||
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
|
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
|
||||||
|
@ -883,7 +897,7 @@ Additionally, there the following images, similar to the above:
|
||||||
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
|
|
||||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
|
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
|
||||||
|
|
||||||
#### Usage
|
#### Usage
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
||||||
if(NOT IS_DIRECTORY "${GIT_DIR}")
|
if(NOT IS_DIRECTORY "${GIT_DIR}")
|
||||||
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
|
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
|
||||||
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
|
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
|
||||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
|
string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
|
||||||
|
if (SLASH_POS EQUAL 0)
|
||||||
|
set(GIT_DIR "${REAL_GIT_DIR}")
|
||||||
|
else()
|
||||||
|
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(GIT_INDEX "${GIT_DIR}/index")
|
set(GIT_INDEX "${GIT_DIR}/index")
|
||||||
|
@ -26,7 +31,7 @@ add_custom_command(
|
||||||
COMMENT "Generating build details from Git"
|
COMMENT "Generating build details from Git"
|
||||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
||||||
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
||||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
|
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
|
||||||
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
||||||
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
||||||
VERBATIM
|
VERBATIM
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
#include <regex>
|
#include <regex>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
|
@ -277,8 +278,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.yarn_beta_slow = std::stof(argv[i]);
|
params.yarn_beta_slow = std::stof(argv[i]);
|
||||||
} else if (arg == "--memory-f32") {
|
} else if (arg == "--samplers") {
|
||||||
params.memory_f16 = false;
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
sparams.samplers_sequence = parse_samplers_input(argv[i]);
|
||||||
|
} else if (arg == "--sampling-seq") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
sparams.samplers_sequence = argv[i];
|
||||||
} else if (arg == "--top-p") {
|
} else if (arg == "--top-p") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -491,8 +502,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
params.interactive_first = true;
|
params.interactive_first = true;
|
||||||
} else if (arg == "-ins" || arg == "--instruct") {
|
} else if (arg == "-ins" || arg == "--instruct") {
|
||||||
params.instruct = true;
|
params.instruct = true;
|
||||||
|
} else if (arg == "-cml" || arg == "--chatml") {
|
||||||
|
params.chatml = true;
|
||||||
} else if (arg == "--infill") {
|
} else if (arg == "--infill") {
|
||||||
params.infill = true;
|
params.infill = true;
|
||||||
|
} else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
|
||||||
|
params.dump_kv_cache = true;
|
||||||
|
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
||||||
|
params.no_kv_offload = true;
|
||||||
|
} else if (arg == "-ctk" || arg == "--cache-type-k") {
|
||||||
|
params.cache_type_k = argv[++i];
|
||||||
|
} else if (arg == "-ctv" || arg == "--cache-type-v") {
|
||||||
|
params.cache_type_v = argv[++i];
|
||||||
} else if (arg == "--multiline-input") {
|
} else if (arg == "--multiline-input") {
|
||||||
params.multiline_input = true;
|
params.multiline_input = true;
|
||||||
} else if (arg == "--simple-io") {
|
} else if (arg == "--simple-io") {
|
||||||
|
@ -673,6 +694,47 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
std::istreambuf_iterator<char>(),
|
std::istreambuf_iterator<char>(),
|
||||||
std::back_inserter(sparams.grammar)
|
std::back_inserter(sparams.grammar)
|
||||||
);
|
);
|
||||||
|
} else if (arg == "--override-kv") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
char * sep = strchr(argv[i], '=');
|
||||||
|
if (sep == nullptr || sep - argv[i] >= 128) {
|
||||||
|
fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
struct llama_model_kv_override kvo;
|
||||||
|
std::strncpy(kvo.key, argv[i], sep - argv[i]);
|
||||||
|
kvo.key[sep - argv[i]] = 0;
|
||||||
|
sep++;
|
||||||
|
if (strncmp(sep, "int:", 4) == 0) {
|
||||||
|
sep += 4;
|
||||||
|
kvo.tag = LLAMA_KV_OVERRIDE_INT;
|
||||||
|
kvo.int_value = std::atol(sep);
|
||||||
|
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||||
|
sep += 6;
|
||||||
|
kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
|
||||||
|
kvo.float_value = std::atof(sep);
|
||||||
|
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||||
|
sep += 5;
|
||||||
|
kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
|
||||||
|
if (std::strcmp(sep, "true") == 0) {
|
||||||
|
kvo.bool_value = true;
|
||||||
|
} else if (std::strcmp(sep, "false") == 0) {
|
||||||
|
kvo.bool_value = false;
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.kv_overrides.push_back(kvo);
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
// Parse args for logging parameters
|
// Parse args for logging parameters
|
||||||
} else if ( log_param_single_parse( argv[i] ) ) {
|
} else if ( log_param_single_parse( argv[i] ) ) {
|
||||||
|
@ -716,6 +778,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!params.kv_overrides.empty()) {
|
||||||
|
params.kv_overrides.emplace_back(llama_model_kv_override());
|
||||||
|
params.kv_overrides.back().key[0] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -730,6 +797,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" -i, --interactive run in interactive mode\n");
|
printf(" -i, --interactive run in interactive mode\n");
|
||||||
printf(" --interactive-first run in interactive mode and wait for input right away\n");
|
printf(" --interactive-first run in interactive mode and wait for input right away\n");
|
||||||
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
||||||
|
printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
|
||||||
printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
|
printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
|
||||||
printf(" -r PROMPT, --reverse-prompt PROMPT\n");
|
printf(" -r PROMPT, --reverse-prompt PROMPT\n");
|
||||||
printf(" halt generation at PROMPT, return control in interactive mode\n");
|
printf(" halt generation at PROMPT, return control in interactive mode\n");
|
||||||
|
@ -755,6 +823,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
||||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
||||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||||
|
printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
|
||||||
|
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
|
||||||
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
||||||
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
||||||
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
|
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
|
||||||
|
@ -792,8 +862,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
|
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
|
||||||
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||||
printf(" --no-penalize-nl do not penalize newline token\n");
|
printf(" --no-penalize-nl do not penalize newline token\n");
|
||||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
|
||||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
|
||||||
printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
|
printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
|
||||||
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
||||||
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||||
|
@ -832,6 +900,14 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
#endif
|
#endif
|
||||||
printf(" --verbose-prompt print prompt before generation\n");
|
printf(" --verbose-prompt print prompt before generation\n");
|
||||||
|
printf(" -dkvc, --dump-kv-cache\n");
|
||||||
|
printf(" verbose print of the KV cache\n");
|
||||||
|
printf(" -nkvo, --no-kv-offload\n");
|
||||||
|
printf(" disable KV offload\n");
|
||||||
|
printf(" -ctk TYPE, --cache-type-k TYPE\n");
|
||||||
|
printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
|
||||||
|
printf(" -ctv TYPE, --cache-type-v TYPE\n");
|
||||||
|
printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
|
||||||
printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
||||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||||
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
|
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
|
||||||
|
@ -842,6 +918,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
|
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
|
||||||
printf(" -ld LOGDIR, --logdir LOGDIR\n");
|
printf(" -ld LOGDIR, --logdir LOGDIR\n");
|
||||||
printf(" path under which to save YAML logs (no logging if unset)\n");
|
printf(" path under which to save YAML logs (no logging if unset)\n");
|
||||||
|
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||||
|
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||||
|
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
log_print_usage();
|
log_print_usage();
|
||||||
|
@ -878,6 +957,48 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
|
||||||
GGML_UNREACHABLE();
|
GGML_UNREACHABLE();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// String parsing
|
||||||
|
//
|
||||||
|
|
||||||
|
std::string parse_samplers_input(std::string input) {
|
||||||
|
std::string output = "";
|
||||||
|
// since samplers names are written multiple ways
|
||||||
|
// make it ready for both system names and input names
|
||||||
|
std::unordered_map<std::string, char> samplers_symbols {
|
||||||
|
{"top_k", 'k'},
|
||||||
|
{"top-k", 'k'},
|
||||||
|
{"top_p", 'p'},
|
||||||
|
{"top-p", 'p'},
|
||||||
|
{"nucleus", 'p'},
|
||||||
|
{"typical_p", 'y'},
|
||||||
|
{"typical-p", 'y'},
|
||||||
|
{"typical", 'y'},
|
||||||
|
{"min_p", 'm'},
|
||||||
|
{"min-p", 'm'},
|
||||||
|
{"tfs_z", 'f'},
|
||||||
|
{"tfs-z", 'f'},
|
||||||
|
{"tfs", 'f'},
|
||||||
|
{"temp", 't'},
|
||||||
|
{"temperature",'t'}
|
||||||
|
};
|
||||||
|
// expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
|
||||||
|
size_t separator = input.find(';');
|
||||||
|
while (separator != input.npos) {
|
||||||
|
std::string name = input.substr(0,separator);
|
||||||
|
input = input.substr(separator+1);
|
||||||
|
separator = input.find(';');
|
||||||
|
|
||||||
|
if (samplers_symbols.find(name) != samplers_symbols.end()) {
|
||||||
|
output += samplers_symbols[name];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (samplers_symbols.find(input) != samplers_symbols.end()) {
|
||||||
|
output += samplers_symbols[input];
|
||||||
|
}
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// Model utils
|
// Model utils
|
||||||
//
|
//
|
||||||
|
@ -892,10 +1013,39 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
||||||
mparams.tensor_split = params.tensor_split;
|
mparams.tensor_split = params.tensor_split;
|
||||||
mparams.use_mmap = params.use_mmap;
|
mparams.use_mmap = params.use_mmap;
|
||||||
mparams.use_mlock = params.use_mlock;
|
mparams.use_mlock = params.use_mlock;
|
||||||
|
if (params.kv_overrides.empty()) {
|
||||||
|
mparams.kv_overrides = NULL;
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
|
||||||
|
mparams.kv_overrides = params.kv_overrides.data();
|
||||||
|
}
|
||||||
|
|
||||||
return mparams;
|
return mparams;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static ggml_type kv_cache_type_from_str(const std::string & s) {
|
||||||
|
if (s == "f16") {
|
||||||
|
return GGML_TYPE_F16;
|
||||||
|
}
|
||||||
|
if (s == "q8_0") {
|
||||||
|
return GGML_TYPE_Q8_0;
|
||||||
|
}
|
||||||
|
if (s == "q4_0") {
|
||||||
|
return GGML_TYPE_Q4_0;
|
||||||
|
}
|
||||||
|
if (s == "q4_1") {
|
||||||
|
return GGML_TYPE_Q4_1;
|
||||||
|
}
|
||||||
|
if (s == "q5_0") {
|
||||||
|
return GGML_TYPE_Q5_0;
|
||||||
|
}
|
||||||
|
if (s == "q5_1") {
|
||||||
|
return GGML_TYPE_Q5_1;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw std::runtime_error("Invalid cache type: " + s);
|
||||||
|
}
|
||||||
|
|
||||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
|
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
|
||||||
auto cparams = llama_context_default_params();
|
auto cparams = llama_context_default_params();
|
||||||
|
|
||||||
|
@ -905,7 +1055,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||||
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||||
cparams.mul_mat_q = params.mul_mat_q;
|
cparams.mul_mat_q = params.mul_mat_q;
|
||||||
cparams.seed = params.seed;
|
cparams.seed = params.seed;
|
||||||
cparams.f16_kv = params.memory_f16;
|
|
||||||
cparams.logits_all = params.logits_all;
|
cparams.logits_all = params.logits_all;
|
||||||
cparams.embedding = params.embedding;
|
cparams.embedding = params.embedding;
|
||||||
cparams.rope_scaling_type = params.rope_scaling_type;
|
cparams.rope_scaling_type = params.rope_scaling_type;
|
||||||
|
@ -916,6 +1065,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||||
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
||||||
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
||||||
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
||||||
|
cparams.offload_kqv = !params.no_kv_offload;
|
||||||
|
|
||||||
|
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
||||||
|
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
|
||||||
|
|
||||||
return cparams;
|
return cparams;
|
||||||
}
|
}
|
||||||
|
@ -931,7 +1084,7 @@ void llama_batch_add(
|
||||||
const std::vector<llama_seq_id> & seq_ids,
|
const std::vector<llama_seq_id> & seq_ids,
|
||||||
bool logits) {
|
bool logits) {
|
||||||
batch.token [batch.n_tokens] = id;
|
batch.token [batch.n_tokens] = id;
|
||||||
batch.pos [batch.n_tokens] = pos,
|
batch.pos [batch.n_tokens] = pos;
|
||||||
batch.n_seq_id[batch.n_tokens] = seq_ids.size();
|
batch.n_seq_id[batch.n_tokens] = seq_ids.size();
|
||||||
for (size_t i = 0; i < seq_ids.size(); ++i) {
|
for (size_t i = 0; i < seq_ids.size(); ++i) {
|
||||||
batch.seq_id[batch.n_tokens][i] = seq_ids[i];
|
batch.seq_id[batch.n_tokens][i] = seq_ids[i];
|
||||||
|
@ -1328,7 +1481,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
}
|
}
|
||||||
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
||||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
||||||
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
|
||||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
||||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
||||||
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
||||||
|
@ -1383,3 +1535,77 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
|
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
|
||||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// KV cache utils
|
||||||
|
//
|
||||||
|
|
||||||
|
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
|
||||||
|
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
|
||||||
|
|
||||||
|
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
|
||||||
|
view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
|
||||||
|
|
||||||
|
llama_kv_cache_view_cell * c_curr = view.cells;
|
||||||
|
llama_seq_id * cs_curr = view.cells_sequences;
|
||||||
|
|
||||||
|
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
|
||||||
|
if (i % row_size == 0) {
|
||||||
|
printf("\n%5d: ", i);
|
||||||
|
}
|
||||||
|
int seq_count = 0;
|
||||||
|
for (int j = 0; j < view.n_max_seq; j++) {
|
||||||
|
if (cs_curr[j] >= 0) { seq_count++; }
|
||||||
|
}
|
||||||
|
putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n=== Done dumping\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
|
||||||
|
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
|
||||||
|
|
||||||
|
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
|
||||||
|
view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
|
||||||
|
|
||||||
|
std::unordered_map<llama_seq_id, size_t> seqs;
|
||||||
|
llama_kv_cache_view_cell * c_curr = view.cells;
|
||||||
|
llama_seq_id * cs_curr = view.cells_sequences;
|
||||||
|
|
||||||
|
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
|
||||||
|
for (int j = 0; j < view.n_max_seq; j++) {
|
||||||
|
if (cs_curr[j] < 0) { continue; }
|
||||||
|
if (seqs.find(cs_curr[j]) == seqs.end()) {
|
||||||
|
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
|
||||||
|
seqs[cs_curr[j]] = seqs.size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("=== Sequence legend: ");
|
||||||
|
for (const auto & it : seqs) {
|
||||||
|
printf("%zu=%d, ", it.second, it.first);
|
||||||
|
}
|
||||||
|
printf("'+'=other sequence ids");
|
||||||
|
|
||||||
|
c_curr = view.cells;
|
||||||
|
cs_curr = view.cells_sequences;
|
||||||
|
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
|
||||||
|
if (i % row_size == 0) {
|
||||||
|
printf("\n%5d: ", i);
|
||||||
|
}
|
||||||
|
for (int j = 0; j < view.n_max_seq; j++) {
|
||||||
|
if (cs_curr[j] >= 0) {
|
||||||
|
const auto & it = seqs.find(cs_curr[j]);
|
||||||
|
putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
|
||||||
|
} else {
|
||||||
|
putchar('.');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
putchar(' ');
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n=== Done dumping\n");
|
||||||
|
}
|
||||||
|
|
|
@ -86,6 +86,8 @@ struct gpt_params {
|
||||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||||
std::string logdir = ""; // directory in which to save YAML log files
|
std::string logdir = ""; // directory in which to save YAML log files
|
||||||
|
|
||||||
|
std::vector<llama_model_kv_override> kv_overrides;
|
||||||
|
|
||||||
// TODO: avoid tuple, use struct
|
// TODO: avoid tuple, use struct
|
||||||
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
||||||
std::string lora_base = ""; // base model path for the lora adapter
|
std::string lora_base = ""; // base model path for the lora adapter
|
||||||
|
@ -98,10 +100,10 @@ struct gpt_params {
|
||||||
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
||||||
|
|
||||||
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
||||||
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
|
||||||
bool random_prompt = false; // do not randomize prompt if none provided
|
bool random_prompt = false; // do not randomize prompt if none provided
|
||||||
bool use_color = false; // use color to distinguish generations and inputs
|
bool use_color = false; // use color to distinguish generations and inputs
|
||||||
bool interactive = false; // interactive mode
|
bool interactive = false; // interactive mode
|
||||||
|
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
|
||||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||||
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
||||||
|
|
||||||
|
@ -121,10 +123,15 @@ struct gpt_params {
|
||||||
bool numa = false; // attempt optimizations that help on some NUMA systems
|
bool numa = false; // attempt optimizations that help on some NUMA systems
|
||||||
bool verbose_prompt = false; // print prompt tokens before generation
|
bool verbose_prompt = false; // print prompt tokens before generation
|
||||||
bool infill = false; // use infill mode
|
bool infill = false; // use infill mode
|
||||||
|
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||||
|
bool no_kv_offload = false; // disable KV offloading
|
||||||
|
|
||||||
|
std::string cache_type_k = "f16"; // KV cache data type for the K
|
||||||
|
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||||
|
|
||||||
// multimodal models (see examples/llava)
|
// multimodal models (see examples/llava)
|
||||||
std::string mmproj = ""; // path to multimodal projector
|
std::string mmproj = ""; // path to multimodal projector
|
||||||
std::string image = ""; // path to an image file
|
std::string image = ""; // path to an image file
|
||||||
};
|
};
|
||||||
|
|
||||||
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
|
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
|
||||||
|
@ -139,6 +146,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);
|
||||||
|
|
||||||
void process_escapes(std::string& input);
|
void process_escapes(std::string& input);
|
||||||
|
|
||||||
|
//
|
||||||
|
// String parsing
|
||||||
|
//
|
||||||
|
|
||||||
|
std::string parse_samplers_input(std::string input);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Model utils
|
// Model utils
|
||||||
//
|
//
|
||||||
|
@ -217,3 +230,13 @@ std::string get_sortable_timestamp();
|
||||||
void dump_non_result_info_yaml(
|
void dump_non_result_info_yaml(
|
||||||
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
||||||
|
|
||||||
|
//
|
||||||
|
// KV cache utils
|
||||||
|
//
|
||||||
|
|
||||||
|
// Dump the KV cache view with the number of sequences per cell.
|
||||||
|
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
|
||||||
|
|
||||||
|
// Dump the KV cache view showing individual sequences in each cell (long output).
|
||||||
|
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
||||||
|
|
|
@ -190,7 +190,7 @@ namespace grammar_parser {
|
||||||
pos = parse_space(pos + 1, is_nested);
|
pos = parse_space(pos + 1, is_nested);
|
||||||
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
|
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
|
||||||
if (last_sym_start == out_elements.size()) {
|
if (last_sym_start == out_elements.size()) {
|
||||||
throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
|
throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
// apply transformation to previous symbol (last_sym_start to end) according to
|
// apply transformation to previous symbol (last_sym_start to end) according to
|
||||||
|
|
|
@ -99,6 +99,56 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
|
||||||
return std::string(result);
|
return std::string(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||||
|
std::string result = "CFG -> Penalties ";
|
||||||
|
if (params.mirostat == 0) {
|
||||||
|
for (auto s : params.samplers_sequence) {
|
||||||
|
switch (s) {
|
||||||
|
case 'k': result += "-> top_k "; break;
|
||||||
|
case 'f': result += "-> tfs_z "; break;
|
||||||
|
case 'y': result += "-> typical_p "; break;
|
||||||
|
case 'p': result += "-> top_p "; break;
|
||||||
|
case 'm': result += "-> min_p "; break;
|
||||||
|
case 't': result += "-> temp "; break;
|
||||||
|
default : break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
result += "-> mirostat ";
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// no reasons to expose this function in header
|
||||||
|
static void sampler_queue(
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
const llama_sampling_params & params,
|
||||||
|
llama_token_data_array & cur_p,
|
||||||
|
size_t & min_keep) {
|
||||||
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||||
|
|
||||||
|
const float temp = params.temp;
|
||||||
|
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||||
|
const float top_p = params.top_p;
|
||||||
|
const float min_p = params.min_p;
|
||||||
|
const float tfs_z = params.tfs_z;
|
||||||
|
const float typical_p = params.typical_p;
|
||||||
|
const std::string & samplers_sequence = params.samplers_sequence;
|
||||||
|
|
||||||
|
for (auto s : samplers_sequence) {
|
||||||
|
switch (s){
|
||||||
|
case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
|
||||||
|
case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
|
||||||
|
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||||
|
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||||
|
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||||
|
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break;
|
||||||
|
default : break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
llama_token llama_sampling_sample(
|
llama_token llama_sampling_sample(
|
||||||
struct llama_sampling_context * ctx_sampling,
|
struct llama_sampling_context * ctx_sampling,
|
||||||
struct llama_context * ctx_main,
|
struct llama_context * ctx_main,
|
||||||
|
@ -109,11 +159,6 @@ llama_token llama_sampling_sample(
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||||
|
|
||||||
const float temp = params.temp;
|
const float temp = params.temp;
|
||||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
|
||||||
const float top_p = params.top_p;
|
|
||||||
const float min_p = params.min_p;
|
|
||||||
const float tfs_z = params.tfs_z;
|
|
||||||
const float typical_p = params.typical_p;
|
|
||||||
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
|
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
|
||||||
const float penalty_repeat = params.penalty_repeat;
|
const float penalty_repeat = params.penalty_repeat;
|
||||||
const float penalty_freq = params.penalty_freq;
|
const float penalty_freq = params.penalty_freq;
|
||||||
|
@ -188,12 +233,7 @@ llama_token llama_sampling_sample(
|
||||||
// temperature sampling
|
// temperature sampling
|
||||||
size_t min_keep = std::max(1, params.n_probs);
|
size_t min_keep = std::max(1, params.n_probs);
|
||||||
|
|
||||||
llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep);
|
sampler_queue(ctx_main, params, cur_p, min_keep);
|
||||||
llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
|
|
||||||
llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
|
|
||||||
llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep);
|
|
||||||
llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep);
|
|
||||||
llama_sample_temp (ctx_main, &cur_p, temp);
|
|
||||||
|
|
||||||
id = llama_sample_token(ctx_main, &cur_p);
|
id = llama_sample_token(ctx_main, &cur_p);
|
||||||
|
|
||||||
|
|
|
@ -10,22 +10,23 @@
|
||||||
|
|
||||||
// sampling parameters
|
// sampling parameters
|
||||||
typedef struct llama_sampling_params {
|
typedef struct llama_sampling_params {
|
||||||
int32_t n_prev = 64; // number of previous tokens to remember
|
int32_t n_prev = 64; // number of previous tokens to remember
|
||||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||||
int32_t top_k = 40; // <= 0 to use vocab size
|
int32_t top_k = 40; // <= 0 to use vocab size
|
||||||
float top_p = 0.95f; // 1.0 = disabled
|
float top_p = 0.95f; // 1.0 = disabled
|
||||||
float min_p = 0.05f; // 0.0 = disabled
|
float min_p = 0.05f; // 0.0 = disabled
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
float typical_p = 1.00f; // 1.0 = disabled
|
float typical_p = 1.00f; // 1.0 = disabled
|
||||||
float temp = 0.80f; // 1.0 = disabled
|
float temp = 0.80f; // 1.0 = disabled
|
||||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||||
float penalty_repeat = 1.10f; // 1.0 = disabled
|
float penalty_repeat = 1.10f; // 1.0 = disabled
|
||||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||||
float penalty_present = 0.00f; // 0.0 = disabled
|
float penalty_present = 0.00f; // 0.0 = disabled
|
||||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||||
float mirostat_tau = 5.00f; // target entropy
|
float mirostat_tau = 5.00f; // target entropy
|
||||||
float mirostat_eta = 0.10f; // learning rate
|
float mirostat_eta = 0.10f; // learning rate
|
||||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||||
|
std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
|
||||||
|
|
||||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||||
|
|
||||||
|
@ -80,6 +81,9 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
|
||||||
// Print sampling parameters into a string
|
// Print sampling parameters into a string
|
||||||
std::string llama_sampling_print(const llama_sampling_params & params);
|
std::string llama_sampling_print(const llama_sampling_params & params);
|
||||||
|
|
||||||
|
// Print sampling order into a string
|
||||||
|
std::string llama_sampling_order_print(const llama_sampling_params & params);
|
||||||
|
|
||||||
// this is a common sampling function used across the examples for convenience
|
// this is a common sampling function used across the examples for convenience
|
||||||
// it can serve as a starting point for implementing your own sampling function
|
// it can serve as a starting point for implementing your own sampling function
|
||||||
// Note: When using multiple sequences, it is the caller's responsibility to call
|
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||||
|
|
|
@ -1,315 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
# HF baichuan --> gguf conversion
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import TYPE_CHECKING, Any
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
|
||||||
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
|
||||||
import gguf
|
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from typing import TypeAlias
|
|
||||||
|
|
||||||
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
|
||||||
|
|
||||||
# reverse HF permute back to original pth layout
|
|
||||||
|
|
||||||
|
|
||||||
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
|
|
||||||
if n_kv_head is not None and n_head != n_kv_head:
|
|
||||||
n_head //= n_kv_head
|
|
||||||
|
|
||||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
|
||||||
.swapaxes(1, 2)
|
|
||||||
.reshape(weights.shape))
|
|
||||||
|
|
||||||
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
|
|
||||||
r = weights.shape[0] // 3
|
|
||||||
return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
|
|
||||||
|
|
||||||
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
|
|
||||||
r = weights.shape[0] // 3
|
|
||||||
return weights[r * n_part : r * n_part + r, ...]
|
|
||||||
|
|
||||||
def count_model_parts(dir_model: str) -> int:
|
|
||||||
num_parts = 0
|
|
||||||
|
|
||||||
for filename in os.listdir(dir_model):
|
|
||||||
if filename.startswith("pytorch_model-"):
|
|
||||||
num_parts += 1
|
|
||||||
|
|
||||||
if num_parts > 0:
|
|
||||||
print("gguf: found " + str(num_parts) + " model parts")
|
|
||||||
|
|
||||||
return num_parts
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
|
||||||
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
|
|
||||||
parser.add_argument(
|
|
||||||
"--vocab-only", action="store_true",
|
|
||||||
help="extract only the vocab",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--outfile", type=Path,
|
|
||||||
help="path to write to; default: based on input",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"model", type=Path,
|
|
||||||
help="directory containing model file, or model file itself (*.bin)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
|
||||||
help="output format - use 0 for float32, 1 for float16",
|
|
||||||
)
|
|
||||||
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
args = parse_args()
|
|
||||||
|
|
||||||
dir_model = args.model
|
|
||||||
ftype = args.ftype
|
|
||||||
if not dir_model.is_dir():
|
|
||||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
endianess = gguf.GGUFEndian.LITTLE
|
|
||||||
if args.bigendian:
|
|
||||||
endianess = gguf.GGUFEndian.BIG
|
|
||||||
endianess_str = "Big Endian" if args.bigendian else "Little Endian"
|
|
||||||
print(f"gguf: Conversion Endianess {endianess}")
|
|
||||||
# possible tensor data types
|
|
||||||
# ftype == 0 -> float32
|
|
||||||
# ftype == 1 -> float16
|
|
||||||
|
|
||||||
# map from ftype to string
|
|
||||||
ftype_str = ["f32", "f16"]
|
|
||||||
|
|
||||||
if args.outfile is not None:
|
|
||||||
fname_out = args.outfile
|
|
||||||
else:
|
|
||||||
# output in the same directory as the model by default
|
|
||||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
|
||||||
|
|
||||||
print("gguf: loading model "+dir_model.name)
|
|
||||||
|
|
||||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
|
||||||
hparams = json.load(f)
|
|
||||||
print("hello print: ",hparams["architectures"][0])
|
|
||||||
if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
|
|
||||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
|
||||||
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
# get number of model parts
|
|
||||||
num_parts = count_model_parts(dir_model)
|
|
||||||
print(f"num_parts:{num_parts}\n")
|
|
||||||
ARCH=gguf.MODEL_ARCH.BAICHUAN
|
|
||||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
|
||||||
|
|
||||||
print("gguf: get model metadata")
|
|
||||||
|
|
||||||
block_count = hparams["num_hidden_layers"]
|
|
||||||
head_count = hparams["num_attention_heads"]
|
|
||||||
|
|
||||||
if "num_key_value_heads" in hparams:
|
|
||||||
head_count_kv = hparams["num_key_value_heads"]
|
|
||||||
else:
|
|
||||||
head_count_kv = head_count
|
|
||||||
|
|
||||||
if "_name_or_path" in hparams:
|
|
||||||
hf_repo = hparams["_name_or_path"]
|
|
||||||
else:
|
|
||||||
hf_repo = ""
|
|
||||||
|
|
||||||
if "max_sequence_length" in hparams:
|
|
||||||
ctx_length = hparams["max_sequence_length"]
|
|
||||||
elif "max_position_embeddings" in hparams:
|
|
||||||
ctx_length = hparams["max_position_embeddings"]
|
|
||||||
elif "model_max_length" in hparams:
|
|
||||||
ctx_length = hparams["model_max_length"]
|
|
||||||
else:
|
|
||||||
print("gguf: can not find ctx length parameter.")
|
|
||||||
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
|
|
||||||
gguf_writer.add_name(dir_model.name)
|
|
||||||
gguf_writer.add_source_hf_repo(hf_repo)
|
|
||||||
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
|
||||||
gguf_writer.add_context_length(ctx_length)
|
|
||||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
|
||||||
gguf_writer.add_block_count(block_count)
|
|
||||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
|
||||||
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
|
||||||
gguf_writer.add_head_count(head_count)
|
|
||||||
gguf_writer.add_head_count_kv(head_count_kv)
|
|
||||||
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
|
||||||
|
|
||||||
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
|
|
||||||
if "type" in hparams["rope_scaling"]:
|
|
||||||
if hparams["rope_scaling"]["type"] == "linear":
|
|
||||||
gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
|
||||||
gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
|
|
||||||
|
|
||||||
|
|
||||||
# TOKENIZATION
|
|
||||||
|
|
||||||
print("gguf: get tokenizer metadata")
|
|
||||||
|
|
||||||
tokens: list[bytes] = []
|
|
||||||
scores: list[float] = []
|
|
||||||
toktypes: list[int] = []
|
|
||||||
|
|
||||||
tokenizer_model_file = dir_model / 'tokenizer.model'
|
|
||||||
if not tokenizer_model_file.is_file():
|
|
||||||
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# vocab type sentencepiece
|
|
||||||
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
|
||||||
|
|
||||||
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
|
|
||||||
vocab_size = hparams.get('vocab_size')
|
|
||||||
if vocab_size is None:
|
|
||||||
vocab_size = tokenizer.vocab_size()
|
|
||||||
|
|
||||||
for i in range(vocab_size):
|
|
||||||
text: bytes
|
|
||||||
score: float
|
|
||||||
|
|
||||||
piece = tokenizer.id_to_piece(i)
|
|
||||||
text = piece.encode("utf-8")
|
|
||||||
score = tokenizer.get_score(i)
|
|
||||||
|
|
||||||
toktype = 1 # defualt to normal token type
|
|
||||||
if tokenizer.is_unknown(i):
|
|
||||||
toktype = 2
|
|
||||||
if tokenizer.is_control(i):
|
|
||||||
toktype = 3
|
|
||||||
|
|
||||||
# toktype = 4 is user-defined = tokens from added_tokens.json
|
|
||||||
|
|
||||||
if tokenizer.is_unused(i):
|
|
||||||
toktype = 5
|
|
||||||
if tokenizer.is_byte(i):
|
|
||||||
toktype = 6
|
|
||||||
|
|
||||||
tokens.append(text)
|
|
||||||
scores.append(score)
|
|
||||||
toktypes.append(toktype)
|
|
||||||
|
|
||||||
added_tokens_file = dir_model / 'added_tokens.json'
|
|
||||||
if added_tokens_file.is_file():
|
|
||||||
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
|
||||||
addtokens_json = json.load(f)
|
|
||||||
|
|
||||||
print("gguf: get added tokens")
|
|
||||||
|
|
||||||
for key in addtokens_json:
|
|
||||||
tokens.append( key.encode("utf-8") )
|
|
||||||
scores.append(-1000.0)
|
|
||||||
toktypes.append(4) # user-defined token type
|
|
||||||
|
|
||||||
|
|
||||||
gguf_writer.add_tokenizer_model("llama")
|
|
||||||
gguf_writer.add_token_list(tokens)
|
|
||||||
gguf_writer.add_token_scores(scores)
|
|
||||||
gguf_writer.add_token_types(toktypes)
|
|
||||||
|
|
||||||
special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens))
|
|
||||||
special_vocab.add_to_gguf(gguf_writer)
|
|
||||||
|
|
||||||
# TENSORS
|
|
||||||
|
|
||||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
|
||||||
|
|
||||||
# tensor info
|
|
||||||
print("gguf: get tensor metadata")
|
|
||||||
|
|
||||||
if num_parts == 0:
|
|
||||||
part_names = iter(("pytorch_model.bin",))
|
|
||||||
else:
|
|
||||||
part_names = (
|
|
||||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
for part_name in part_names:
|
|
||||||
if args.vocab_only:
|
|
||||||
break
|
|
||||||
print("gguf: loading model part '" + part_name + "'")
|
|
||||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
|
||||||
|
|
||||||
tmp=model_part
|
|
||||||
for i in range(block_count):
|
|
||||||
if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
|
|
||||||
print(f"Unpacking and permuting layer {i}")
|
|
||||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
|
|
||||||
tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
|
|
||||||
tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
|
|
||||||
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
|
|
||||||
|
|
||||||
for name in model_part.keys():
|
|
||||||
data = model_part[name]
|
|
||||||
# we don't need these
|
|
||||||
if name.endswith(".rotary_emb.inv_freq"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
old_dtype = data.dtype
|
|
||||||
|
|
||||||
# convert any unsupported data types to float32
|
|
||||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
|
||||||
data = data.to(torch.float32)
|
|
||||||
|
|
||||||
data = data.squeeze().numpy()
|
|
||||||
|
|
||||||
# map tensor names
|
|
||||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
|
||||||
if new_name is None:
|
|
||||||
print("Can not map tensor '" + name + "'")
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
n_dims = len(data.shape)
|
|
||||||
data_dtype = data.dtype
|
|
||||||
|
|
||||||
# if f32 desired, convert any float16 to float32
|
|
||||||
if ftype == 0 and data_dtype == np.float16:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
|
|
||||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
|
||||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
|
|
||||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
|
||||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
|
||||||
data = data.astype(np.float16)
|
|
||||||
|
|
||||||
print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
|
||||||
gguf_writer.add_tensor(new_name, data)
|
|
||||||
|
|
||||||
|
|
||||||
print("gguf: write header")
|
|
||||||
gguf_writer.write_header_to_file()
|
|
||||||
print("gguf: write metadata")
|
|
||||||
gguf_writer.write_kv_data_to_file()
|
|
||||||
if not args.vocab_only:
|
|
||||||
print("gguf: write tensors")
|
|
||||||
gguf_writer.write_tensors_to_file()
|
|
||||||
|
|
||||||
gguf_writer.close()
|
|
||||||
|
|
||||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
|
||||||
print("")
|
|
|
@ -10,7 +10,7 @@ import re
|
||||||
import sys
|
import sys
|
||||||
from enum import IntEnum
|
from enum import IntEnum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
|
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
@ -59,7 +59,7 @@ class Model:
|
||||||
from safetensors import safe_open
|
from safetensors import safe_open
|
||||||
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
|
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
|
||||||
else:
|
else:
|
||||||
ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu"))
|
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
|
||||||
|
|
||||||
with ctx as model_part:
|
with ctx as model_part:
|
||||||
for name in model_part.keys():
|
for name in model_part.keys():
|
||||||
|
@ -168,6 +168,8 @@ class Model:
|
||||||
return PersimmonModel
|
return PersimmonModel
|
||||||
if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
||||||
return StableLMModel
|
return StableLMModel
|
||||||
|
if model_architecture == "QWenLMHeadModel":
|
||||||
|
return QwenModel
|
||||||
return Model
|
return Model
|
||||||
|
|
||||||
def _is_model_safetensors(self) -> bool:
|
def _is_model_safetensors(self) -> bool:
|
||||||
|
@ -203,6 +205,8 @@ class Model:
|
||||||
return gguf.MODEL_ARCH.PERSIMMON
|
return gguf.MODEL_ARCH.PERSIMMON
|
||||||
if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
||||||
return gguf.MODEL_ARCH.STABLELM
|
return gguf.MODEL_ARCH.STABLELM
|
||||||
|
if arch == "QWenLMHeadModel":
|
||||||
|
return gguf.MODEL_ARCH.QWEN
|
||||||
|
|
||||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||||
|
|
||||||
|
@ -827,13 +831,139 @@ class StableLMModel(Model):
|
||||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||||
self.gguf_writer.add_block_count(block_count)
|
self.gguf_writer.add_block_count(block_count)
|
||||||
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||||
self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"]*(hparams["hidden_size"] // hparams["num_attention_heads"])))
|
self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||||
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||||
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
|
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
|
||||||
self.gguf_writer.add_layer_norm_eps(1e-5)
|
self.gguf_writer.add_layer_norm_eps(1e-5)
|
||||||
|
|
||||||
|
|
||||||
|
class QwenModel(Model):
|
||||||
|
@staticmethod
|
||||||
|
def token_bytes_to_string(b):
|
||||||
|
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
|
||||||
|
byte_encoder = bytes_to_unicode()
|
||||||
|
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
|
||||||
|
parts = [bytes([b]) for b in token]
|
||||||
|
while True:
|
||||||
|
min_idx = None
|
||||||
|
min_rank = None
|
||||||
|
for i, pair in enumerate(zip(parts[:-1], parts[1:])):
|
||||||
|
rank = mergeable_ranks.get(pair[0] + pair[1])
|
||||||
|
if rank is not None and (min_rank is None or rank < min_rank):
|
||||||
|
min_idx = i
|
||||||
|
min_rank = rank
|
||||||
|
if min_rank is None or (max_rank is not None and min_rank >= max_rank):
|
||||||
|
break
|
||||||
|
assert min_idx is not None
|
||||||
|
parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
|
||||||
|
return parts
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
dir_model = self.dir_model
|
||||||
|
hparams = self.hparams
|
||||||
|
tokens: list[bytearray] = []
|
||||||
|
toktypes: list[int] = []
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer # type: ignore[attr-defined]
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
||||||
|
vocab_size = hparams["vocab_size"]
|
||||||
|
assert max(tokenizer.get_vocab().values()) < vocab_size
|
||||||
|
|
||||||
|
merges = []
|
||||||
|
vocab = {}
|
||||||
|
mergeable_ranks = tokenizer.mergeable_ranks
|
||||||
|
for token, rank in mergeable_ranks.items():
|
||||||
|
vocab[self.token_bytes_to_string(token)] = rank
|
||||||
|
if len(token) == 1:
|
||||||
|
continue
|
||||||
|
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
|
||||||
|
assert len(merged) == 2
|
||||||
|
merges.append(' '.join(map(self.token_bytes_to_string, merged)))
|
||||||
|
|
||||||
|
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
|
||||||
|
added_vocab = tokenizer.special_tokens
|
||||||
|
|
||||||
|
for i in range(vocab_size):
|
||||||
|
if i not in reverse_vocab:
|
||||||
|
pad_token = f"[PAD{i}]".encode("utf-8")
|
||||||
|
tokens.append(bytearray(pad_token))
|
||||||
|
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||||
|
elif reverse_vocab[i] in added_vocab:
|
||||||
|
tokens.append(reverse_vocab[i])
|
||||||
|
toktypes.append(gguf.TokenType.CONTROL)
|
||||||
|
else:
|
||||||
|
tokens.append(reverse_vocab[i])
|
||||||
|
toktypes.append(gguf.TokenType.NORMAL)
|
||||||
|
|
||||||
|
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||||
|
self.gguf_writer.add_token_list(tokens)
|
||||||
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
|
|
||||||
|
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
|
||||||
|
special_vocab.merges = merges
|
||||||
|
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
|
||||||
|
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
|
||||||
|
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
|
||||||
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
self.gguf_writer.add_name("Qwen")
|
||||||
|
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||||
|
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
|
||||||
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||||
|
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||||
|
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
||||||
|
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||||
|
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||||
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
block_count = self.hparams["num_hidden_layers"]
|
||||||
|
model_kv = dict(self.get_tensors())
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
for name, data_torch in model_kv.items():
|
||||||
|
# we don't need these
|
||||||
|
if name.endswith(".rotary_emb.inv_freq"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||||
|
data_torch = data_torch.to(torch.float32)
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if self.ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||||
|
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
###### CONVERSION LOGIC ######
|
###### CONVERSION LOGIC ######
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
def parse_args() -> argparse.Namespace:
|
||||||
parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
|
parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
@ -879,20 +1009,21 @@ print(f"Loading model: {dir_model.name}")
|
||||||
|
|
||||||
hparams = Model.load_hparams(dir_model)
|
hparams = Model.load_hparams(dir_model)
|
||||||
|
|
||||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
with torch.inference_mode():
|
||||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
|
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||||
|
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
|
||||||
|
|
||||||
print("Set model parameters")
|
print("Set model parameters")
|
||||||
model_instance.set_gguf_parameters()
|
model_instance.set_gguf_parameters()
|
||||||
|
|
||||||
print("Set model tokenizer")
|
print("Set model tokenizer")
|
||||||
model_instance.set_vocab()
|
model_instance.set_vocab()
|
||||||
|
|
||||||
if args.vocab_only:
|
if args.vocab_only:
|
||||||
print(f"Exporting model vocab to '{fname_out}'")
|
print(f"Exporting model vocab to '{fname_out}'")
|
||||||
model_instance.write_vocab()
|
model_instance.write_vocab()
|
||||||
else:
|
else:
|
||||||
print(f"Exporting model to '{fname_out}'")
|
print(f"Exporting model to '{fname_out}'")
|
||||||
model_instance.write()
|
model_instance.write()
|
||||||
|
|
||||||
print(f"Model successfully exported to '{fname_out}'")
|
print(f"Model successfully exported to '{fname_out}'")
|
||||||
|
|
|
@ -14,11 +14,13 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||||
import gguf
|
import gguf
|
||||||
|
|
||||||
|
|
||||||
class GGMLFormat(IntEnum):
|
class GGMLFormat(IntEnum):
|
||||||
GGML = 0
|
GGML = 0
|
||||||
GGMF = 1
|
GGMF = 1
|
||||||
GGJT = 2
|
GGJT = 2
|
||||||
|
|
||||||
|
|
||||||
class GGMLFType(IntEnum):
|
class GGMLFType(IntEnum):
|
||||||
ALL_F32 = 0
|
ALL_F32 = 0
|
||||||
MOSTLY_F16 = 1
|
MOSTLY_F16 = 1
|
||||||
|
@ -38,6 +40,7 @@ class GGMLFType(IntEnum):
|
||||||
MOSTLY_Q5_K_M = 17
|
MOSTLY_Q5_K_M = 17
|
||||||
MOSTLY_Q6_K = 18
|
MOSTLY_Q6_K = 18
|
||||||
|
|
||||||
|
|
||||||
class Hyperparameters:
|
class Hyperparameters:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
|
self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
|
||||||
|
@ -69,6 +72,7 @@ class Hyperparameters:
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
|
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
|
||||||
|
|
||||||
|
|
||||||
class Vocab:
|
class Vocab:
|
||||||
def __init__(self, load_scores = True):
|
def __init__(self, load_scores = True):
|
||||||
self.items = []
|
self.items = []
|
||||||
|
@ -90,6 +94,7 @@ class Vocab:
|
||||||
self.items.append((item_text, item_score))
|
self.items.append((item_text, item_score))
|
||||||
return offset - orig_offset
|
return offset - orig_offset
|
||||||
|
|
||||||
|
|
||||||
class Tensor:
|
class Tensor:
|
||||||
def __init__(self, use_padding = True):
|
def __init__(self, use_padding = True):
|
||||||
self.name = None
|
self.name = None
|
||||||
|
@ -123,6 +128,7 @@ class Tensor:
|
||||||
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
|
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
|
||||||
return offset - orig_offset
|
return offset - orig_offset
|
||||||
|
|
||||||
|
|
||||||
class GGMLModel:
|
class GGMLModel:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.hyperparameters = None
|
self.hyperparameters = None
|
||||||
|
@ -159,8 +165,8 @@ class GGMLModel:
|
||||||
if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
|
if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
|
||||||
err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
|
err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
|
||||||
elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
|
elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
|
||||||
if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
|
if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
|
||||||
GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
|
GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
|
||||||
err = 'Q4 and Q8 quantizations changed in GGJTv3.'
|
err = 'Q4 and Q8 quantizations changed in GGJTv3.'
|
||||||
if len(err) > 0:
|
if len(err) > 0:
|
||||||
raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
|
raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
|
||||||
|
@ -187,6 +193,7 @@ class GGMLModel:
|
||||||
hp.set_n_ff(self)
|
hp.set_n_ff(self)
|
||||||
return offset
|
return offset
|
||||||
|
|
||||||
|
|
||||||
class GGMLToGGUF:
|
class GGMLToGGUF:
|
||||||
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
|
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
|
||||||
hp = ggml_model.hyperparameters
|
hp = ggml_model.hyperparameters
|
||||||
|
@ -217,7 +224,7 @@ class GGMLToGGUF:
|
||||||
gguf_writer = gguf.GGUFWriter(
|
gguf_writer = gguf.GGUFWriter(
|
||||||
self.cfg.output,
|
self.cfg.output,
|
||||||
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
|
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
|
||||||
use_temp_file = False )
|
use_temp_file = False)
|
||||||
self.add_params(gguf_writer)
|
self.add_params(gguf_writer)
|
||||||
self.add_vocab(gguf_writer)
|
self.add_vocab(gguf_writer)
|
||||||
if self.special_vocab is not None:
|
if self.special_vocab is not None:
|
||||||
|
@ -341,7 +348,8 @@ class GGMLToGGUF:
|
||||||
mapped_name,
|
mapped_name,
|
||||||
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
|
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
|
||||||
raw_shape = tempdims,
|
raw_shape = tempdims,
|
||||||
raw_dtype = tensor.dtype )
|
raw_dtype = tensor.dtype)
|
||||||
|
|
||||||
|
|
||||||
def handle_metadata(cfg, hp):
|
def handle_metadata(cfg, hp):
|
||||||
import convert
|
import convert
|
||||||
|
@ -365,38 +373,40 @@ def handle_metadata(cfg, hp):
|
||||||
raise ValueError('Unable to load metadata')
|
raise ValueError('Unable to load metadata')
|
||||||
vocab = convert.load_vocab(
|
vocab = convert.load_vocab(
|
||||||
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
|
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
|
||||||
cfg.vocabtype )
|
cfg.vocabtype)
|
||||||
# FIXME: Respect cfg.vocab_dir?
|
# FIXME: Respect cfg.vocab_dir?
|
||||||
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
|
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
|
||||||
load_merges = cfg.vocabtype == 'bpe',
|
load_merges = cfg.vocabtype == 'bpe',
|
||||||
n_vocab = vocab.vocab_size)
|
n_vocab = vocab.vocab_size)
|
||||||
convert.check_vocab_size(params, vocab)
|
convert.check_vocab_size(params, vocab)
|
||||||
return (params, vocab, svocab)
|
return (params, vocab, svocab)
|
||||||
|
|
||||||
|
|
||||||
def handle_args():
|
def handle_args():
|
||||||
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
|
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
|
||||||
parser.add_argument('--input', '-i', type = Path, required = True,
|
parser.add_argument('--input', '-i', type = Path, required = True,
|
||||||
help = 'Input GGMLv3 filename')
|
help = 'Input GGMLv3 filename')
|
||||||
parser.add_argument('--output', '-o', type = Path, required = True,
|
parser.add_argument('--output', '-o', type = Path, required = True,
|
||||||
help ='Output GGUF filename')
|
help ='Output GGUF filename')
|
||||||
parser.add_argument('--name',
|
parser.add_argument('--name',
|
||||||
help = 'Set model name')
|
help = 'Set model name')
|
||||||
parser.add_argument('--desc',
|
parser.add_argument('--desc',
|
||||||
help = 'Set model description')
|
help = 'Set model description')
|
||||||
parser.add_argument('--gqa', type = int, default = 1,
|
parser.add_argument('--gqa', type = int, default = 1,
|
||||||
help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
|
help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
|
||||||
parser.add_argument('--eps', default = '5.0e-06',
|
parser.add_argument('--eps', default = '5.0e-06',
|
||||||
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
|
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
|
||||||
parser.add_argument('--context-length', '-c', type=int, default = 2048,
|
parser.add_argument('--context-length', '-c', type=int, default = 2048,
|
||||||
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
|
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
|
||||||
parser.add_argument('--model-metadata-dir', '-m', type = Path,
|
parser.add_argument('--model-metadata-dir', '-m', type = Path,
|
||||||
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
|
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
|
||||||
parser.add_argument("--vocab-dir", type=Path,
|
parser.add_argument("--vocab-dir", type=Path,
|
||||||
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
|
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
|
||||||
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
|
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
|
||||||
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
|
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
cfg = handle_args()
|
cfg = handle_args()
|
||||||
print(f'* Using config: {cfg}')
|
print(f'* Using config: {cfg}')
|
||||||
|
@ -406,7 +416,7 @@ def main():
|
||||||
data = np.memmap(cfg.input, mode = 'r')
|
data = np.memmap(cfg.input, mode = 'r')
|
||||||
model = GGMLModel()
|
model = GGMLModel()
|
||||||
print('* Scanning GGML input file')
|
print('* Scanning GGML input file')
|
||||||
offset = model.load(data, 0)
|
offset = model.load(data, 0) # noqa
|
||||||
print(f'* GGML model hyperparameters: {model.hyperparameters}')
|
print(f'* GGML model hyperparameters: {model.hyperparameters}')
|
||||||
vocab_override = None
|
vocab_override = None
|
||||||
params_override = None
|
params_override = None
|
||||||
|
@ -421,12 +431,15 @@ def main():
|
||||||
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
|
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
|
||||||
if model.file_format == GGMLFormat.GGML:
|
if model.file_format == GGMLFormat.GGML:
|
||||||
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
|
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
|
||||||
converter = GGMLToGGUF(model, data, cfg,
|
converter = GGMLToGGUF(
|
||||||
|
model, data, cfg,
|
||||||
params_override = params_override,
|
params_override = params_override,
|
||||||
vocab_override = vocab_override,
|
vocab_override = vocab_override,
|
||||||
special_vocab = special_vocab )
|
special_vocab = special_vocab
|
||||||
|
)
|
||||||
converter.save()
|
converter.save()
|
||||||
print(f'* Successful completion. Output saved to: {cfg.output}')
|
print(f'* Successful completion. Output saved to: {cfg.output}')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
|
|
@ -9,6 +9,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||||
import gguf
|
import gguf
|
||||||
|
|
||||||
|
|
||||||
def _flatten_dict(dct, tensors, prefix=None):
|
def _flatten_dict(dct, tensors, prefix=None):
|
||||||
assert isinstance(dct, dict)
|
assert isinstance(dct, dict)
|
||||||
for key in dct.keys():
|
for key in dct.keys():
|
||||||
|
@ -21,6 +22,7 @@ def _flatten_dict(dct, tensors, prefix=None):
|
||||||
raise ValueError(type(dct[key]))
|
raise ValueError(type(dct[key]))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _get_sentencepiece_tokenizer_info(dir_model: Path):
|
def _get_sentencepiece_tokenizer_info(dir_model: Path):
|
||||||
tokenizer_path = dir_model / 'adept_vocab.model'
|
tokenizer_path = dir_model / 'adept_vocab.model'
|
||||||
print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
|
print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
|
||||||
|
@ -54,6 +56,7 @@ def _get_sentencepiece_tokenizer_info(dir_model: Path):
|
||||||
pass
|
pass
|
||||||
return tokens, scores, toktypes
|
return tokens, scores, toktypes
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
|
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
|
||||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||||
|
@ -125,6 +128,5 @@ def main():
|
||||||
print("")
|
print("")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
|
62
convert.py
62
convert.py
|
@ -46,6 +46,7 @@ DEFAULT_CONCURRENCY = 8
|
||||||
# data types
|
# data types
|
||||||
#
|
#
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class DataType:
|
class DataType:
|
||||||
name: str
|
name: str
|
||||||
|
@ -55,15 +56,18 @@ class DataType:
|
||||||
def elements_to_bytes(self, n_elements: int) -> int:
|
def elements_to_bytes(self, n_elements: int) -> int:
|
||||||
return n_elements * self.dtype.itemsize
|
return n_elements * self.dtype.itemsize
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class UnquantizedDataType(DataType):
|
class UnquantizedDataType(DataType):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
|
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
|
||||||
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
|
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
|
||||||
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
|
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
|
||||||
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
|
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class QuantizedDataType(DataType):
|
class QuantizedDataType(DataType):
|
||||||
block_size: int
|
block_size: int
|
||||||
|
@ -77,6 +81,7 @@ class QuantizedDataType(DataType):
|
||||||
assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
|
assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
|
||||||
return self.quantized_dtype.itemsize * (n_elements // self.block_size)
|
return self.quantized_dtype.itemsize * (n_elements // self.block_size)
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class Q8_0QuantizedDataType(QuantizedDataType):
|
class Q8_0QuantizedDataType(QuantizedDataType):
|
||||||
# Mini Q8_0 quantization in Python!
|
# Mini Q8_0 quantization in Python!
|
||||||
|
@ -86,6 +91,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
|
||||||
n_blocks = arr.size // self.block_size
|
n_blocks = arr.size // self.block_size
|
||||||
blocks = arr.reshape((n_blocks, self.block_size))
|
blocks = arr.reshape((n_blocks, self.block_size))
|
||||||
# Much faster implementation of block quantization contributed by @Cebtenzzre
|
# Much faster implementation of block quantization contributed by @Cebtenzzre
|
||||||
|
|
||||||
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
|
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
|
||||||
d = abs(blocks).max(axis = 1) / np.float32(127)
|
d = abs(blocks).max(axis = 1) / np.float32(127)
|
||||||
with np.errstate(divide = 'ignore'):
|
with np.errstate(divide = 'ignore'):
|
||||||
|
@ -94,10 +100,11 @@ class Q8_0QuantizedDataType(QuantizedDataType):
|
||||||
yield from zip(d, qs)
|
yield from zip(d, qs)
|
||||||
return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
|
return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
|
||||||
|
|
||||||
|
|
||||||
DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
|
DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
|
||||||
dtype = np.dtype(np.float32), valid_conversions = [],
|
dtype = np.dtype(np.float32), valid_conversions = [],
|
||||||
ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
|
ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
|
||||||
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
|
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
|
||||||
|
|
||||||
# Quantized types skipped here because they may also map to np.float32
|
# Quantized types skipped here because they may also map to np.float32
|
||||||
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
|
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
|
||||||
|
@ -116,6 +123,8 @@ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
|
||||||
# TODO: match this with `llama_ftype`
|
# TODO: match this with `llama_ftype`
|
||||||
# TODO: rename to LLAMAFileType
|
# TODO: rename to LLAMAFileType
|
||||||
# TODO: move to `gguf.py`
|
# TODO: move to `gguf.py`
|
||||||
|
|
||||||
|
|
||||||
class GGMLFileType(enum.IntEnum):
|
class GGMLFileType(enum.IntEnum):
|
||||||
AllF32 = 0
|
AllF32 = 0
|
||||||
MostlyF16 = 1 # except 1d tensors
|
MostlyF16 = 1 # except 1d tensors
|
||||||
|
@ -128,6 +137,7 @@ class GGMLFileType(enum.IntEnum):
|
||||||
# 1D tensors are always F32.
|
# 1D tensors are always F32.
|
||||||
return dt if len(tensor.shape) > 1 else DT_F32
|
return dt if len(tensor.shape) > 1 else DT_F32
|
||||||
|
|
||||||
|
|
||||||
GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
|
GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
|
||||||
GGMLFileType.AllF32 : DT_F32,
|
GGMLFileType.AllF32 : DT_F32,
|
||||||
GGMLFileType.MostlyF16 : DT_F16,
|
GGMLFileType.MostlyF16 : DT_F16,
|
||||||
|
@ -138,6 +148,7 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
|
||||||
# hparams loading
|
# hparams loading
|
||||||
#
|
#
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Params:
|
class Params:
|
||||||
n_vocab: int
|
n_vocab: int
|
||||||
|
@ -167,11 +178,11 @@ class Params:
|
||||||
|
|
||||||
# try transformer naming first
|
# try transformer naming first
|
||||||
if "model.layers.0.self_attn.q_proj.weight" in model:
|
if "model.layers.0.self_attn.q_proj.weight" in model:
|
||||||
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
|
n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
|
||||||
elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
|
elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
|
||||||
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
|
n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
|
||||||
else:
|
else:
|
||||||
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
||||||
|
|
||||||
if n_layer < 1:
|
if n_layer < 1:
|
||||||
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
|
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
|
||||||
|
@ -256,7 +267,7 @@ class Params:
|
||||||
n_ctx = 2048
|
n_ctx = 2048
|
||||||
|
|
||||||
return Params(
|
return Params(
|
||||||
n_vocab = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
|
n_vocab = model["tok_embeddings.weight"].shape[0],
|
||||||
n_embd = config["dim"],
|
n_embd = config["dim"],
|
||||||
n_layer = config["n_layers"],
|
n_layer = config["n_layers"],
|
||||||
n_ctx = n_ctx,
|
n_ctx = n_ctx,
|
||||||
|
@ -308,7 +319,7 @@ class BpeVocab:
|
||||||
(item['content'], item['id'])
|
(item['content'], item['id'])
|
||||||
for item in tokenizer_json.get('added_tokens', [])
|
for item in tokenizer_json.get('added_tokens', [])
|
||||||
# Added tokens here can be duplicates of the main vocabulary.
|
# Added tokens here can be duplicates of the main vocabulary.
|
||||||
if item['content'] not in self.bpe_tokenizer )
|
if item['content'] not in self.bpe_tokenizer)
|
||||||
|
|
||||||
vocab_size: int = len(self.bpe_tokenizer)
|
vocab_size: int = len(self.bpe_tokenizer)
|
||||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||||
|
@ -326,7 +337,6 @@ class BpeVocab:
|
||||||
|
|
||||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
tokenizer = self.bpe_tokenizer
|
tokenizer = self.bpe_tokenizer
|
||||||
from transformers.models.gpt2 import tokenization_gpt2
|
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
|
||||||
|
|
||||||
for i, _ in enumerate(tokenizer):
|
for i, _ in enumerate(tokenizer):
|
||||||
|
@ -406,6 +416,7 @@ class SentencePieceVocab:
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||||
|
|
||||||
|
|
||||||
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
|
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
|
||||||
|
|
||||||
#
|
#
|
||||||
|
@ -413,13 +424,14 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
|
||||||
# TODO: reuse (probably move to gguf.py?)
|
# TODO: reuse (probably move to gguf.py?)
|
||||||
#
|
#
|
||||||
|
|
||||||
|
|
||||||
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
|
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
|
||||||
#print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
|
# print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
|
||||||
if n_head_kv is not None and n_head != n_head_kv:
|
if n_head_kv is not None and n_head != n_head_kv:
|
||||||
n_head = n_head_kv
|
n_head = n_head_kv
|
||||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||||
.swapaxes(1, 2)
|
.swapaxes(1, 2)
|
||||||
.reshape(weights.shape))
|
.reshape(weights.shape))
|
||||||
|
|
||||||
|
|
||||||
class Tensor(metaclass=ABCMeta):
|
class Tensor(metaclass=ABCMeta):
|
||||||
|
@ -500,7 +512,7 @@ class LazyTensor:
|
||||||
ret = self._load()
|
ret = self._load()
|
||||||
# Should be okay if it maps to the same numpy type?
|
# Should be okay if it maps to the same numpy type?
|
||||||
assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
|
assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
|
||||||
(self.data_type, ret.data_type, self.description)
|
(self.data_type, ret.data_type, self.description)
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def astype(self, data_type: DataType) -> LazyTensor:
|
def astype(self, data_type: DataType) -> LazyTensor:
|
||||||
|
@ -588,6 +600,7 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe
|
||||||
return lazy_tensor.load().permute(n_head, n_head_kv)
|
return lazy_tensor.load().permute(n_head, n_head_kv)
|
||||||
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
|
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
|
||||||
|
|
||||||
|
|
||||||
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
|
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
|
||||||
def load() -> Tensor:
|
def load() -> Tensor:
|
||||||
return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
|
return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
|
||||||
|
@ -595,6 +608,7 @@ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_
|
||||||
s[0] = s[0] // 3
|
s[0] = s[0] // 3
|
||||||
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
|
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
|
||||||
|
|
||||||
|
|
||||||
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
|
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
|
||||||
def load() -> Tensor:
|
def load() -> Tensor:
|
||||||
return lazy_tensor.load().part(n_part)
|
return lazy_tensor.load().part(n_part)
|
||||||
|
@ -744,6 +758,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
|
||||||
In = TypeVar('In')
|
In = TypeVar('In')
|
||||||
Out = TypeVar('Out')
|
Out = TypeVar('Out')
|
||||||
|
|
||||||
|
|
||||||
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
|
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
|
||||||
'''Parallel map, but with backpressure. If the caller doesn't call `next`
|
'''Parallel map, but with backpressure. If the caller doesn't call `next`
|
||||||
fast enough, this will stop calling `func` at some point rather than
|
fast enough, this will stop calling `func` at some point rather than
|
||||||
|
@ -778,6 +793,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
|
||||||
break
|
break
|
||||||
yield result
|
yield result
|
||||||
|
|
||||||
|
|
||||||
def check_vocab_size(params: Params, vocab: Vocab) -> None:
|
def check_vocab_size(params: Params, vocab: Vocab) -> None:
|
||||||
if params.n_vocab != vocab.vocab_size:
|
if params.n_vocab != vocab.vocab_size:
|
||||||
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
|
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
|
||||||
|
@ -796,7 +812,7 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
|
||||||
|
|
||||||
|
|
||||||
class OutputFile:
|
class OutputFile:
|
||||||
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
|
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
|
||||||
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
||||||
|
|
||||||
def add_meta_arch(self, params: Params) -> None:
|
def add_meta_arch(self, params: Params) -> None:
|
||||||
|
@ -876,7 +892,7 @@ class OutputFile:
|
||||||
self.gguf.close()
|
self.gguf.close()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
|
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
|
||||||
check_vocab_size(params, vocab)
|
check_vocab_size(params, vocab)
|
||||||
|
|
||||||
of = OutputFile(fname_out, endianess=endianess)
|
of = OutputFile(fname_out, endianess=endianess)
|
||||||
|
@ -938,8 +954,9 @@ class OutputFile:
|
||||||
|
|
||||||
of.close()
|
of.close()
|
||||||
|
|
||||||
|
|
||||||
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
||||||
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
|
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type
|
||||||
|
|
||||||
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
||||||
return GGMLFileType.AllF32
|
return GGMLFileType.AllF32
|
||||||
|
@ -952,10 +969,12 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
|
||||||
|
|
||||||
raise Exception(f"Unexpected combination of types: {name_to_type}")
|
raise Exception(f"Unexpected combination of types: {name_to_type}")
|
||||||
|
|
||||||
|
|
||||||
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
|
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
|
||||||
return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
|
return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
|
||||||
for (name, tensor) in model.items()}
|
for (name, tensor) in model.items()}
|
||||||
|
|
||||||
|
|
||||||
def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
||||||
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
|
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
|
||||||
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
|
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
|
||||||
|
@ -968,7 +987,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
||||||
print(f"Permuting layer {i}")
|
print(f"Permuting layer {i}")
|
||||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
|
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
|
||||||
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
|
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
|
||||||
#tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
|
# tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
|
||||||
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
|
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
|
||||||
print(f"Unpacking and permuting layer {i}")
|
print(f"Unpacking and permuting layer {i}")
|
||||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
|
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
|
||||||
|
@ -993,6 +1012,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def nth_multifile_path(path: Path, n: int) -> Path | None:
|
def nth_multifile_path(path: Path, n: int) -> Path | None:
|
||||||
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
|
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
|
||||||
the nth path in the model.
|
the nth path in the model.
|
||||||
|
@ -1174,8 +1194,8 @@ def main(args_in: list[str] | None = None) -> None:
|
||||||
# FIXME: Try to respect vocab_dir somehow?
|
# FIXME: Try to respect vocab_dir somehow?
|
||||||
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
|
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
|
||||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
||||||
load_merges = args.vocabtype == 'bpe',
|
load_merges = args.vocabtype == 'bpe',
|
||||||
n_vocab = vocab.vocab_size)
|
n_vocab = vocab.vocab_size)
|
||||||
outfile = args.outfile
|
outfile = args.outfile
|
||||||
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
|
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
|
||||||
print(f"Wrote {outfile}")
|
print(f"Wrote {outfile}")
|
||||||
|
@ -1188,8 +1208,8 @@ def main(args_in: list[str] | None = None) -> None:
|
||||||
vocab = load_vocab(vocab_dir, args.vocabtype)
|
vocab = load_vocab(vocab_dir, args.vocabtype)
|
||||||
# FIXME: Try to respect vocab_dir somehow?
|
# FIXME: Try to respect vocab_dir somehow?
|
||||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
||||||
load_merges = args.vocabtype == 'bpe',
|
load_merges = args.vocabtype == 'bpe',
|
||||||
n_vocab = vocab.vocab_size)
|
n_vocab = vocab.vocab_size)
|
||||||
|
|
||||||
model = model_plus.model
|
model = model_plus.model
|
||||||
model = convert_model_names(model, params)
|
model = convert_model_names(model, params)
|
||||||
|
|
BIN
docs/llama-star/idea-arch.key
Executable file
BIN
docs/llama-star/idea-arch.key
Executable file
Binary file not shown.
BIN
docs/llama-star/idea-arch.pdf
Normal file
BIN
docs/llama-star/idea-arch.pdf
Normal file
Binary file not shown.
|
@ -32,6 +32,7 @@ else()
|
||||||
add_subdirectory(save-load-state)
|
add_subdirectory(save-load-state)
|
||||||
add_subdirectory(simple)
|
add_subdirectory(simple)
|
||||||
add_subdirectory(speculative)
|
add_subdirectory(speculative)
|
||||||
|
add_subdirectory(lookahead)
|
||||||
add_subdirectory(train-text-from-scratch)
|
add_subdirectory(train-text-from-scratch)
|
||||||
if (LLAMA_METAL)
|
if (LLAMA_METAL)
|
||||||
add_subdirectory(metal)
|
add_subdirectory(metal)
|
||||||
|
|
|
@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
|
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
|
|
||||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
This is a swift clone of `examples/batched`.
|
This is a swift clone of `examples/batched`.
|
||||||
|
|
||||||
$ `make`
|
$ `make`
|
||||||
$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
|
$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
|
||||||
|
|
|
@ -153,7 +153,7 @@ while n_cur <= n_len {
|
||||||
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||||
|
|
||||||
// is it an end of stream? -> mark the stream as finished
|
// is it an end of stream? -> mark the stream as finished
|
||||||
if new_token_id == llama_token_eos(context) || n_cur == n_len {
|
if new_token_id == llama_token_eos(model) || n_cur == n_len {
|
||||||
i_batch[i] = -1
|
i_batch[i] = -1
|
||||||
// print("")
|
// print("")
|
||||||
if n_parallel > 1 {
|
if n_parallel > 1 {
|
||||||
|
@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
|
||||||
llama_print_timings(context)
|
llama_print_timings(context)
|
||||||
|
|
||||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||||
let n_tokens = text.count + (add_bos ? 1 : 0)
|
let utf8Count = text.utf8.count
|
||||||
|
let n_tokens = utf8Count + (add_bos ? 1 : 0)
|
||||||
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
|
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
|
||||||
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
|
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
|
||||||
var swiftTokens: [llama_token] = []
|
var swiftTokens: [llama_token] = []
|
||||||
for i in 0 ..< tokenCount {
|
for i in 0 ..< tokenCount {
|
||||||
swiftTokens.append(tokens[Int(i)])
|
swiftTokens.append(tokens[Int(i)])
|
||||||
|
@ -230,18 +231,15 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
|
||||||
var result = [CChar](repeating: 0, count: 8)
|
var result = [CChar](repeating: 0, count: 8)
|
||||||
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
|
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
|
||||||
if nTokens < 0 {
|
if nTokens < 0 {
|
||||||
if result.count >= -Int(nTokens) {
|
let actualTokensCount = -Int(nTokens)
|
||||||
result.removeLast(-Int(nTokens))
|
result = .init(repeating: 0, count: actualTokensCount)
|
||||||
} else {
|
|
||||||
result.removeAll()
|
|
||||||
}
|
|
||||||
let check = llama_token_to_piece(
|
let check = llama_token_to_piece(
|
||||||
model,
|
model,
|
||||||
token,
|
token,
|
||||||
&result,
|
&result,
|
||||||
Int32(result.count)
|
Int32(result.count)
|
||||||
)
|
)
|
||||||
assert(check == nTokens)
|
assert(check == actualTokensCount)
|
||||||
} else {
|
} else {
|
||||||
result.removeLast(result.count - Int(nTokens))
|
result.removeLast(result.count - Int(nTokens))
|
||||||
}
|
}
|
||||||
|
@ -259,5 +257,4 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
|
||||||
buffer = []
|
buffer = []
|
||||||
return bufferString
|
return bufferString
|
||||||
}
|
}
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,7 +21,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||||
```
|
```
|
||||||
|
|
||||||
Finetune output files will be saved every N iterations (config with `--save-every N`).
|
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
||||||
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
|
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
|
||||||
So in above example after 10 iterations these files will be written:
|
So in above example after 10 iterations these files will be written:
|
||||||
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
|
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
|
||||||
|
|
|
@ -146,6 +146,13 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
if (params.chatml) {
|
||||||
|
printf("\n************\n");
|
||||||
|
printf("%s: please use the 'main' tool for chatml mode\n", __func__);
|
||||||
|
printf("************\n\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
printf("\n************\n");
|
printf("\n************\n");
|
||||||
printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
|
printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
|
||||||
|
|
|
@ -53,6 +53,13 @@ static std::vector<T> split(const std::string & str, char delim) {
|
||||||
return values;
|
return values;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename T, typename F>
|
||||||
|
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
|
||||||
|
std::vector<std::string> str_values;
|
||||||
|
std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
|
||||||
|
return str_values;
|
||||||
|
}
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
static T avg(const std::vector<T> & v) {
|
static T avg(const std::vector<T> & v) {
|
||||||
if (v.empty()) {
|
if (v.empty()) {
|
||||||
|
@ -126,7 +133,8 @@ struct cmd_params {
|
||||||
std::vector<int> n_prompt;
|
std::vector<int> n_prompt;
|
||||||
std::vector<int> n_gen;
|
std::vector<int> n_gen;
|
||||||
std::vector<int> n_batch;
|
std::vector<int> n_batch;
|
||||||
std::vector<bool> f32_kv;
|
std::vector<ggml_type> type_k;
|
||||||
|
std::vector<ggml_type> type_v;
|
||||||
std::vector<int> n_threads;
|
std::vector<int> n_threads;
|
||||||
std::vector<int> n_gpu_layers;
|
std::vector<int> n_gpu_layers;
|
||||||
std::vector<int> main_gpu;
|
std::vector<int> main_gpu;
|
||||||
|
@ -142,7 +150,8 @@ static const cmd_params cmd_params_defaults = {
|
||||||
/* n_prompt */ {512},
|
/* n_prompt */ {512},
|
||||||
/* n_gen */ {128},
|
/* n_gen */ {128},
|
||||||
/* n_batch */ {512},
|
/* n_batch */ {512},
|
||||||
/* f32_kv */ {false},
|
/* type_k */ {GGML_TYPE_F16},
|
||||||
|
/* type_v */ {GGML_TYPE_F16},
|
||||||
/* n_threads */ {get_num_physical_cores()},
|
/* n_threads */ {get_num_physical_cores()},
|
||||||
/* n_gpu_layers */ {99},
|
/* n_gpu_layers */ {99},
|
||||||
/* main_gpu */ {0},
|
/* main_gpu */ {0},
|
||||||
|
@ -162,7 +171,8 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||||
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
||||||
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||||
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
||||||
printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
|
printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
||||||
|
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
||||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||||
|
@ -173,9 +183,32 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static ggml_type ggml_type_from_name(const std::string & s) {
|
||||||
|
if (s == "f16") {
|
||||||
|
return GGML_TYPE_F16;
|
||||||
|
}
|
||||||
|
if (s == "q8_0") {
|
||||||
|
return GGML_TYPE_Q8_0;
|
||||||
|
}
|
||||||
|
if (s == "q4_0") {
|
||||||
|
return GGML_TYPE_Q4_0;
|
||||||
|
}
|
||||||
|
if (s == "q4_1") {
|
||||||
|
return GGML_TYPE_Q4_1;
|
||||||
|
}
|
||||||
|
if (s == "q5_0") {
|
||||||
|
return GGML_TYPE_Q5_0;
|
||||||
|
}
|
||||||
|
if (s == "q5_1") {
|
||||||
|
return GGML_TYPE_Q5_1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return GGML_TYPE_COUNT;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static cmd_params parse_cmd_params(int argc, char ** argv) {
|
static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
cmd_params params;
|
cmd_params params;
|
||||||
std::string arg;
|
std::string arg;
|
||||||
|
@ -224,13 +257,38 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
auto p = split<int>(argv[i], split_delim);
|
auto p = split<int>(argv[i], split_delim);
|
||||||
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
|
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
|
||||||
} else if (arg == "--memory-f32") {
|
} else if (arg == "-ctk" || arg == "--cache-type-k") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
auto p = split<int>(argv[i], split_delim);
|
auto p = split<std::string>(argv[i], split_delim);
|
||||||
params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
|
std::vector<ggml_type> types;
|
||||||
|
for (const auto & t : p) {
|
||||||
|
ggml_type gt = ggml_type_from_name(t);
|
||||||
|
if (gt == GGML_TYPE_COUNT) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
types.push_back(gt);
|
||||||
|
}
|
||||||
|
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
|
||||||
|
} else if (arg == "-ctv" || arg == "--cache-type-v") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto p = split<std::string>(argv[i], split_delim);
|
||||||
|
std::vector<ggml_type> types;
|
||||||
|
for (const auto & t : p) {
|
||||||
|
ggml_type gt = ggml_type_from_name(t);
|
||||||
|
if (gt == GGML_TYPE_COUNT) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
types.push_back(gt);
|
||||||
|
}
|
||||||
|
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
|
||||||
} else if (arg == "-t" || arg == "--threads") {
|
} else if (arg == "-t" || arg == "--threads") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -321,7 +379,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
|
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
|
||||||
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
|
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
|
||||||
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
|
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
|
||||||
if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; }
|
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
|
||||||
|
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
|
||||||
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
|
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
|
||||||
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
|
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
|
||||||
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
|
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
|
||||||
|
@ -336,7 +395,8 @@ struct cmd_params_instance {
|
||||||
int n_prompt;
|
int n_prompt;
|
||||||
int n_gen;
|
int n_gen;
|
||||||
int n_batch;
|
int n_batch;
|
||||||
bool f32_kv;
|
ggml_type type_k;
|
||||||
|
ggml_type type_v;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
int n_gpu_layers;
|
int n_gpu_layers;
|
||||||
int main_gpu;
|
int main_gpu;
|
||||||
|
@ -365,7 +425,8 @@ struct cmd_params_instance {
|
||||||
|
|
||||||
cparams.n_ctx = n_prompt + n_gen;
|
cparams.n_ctx = n_prompt + n_gen;
|
||||||
cparams.n_batch = n_batch;
|
cparams.n_batch = n_batch;
|
||||||
cparams.f16_kv = !f32_kv;
|
cparams.type_k = type_k;
|
||||||
|
cparams.type_v = type_v;
|
||||||
cparams.mul_mat_q = mul_mat_q;
|
cparams.mul_mat_q = mul_mat_q;
|
||||||
|
|
||||||
return cparams;
|
return cparams;
|
||||||
|
@ -380,7 +441,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
|
||||||
for (const auto & mg : params.main_gpu)
|
for (const auto & mg : params.main_gpu)
|
||||||
for (const auto & ts : params.tensor_split)
|
for (const auto & ts : params.tensor_split)
|
||||||
for (const auto & nb : params.n_batch)
|
for (const auto & nb : params.n_batch)
|
||||||
for (const auto & fk : params.f32_kv)
|
for (const auto & tk : params.type_k)
|
||||||
|
for (const auto & tv : params.type_v)
|
||||||
for (const auto & mmq : params.mul_mat_q)
|
for (const auto & mmq : params.mul_mat_q)
|
||||||
for (const auto & nt : params.n_threads) {
|
for (const auto & nt : params.n_threads) {
|
||||||
cmd_params_instance instance = {
|
cmd_params_instance instance = {
|
||||||
|
@ -388,7 +450,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
|
||||||
/* .n_prompt = */ n_prompt,
|
/* .n_prompt = */ n_prompt,
|
||||||
/* .n_gen = */ n_gen,
|
/* .n_gen = */ n_gen,
|
||||||
/* .n_batch = */ nb,
|
/* .n_batch = */ nb,
|
||||||
/* .f32_kv = */ fk,
|
/* .type_k = */ tk,
|
||||||
|
/* .type_v = */ tv,
|
||||||
/* .n_threads = */ nt,
|
/* .n_threads = */ nt,
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
|
@ -410,7 +473,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
for (const auto & mg : params.main_gpu)
|
for (const auto & mg : params.main_gpu)
|
||||||
for (const auto & ts : params.tensor_split)
|
for (const auto & ts : params.tensor_split)
|
||||||
for (const auto & nb : params.n_batch)
|
for (const auto & nb : params.n_batch)
|
||||||
for (const auto & fk : params.f32_kv)
|
for (const auto & tk : params.type_k)
|
||||||
|
for (const auto & tv : params.type_v)
|
||||||
for (const auto & mmq : params.mul_mat_q)
|
for (const auto & mmq : params.mul_mat_q)
|
||||||
for (const auto & nt : params.n_threads) {
|
for (const auto & nt : params.n_threads) {
|
||||||
for (const auto & n_prompt : params.n_prompt) {
|
for (const auto & n_prompt : params.n_prompt) {
|
||||||
|
@ -422,7 +486,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
/* .n_prompt = */ n_prompt,
|
/* .n_prompt = */ n_prompt,
|
||||||
/* .n_gen = */ 0,
|
/* .n_gen = */ 0,
|
||||||
/* .n_batch = */ nb,
|
/* .n_batch = */ nb,
|
||||||
/* .f32_kv = */ fk,
|
/* .type_k = */ tk,
|
||||||
|
/* .type_v = */ tv,
|
||||||
/* .n_threads = */ nt,
|
/* .n_threads = */ nt,
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
|
@ -441,7 +506,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
/* .n_prompt = */ 0,
|
/* .n_prompt = */ 0,
|
||||||
/* .n_gen = */ n_gen,
|
/* .n_gen = */ n_gen,
|
||||||
/* .n_batch = */ nb,
|
/* .n_batch = */ nb,
|
||||||
/* .f32_kv = */ fk,
|
/* .type_k = */ tk,
|
||||||
|
/* .type_v = */ tv,
|
||||||
/* .n_threads = */ nt,
|
/* .n_threads = */ nt,
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
|
@ -490,7 +556,8 @@ struct test {
|
||||||
uint64_t model_n_params;
|
uint64_t model_n_params;
|
||||||
int n_batch;
|
int n_batch;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
bool f32_kv;
|
ggml_type type_k;
|
||||||
|
ggml_type type_v;
|
||||||
int n_gpu_layers;
|
int n_gpu_layers;
|
||||||
int main_gpu;
|
int main_gpu;
|
||||||
bool mul_mat_q;
|
bool mul_mat_q;
|
||||||
|
@ -509,7 +576,8 @@ struct test {
|
||||||
model_n_params = llama_model_n_params(lmodel);
|
model_n_params = llama_model_n_params(lmodel);
|
||||||
n_batch = inst.n_batch;
|
n_batch = inst.n_batch;
|
||||||
n_threads = inst.n_threads;
|
n_threads = inst.n_threads;
|
||||||
f32_kv = inst.f32_kv;
|
type_k = inst.type_k;
|
||||||
|
type_v = inst.type_v;
|
||||||
n_gpu_layers = inst.n_gpu_layers;
|
n_gpu_layers = inst.n_gpu_layers;
|
||||||
main_gpu = inst.main_gpu;
|
main_gpu = inst.main_gpu;
|
||||||
mul_mat_q = inst.mul_mat_q;
|
mul_mat_q = inst.mul_mat_q;
|
||||||
|
@ -575,7 +643,7 @@ struct test {
|
||||||
"cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas",
|
"cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas",
|
||||||
"cpu_info", "gpu_info",
|
"cpu_info", "gpu_info",
|
||||||
"model_filename", "model_type", "model_size", "model_n_params",
|
"model_filename", "model_type", "model_size", "model_n_params",
|
||||||
"n_batch", "n_threads", "f16_kv",
|
"n_batch", "n_threads", "type_k", "type_v",
|
||||||
"n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
|
"n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
|
||||||
"n_prompt", "n_gen", "test_time",
|
"n_prompt", "n_gen", "test_time",
|
||||||
"avg_ns", "stddev_ns",
|
"avg_ns", "stddev_ns",
|
||||||
|
@ -625,7 +693,7 @@ struct test {
|
||||||
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
||||||
cpu_info, gpu_info,
|
cpu_info, gpu_info,
|
||||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||||
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
|
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||||
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
|
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
|
||||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
||||||
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
||||||
|
@ -810,8 +878,11 @@ struct markdown_printer : public printer {
|
||||||
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
||||||
fields.push_back("n_batch");
|
fields.push_back("n_batch");
|
||||||
}
|
}
|
||||||
if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
|
if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
|
||||||
fields.push_back("f16_kv");
|
fields.push_back("type_k");
|
||||||
|
}
|
||||||
|
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
|
||||||
|
fields.push_back("type_v");
|
||||||
}
|
}
|
||||||
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
|
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
|
||||||
fields.push_back("main_gpu");
|
fields.push_back("main_gpu");
|
||||||
|
|
1
examples/llama.swiftui/.gitignore
vendored
Normal file
1
examples/llama.swiftui/.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
xcuserdata
|
7
examples/llama.swiftui/README.md
Normal file
7
examples/llama.swiftui/README.md
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
# llama.swiftui
|
||||||
|
|
||||||
|
Local inference of llama.cpp on an iPhone.
|
||||||
|
So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well.
|
||||||
|
|
||||||
|
https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545
|
||||||
|
|
208
examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
Normal file
208
examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
Normal file
|
@ -0,0 +1,208 @@
|
||||||
|
import Foundation
|
||||||
|
|
||||||
|
// import llama
|
||||||
|
|
||||||
|
enum LlamaError: Error {
|
||||||
|
case couldNotInitializeContext
|
||||||
|
}
|
||||||
|
|
||||||
|
actor LlamaContext {
|
||||||
|
private var model: OpaquePointer
|
||||||
|
private var context: OpaquePointer
|
||||||
|
private var batch: llama_batch
|
||||||
|
private var tokens_list: [llama_token]
|
||||||
|
/// This variable is used to store temporarily invalid cchars
|
||||||
|
private var temporary_invalid_cchars: [CChar]
|
||||||
|
|
||||||
|
var n_len: Int32 = 512
|
||||||
|
var n_cur: Int32 = 0
|
||||||
|
var n_decode: Int32 = 0
|
||||||
|
|
||||||
|
init(model: OpaquePointer, context: OpaquePointer) {
|
||||||
|
self.model = model
|
||||||
|
self.context = context
|
||||||
|
self.tokens_list = []
|
||||||
|
self.batch = llama_batch_init(512, 0, 1)
|
||||||
|
self.temporary_invalid_cchars = []
|
||||||
|
}
|
||||||
|
|
||||||
|
deinit {
|
||||||
|
llama_free(context)
|
||||||
|
llama_free_model(model)
|
||||||
|
llama_backend_free()
|
||||||
|
}
|
||||||
|
|
||||||
|
static func createContext(path: String) throws -> LlamaContext {
|
||||||
|
llama_backend_init(false)
|
||||||
|
let model_params = llama_model_default_params()
|
||||||
|
|
||||||
|
let model = llama_load_model_from_file(path, model_params)
|
||||||
|
guard let model else {
|
||||||
|
print("Could not load model at \(path)")
|
||||||
|
throw LlamaError.couldNotInitializeContext
|
||||||
|
}
|
||||||
|
var ctx_params = llama_context_default_params()
|
||||||
|
ctx_params.seed = 1234
|
||||||
|
ctx_params.n_ctx = 2048
|
||||||
|
ctx_params.n_threads = 8
|
||||||
|
ctx_params.n_threads_batch = 8
|
||||||
|
|
||||||
|
let context = llama_new_context_with_model(model, ctx_params)
|
||||||
|
guard let context else {
|
||||||
|
print("Could not load context!")
|
||||||
|
throw LlamaError.couldNotInitializeContext
|
||||||
|
}
|
||||||
|
|
||||||
|
return LlamaContext(model: model, context: context)
|
||||||
|
}
|
||||||
|
|
||||||
|
func get_n_tokens() -> Int32 {
|
||||||
|
return batch.n_tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
func completion_init(text: String) {
|
||||||
|
print("attempting to complete \"\(text)\"")
|
||||||
|
|
||||||
|
tokens_list = tokenize(text: text, add_bos: true)
|
||||||
|
temporary_invalid_cchars = []
|
||||||
|
|
||||||
|
let n_ctx = llama_n_ctx(context)
|
||||||
|
let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
|
||||||
|
|
||||||
|
print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
|
||||||
|
|
||||||
|
if n_kv_req > n_ctx {
|
||||||
|
print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
|
||||||
|
}
|
||||||
|
|
||||||
|
for id in tokens_list {
|
||||||
|
print(String(cString: token_to_piece(token: id) + [0]))
|
||||||
|
}
|
||||||
|
|
||||||
|
// batch = llama_batch_init(512, 0) // done in init()
|
||||||
|
batch.n_tokens = Int32(tokens_list.count)
|
||||||
|
|
||||||
|
for i1 in 0..<batch.n_tokens {
|
||||||
|
let i = Int(i1)
|
||||||
|
batch.token[i] = tokens_list[i]
|
||||||
|
batch.pos[i] = i1
|
||||||
|
batch.n_seq_id[Int(i)] = 1
|
||||||
|
batch.seq_id[Int(i)]![0] = 0
|
||||||
|
batch.logits[i] = 0
|
||||||
|
}
|
||||||
|
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
|
||||||
|
|
||||||
|
if llama_decode(context, batch) != 0 {
|
||||||
|
print("llama_decode() failed")
|
||||||
|
}
|
||||||
|
|
||||||
|
n_cur = batch.n_tokens
|
||||||
|
}
|
||||||
|
|
||||||
|
func completion_loop() -> String {
|
||||||
|
var new_token_id: llama_token = 0
|
||||||
|
|
||||||
|
let n_vocab = llama_n_vocab(model)
|
||||||
|
let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
|
||||||
|
|
||||||
|
var candidates = Array<llama_token_data>()
|
||||||
|
candidates.reserveCapacity(Int(n_vocab))
|
||||||
|
|
||||||
|
for token_id in 0..<n_vocab {
|
||||||
|
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
|
||||||
|
}
|
||||||
|
candidates.withUnsafeMutableBufferPointer() { buffer in
|
||||||
|
var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
|
||||||
|
|
||||||
|
new_token_id = llama_sample_token_greedy(context, &candidates_p)
|
||||||
|
}
|
||||||
|
|
||||||
|
if new_token_id == llama_token_eos(context) || n_cur == n_len {
|
||||||
|
print("\n")
|
||||||
|
let new_token_str = String(cString: temporary_invalid_cchars + [0])
|
||||||
|
temporary_invalid_cchars.removeAll()
|
||||||
|
return new_token_str
|
||||||
|
}
|
||||||
|
|
||||||
|
let new_token_cchars = token_to_piece(token: new_token_id)
|
||||||
|
temporary_invalid_cchars.append(contentsOf: new_token_cchars)
|
||||||
|
let new_token_str: String
|
||||||
|
if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
|
||||||
|
temporary_invalid_cchars.removeAll()
|
||||||
|
new_token_str = string
|
||||||
|
} else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
|
||||||
|
// in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
|
||||||
|
let string = String(cString: temporary_invalid_cchars + [0])
|
||||||
|
temporary_invalid_cchars.removeAll()
|
||||||
|
new_token_str = string
|
||||||
|
} else {
|
||||||
|
new_token_str = ""
|
||||||
|
}
|
||||||
|
print(new_token_str)
|
||||||
|
// tokens_list.append(new_token_id)
|
||||||
|
|
||||||
|
batch.n_tokens = 0
|
||||||
|
|
||||||
|
batch.token[Int(batch.n_tokens)] = new_token_id
|
||||||
|
batch.pos[Int(batch.n_tokens)] = n_cur
|
||||||
|
batch.n_seq_id[Int(batch.n_tokens)] = 1
|
||||||
|
batch.seq_id[Int(batch.n_tokens)]![0] = 0
|
||||||
|
batch.logits[Int(batch.n_tokens)] = 1 // true
|
||||||
|
batch.n_tokens += 1
|
||||||
|
|
||||||
|
n_decode += 1
|
||||||
|
|
||||||
|
n_cur += 1
|
||||||
|
|
||||||
|
if llama_decode(context, batch) != 0 {
|
||||||
|
print("failed to evaluate llama!")
|
||||||
|
}
|
||||||
|
|
||||||
|
return new_token_str
|
||||||
|
}
|
||||||
|
|
||||||
|
func clear() {
|
||||||
|
tokens_list.removeAll()
|
||||||
|
temporary_invalid_cchars.removeAll()
|
||||||
|
}
|
||||||
|
|
||||||
|
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||||
|
let utf8Count = text.utf8.count
|
||||||
|
let n_tokens = utf8Count + (add_bos ? 1 : 0)
|
||||||
|
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
|
||||||
|
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
|
||||||
|
|
||||||
|
var swiftTokens: [llama_token] = []
|
||||||
|
for i in 0..<tokenCount {
|
||||||
|
swiftTokens.append(tokens[Int(i)])
|
||||||
|
}
|
||||||
|
|
||||||
|
tokens.deallocate()
|
||||||
|
|
||||||
|
return swiftTokens
|
||||||
|
}
|
||||||
|
|
||||||
|
/// - note: The result does not contain null-terminator
|
||||||
|
private func token_to_piece(token: llama_token) -> [CChar] {
|
||||||
|
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
|
||||||
|
result.initialize(repeating: Int8(0), count: 8)
|
||||||
|
defer {
|
||||||
|
result.deallocate()
|
||||||
|
}
|
||||||
|
let nTokens = llama_token_to_piece(model, token, result, 8)
|
||||||
|
|
||||||
|
if nTokens < 0 {
|
||||||
|
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
|
||||||
|
newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
|
||||||
|
defer {
|
||||||
|
newResult.deallocate()
|
||||||
|
}
|
||||||
|
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
|
||||||
|
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
|
||||||
|
return Array(bufferPointer)
|
||||||
|
} else {
|
||||||
|
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
|
||||||
|
return Array(bufferPointer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
5
examples/llama.swiftui/llama.cpp.swift/bridging-header.h
Normal file
5
examples/llama.swiftui/llama.cpp.swift/bridging-header.h
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
//
|
||||||
|
// Use this file to import your target's public headers that you would like to expose to Swift.
|
||||||
|
//
|
||||||
|
|
||||||
|
#import "llama.h"
|
481
examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
Normal file
481
examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
Normal file
|
@ -0,0 +1,481 @@
|
||||||
|
// !$*UTF8*$!
|
||||||
|
{
|
||||||
|
archiveVersion = 1;
|
||||||
|
classes = {
|
||||||
|
};
|
||||||
|
objectVersion = 56;
|
||||||
|
objects = {
|
||||||
|
|
||||||
|
/* Begin PBXBuildFile section */
|
||||||
|
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; };
|
||||||
|
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; };
|
||||||
|
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
|
||||||
|
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
|
||||||
|
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; };
|
||||||
|
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
|
||||||
|
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
|
||||||
|
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
|
||||||
|
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
|
||||||
|
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
|
||||||
|
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
|
||||||
|
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
|
||||||
|
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
|
||||||
|
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
|
||||||
|
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
|
||||||
|
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
|
||||||
|
/* End PBXBuildFile section */
|
||||||
|
|
||||||
|
/* Begin PBXFileReference section */
|
||||||
|
542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
|
||||||
|
542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
|
||||||
|
542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
|
||||||
|
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
|
||||||
|
542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
|
||||||
|
542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
|
||||||
|
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
|
||||||
|
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
|
||||||
|
542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
|
||||||
|
542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
|
||||||
|
549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
|
||||||
|
549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
|
||||||
|
549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
|
||||||
|
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
|
||||||
|
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
|
||||||
|
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||||
|
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
|
||||||
|
8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
|
||||||
|
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
|
||||||
|
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
|
||||||
|
8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
|
||||||
|
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; };
|
||||||
|
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
|
||||||
|
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
|
||||||
|
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
|
||||||
|
/* End PBXFileReference section */
|
||||||
|
|
||||||
|
/* Begin PBXFrameworksBuildPhase section */
|
||||||
|
8A1C83702AC328BD0096AF73 /* Frameworks */ = {
|
||||||
|
isa = PBXFrameworksBuildPhase;
|
||||||
|
buildActionMask = 2147483647;
|
||||||
|
files = (
|
||||||
|
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
|
||||||
|
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
|
||||||
|
);
|
||||||
|
runOnlyForDeploymentPostprocessing = 0;
|
||||||
|
};
|
||||||
|
/* End PBXFrameworksBuildPhase section */
|
||||||
|
|
||||||
|
/* Begin PBXGroup section */
|
||||||
|
8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
|
||||||
|
542376092B0D9C40008E6A1C /* ggml-backend.h */,
|
||||||
|
542376062B0D9BEA008E6A1C /* ggml-quants.h */,
|
||||||
|
542376072B0D9BFB008E6A1C /* ggml-quants.c */,
|
||||||
|
549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
|
||||||
|
549479C62AC9E0F200E0F78B /* ggml-metal.h */,
|
||||||
|
549479C52AC9E0F200E0F78B /* ggml-metal.m */,
|
||||||
|
542EA09B2AC8723900A8AEE9 /* ggml.c */,
|
||||||
|
542EA09C2AC8723900A8AEE9 /* ggml.h */,
|
||||||
|
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
|
||||||
|
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
|
||||||
|
542EA0A12AC8729100A8AEE9 /* llama.cpp */,
|
||||||
|
542EA0A22AC8729100A8AEE9 /* llama.h */,
|
||||||
|
);
|
||||||
|
name = llama.cpp;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
8A1C836A2AC328BD0096AF73 = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
|
||||||
|
8A907F312AC7134E006146EA /* llama.cpp.swift */,
|
||||||
|
8A3F84232AC4C891005E2EE8 /* models */,
|
||||||
|
8A1C83752AC328BD0096AF73 /* llama.swiftui */,
|
||||||
|
8A1C83742AC328BD0096AF73 /* Products */,
|
||||||
|
8A39BE082AC7601000BFEB40 /* Frameworks */,
|
||||||
|
);
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
8A1C83742AC328BD0096AF73 /* Products */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
|
||||||
|
);
|
||||||
|
name = Products;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
8A3F84102AC4BD85005E2EE8 /* Resources */,
|
||||||
|
8A9F7C4B2AC332DC008AE1EA /* Models */,
|
||||||
|
8A9F7C4A2AC332BF008AE1EA /* UI */,
|
||||||
|
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
|
||||||
|
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
|
||||||
|
8A1C837C2AC328BE0096AF73 /* Preview Content */,
|
||||||
|
);
|
||||||
|
path = llama.swiftui;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
|
||||||
|
);
|
||||||
|
path = "Preview Content";
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
8A39BE082AC7601000BFEB40 /* Frameworks */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
549479CA2AC9E16000E0F78B /* Metal.framework */,
|
||||||
|
8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
|
||||||
|
);
|
||||||
|
name = Frameworks;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
8A3F84102AC4BD85005E2EE8 /* Resources */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
8A3F84112AC4BD8C005E2EE8 /* models */,
|
||||||
|
);
|
||||||
|
path = Resources;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
8A3F84112AC4BD8C005E2EE8 /* models */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */,
|
||||||
|
);
|
||||||
|
path = models;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
|
||||||
|
8A907F322AC7134E006146EA /* LibLlama.swift */,
|
||||||
|
);
|
||||||
|
path = llama.cpp.swift;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
8A9F7C4A2AC332BF008AE1EA /* UI */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
|
||||||
|
);
|
||||||
|
path = UI;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
8A9F7C4B2AC332DC008AE1EA /* Models */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
|
||||||
|
);
|
||||||
|
path = Models;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
/* End PBXGroup section */
|
||||||
|
|
||||||
|
/* Begin PBXNativeTarget section */
|
||||||
|
8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
|
||||||
|
isa = PBXNativeTarget;
|
||||||
|
buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
|
||||||
|
buildPhases = (
|
||||||
|
8A1C836F2AC328BD0096AF73 /* Sources */,
|
||||||
|
8A1C83702AC328BD0096AF73 /* Frameworks */,
|
||||||
|
8A1C83712AC328BD0096AF73 /* Resources */,
|
||||||
|
);
|
||||||
|
buildRules = (
|
||||||
|
);
|
||||||
|
dependencies = (
|
||||||
|
);
|
||||||
|
name = llama.swiftui;
|
||||||
|
packageProductDependencies = (
|
||||||
|
);
|
||||||
|
productName = llama.swiftui;
|
||||||
|
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
|
||||||
|
productType = "com.apple.product-type.application";
|
||||||
|
};
|
||||||
|
/* End PBXNativeTarget section */
|
||||||
|
|
||||||
|
/* Begin PBXProject section */
|
||||||
|
8A1C836B2AC328BD0096AF73 /* Project object */ = {
|
||||||
|
isa = PBXProject;
|
||||||
|
attributes = {
|
||||||
|
BuildIndependentTargetsInParallel = 1;
|
||||||
|
LastSwiftUpdateCheck = 1500;
|
||||||
|
LastUpgradeCheck = 1500;
|
||||||
|
TargetAttributes = {
|
||||||
|
8A1C83722AC328BD0096AF73 = {
|
||||||
|
CreatedOnToolsVersion = 15.0;
|
||||||
|
LastSwiftMigration = 1500;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
|
||||||
|
compatibilityVersion = "Xcode 14.0";
|
||||||
|
developmentRegion = en;
|
||||||
|
hasScannedForEncodings = 0;
|
||||||
|
knownRegions = (
|
||||||
|
en,
|
||||||
|
Base,
|
||||||
|
);
|
||||||
|
mainGroup = 8A1C836A2AC328BD0096AF73;
|
||||||
|
packageReferences = (
|
||||||
|
);
|
||||||
|
productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
|
||||||
|
projectDirPath = "";
|
||||||
|
projectRoot = "";
|
||||||
|
targets = (
|
||||||
|
8A1C83722AC328BD0096AF73 /* llama.swiftui */,
|
||||||
|
);
|
||||||
|
};
|
||||||
|
/* End PBXProject section */
|
||||||
|
|
||||||
|
/* Begin PBXResourcesBuildPhase section */
|
||||||
|
8A1C83712AC328BD0096AF73 /* Resources */ = {
|
||||||
|
isa = PBXResourcesBuildPhase;
|
||||||
|
buildActionMask = 2147483647;
|
||||||
|
files = (
|
||||||
|
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
|
||||||
|
8A3F84242AC4C891005E2EE8 /* models in Resources */,
|
||||||
|
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
|
||||||
|
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
|
||||||
|
);
|
||||||
|
runOnlyForDeploymentPostprocessing = 0;
|
||||||
|
};
|
||||||
|
/* End PBXResourcesBuildPhase section */
|
||||||
|
|
||||||
|
/* Begin PBXSourcesBuildPhase section */
|
||||||
|
8A1C836F2AC328BD0096AF73 /* Sources */ = {
|
||||||
|
isa = PBXSourcesBuildPhase;
|
||||||
|
buildActionMask = 2147483647;
|
||||||
|
files = (
|
||||||
|
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
|
||||||
|
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
|
||||||
|
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
|
||||||
|
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
|
||||||
|
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
|
||||||
|
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
|
||||||
|
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
|
||||||
|
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
|
||||||
|
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
|
||||||
|
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
|
||||||
|
);
|
||||||
|
runOnlyForDeploymentPostprocessing = 0;
|
||||||
|
};
|
||||||
|
/* End PBXSourcesBuildPhase section */
|
||||||
|
|
||||||
|
/* Begin XCBuildConfiguration section */
|
||||||
|
8A1C837F2AC328BE0096AF73 /* Debug */ = {
|
||||||
|
isa = XCBuildConfiguration;
|
||||||
|
buildSettings = {
|
||||||
|
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||||
|
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
|
||||||
|
CLANG_ANALYZER_NONNULL = YES;
|
||||||
|
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||||
|
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
|
||||||
|
CLANG_ENABLE_MODULES = YES;
|
||||||
|
CLANG_ENABLE_OBJC_ARC = YES;
|
||||||
|
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||||
|
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||||
|
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||||
|
CLANG_WARN_COMMA = YES;
|
||||||
|
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||||
|
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||||
|
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||||
|
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||||
|
CLANG_WARN_EMPTY_BODY = YES;
|
||||||
|
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||||
|
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||||
|
CLANG_WARN_INT_CONVERSION = YES;
|
||||||
|
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||||
|
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||||
|
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||||
|
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||||
|
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
|
||||||
|
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||||
|
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||||
|
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||||
|
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||||
|
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||||
|
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||||
|
COPY_PHASE_STRIP = NO;
|
||||||
|
DEBUG_INFORMATION_FORMAT = dwarf;
|
||||||
|
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||||
|
ENABLE_TESTABILITY = YES;
|
||||||
|
ENABLE_USER_SCRIPT_SANDBOXING = YES;
|
||||||
|
GCC_C_LANGUAGE_STANDARD = gnu17;
|
||||||
|
GCC_DYNAMIC_NO_PIC = NO;
|
||||||
|
GCC_NO_COMMON_BLOCKS = YES;
|
||||||
|
GCC_OPTIMIZATION_LEVEL = 0;
|
||||||
|
GCC_PREPROCESSOR_DEFINITIONS = (
|
||||||
|
"DEBUG=1",
|
||||||
|
"$(inherited)",
|
||||||
|
);
|
||||||
|
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||||
|
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||||
|
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||||
|
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||||
|
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||||
|
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||||
|
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
|
||||||
|
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
|
||||||
|
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
|
||||||
|
MTL_FAST_MATH = YES;
|
||||||
|
ONLY_ACTIVE_ARCH = YES;
|
||||||
|
SDKROOT = iphoneos;
|
||||||
|
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
|
||||||
|
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
|
||||||
|
};
|
||||||
|
name = Debug;
|
||||||
|
};
|
||||||
|
8A1C83802AC328BE0096AF73 /* Release */ = {
|
||||||
|
isa = XCBuildConfiguration;
|
||||||
|
buildSettings = {
|
||||||
|
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||||
|
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
|
||||||
|
CLANG_ANALYZER_NONNULL = YES;
|
||||||
|
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||||
|
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
|
||||||
|
CLANG_ENABLE_MODULES = YES;
|
||||||
|
CLANG_ENABLE_OBJC_ARC = YES;
|
||||||
|
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||||
|
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||||
|
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||||
|
CLANG_WARN_COMMA = YES;
|
||||||
|
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||||
|
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||||
|
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||||
|
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||||
|
CLANG_WARN_EMPTY_BODY = YES;
|
||||||
|
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||||
|
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||||
|
CLANG_WARN_INT_CONVERSION = YES;
|
||||||
|
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||||
|
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||||
|
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||||
|
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||||
|
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
|
||||||
|
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||||
|
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||||
|
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||||
|
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||||
|
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||||
|
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||||
|
COPY_PHASE_STRIP = NO;
|
||||||
|
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
|
||||||
|
ENABLE_NS_ASSERTIONS = NO;
|
||||||
|
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||||
|
ENABLE_USER_SCRIPT_SANDBOXING = YES;
|
||||||
|
GCC_C_LANGUAGE_STANDARD = gnu17;
|
||||||
|
GCC_NO_COMMON_BLOCKS = YES;
|
||||||
|
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||||
|
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||||
|
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||||
|
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||||
|
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||||
|
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||||
|
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
|
||||||
|
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
|
||||||
|
MTL_ENABLE_DEBUG_INFO = NO;
|
||||||
|
MTL_FAST_MATH = YES;
|
||||||
|
SDKROOT = iphoneos;
|
||||||
|
SWIFT_COMPILATION_MODE = wholemodule;
|
||||||
|
VALIDATE_PRODUCT = YES;
|
||||||
|
};
|
||||||
|
name = Release;
|
||||||
|
};
|
||||||
|
8A1C83822AC328BE0096AF73 /* Debug */ = {
|
||||||
|
isa = XCBuildConfiguration;
|
||||||
|
buildSettings = {
|
||||||
|
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||||
|
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
|
||||||
|
CLANG_ENABLE_MODULES = YES;
|
||||||
|
CODE_SIGN_STYLE = Automatic;
|
||||||
|
CURRENT_PROJECT_VERSION = 1;
|
||||||
|
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
|
||||||
|
DEVELOPMENT_TEAM = STLSG3FG8Q;
|
||||||
|
ENABLE_PREVIEWS = YES;
|
||||||
|
GENERATE_INFOPLIST_FILE = YES;
|
||||||
|
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
|
||||||
|
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||||
|
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
|
||||||
|
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||||
|
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||||
|
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||||
|
LD_RUNPATH_SEARCH_PATHS = (
|
||||||
|
"$(inherited)",
|
||||||
|
"@executable_path/Frameworks",
|
||||||
|
);
|
||||||
|
MARKETING_VERSION = 1.0;
|
||||||
|
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
|
||||||
|
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||||
|
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||||
|
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
|
||||||
|
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
|
||||||
|
SWIFT_VERSION = 5.0;
|
||||||
|
TARGETED_DEVICE_FAMILY = "1,2";
|
||||||
|
};
|
||||||
|
name = Debug;
|
||||||
|
};
|
||||||
|
8A1C83832AC328BE0096AF73 /* Release */ = {
|
||||||
|
isa = XCBuildConfiguration;
|
||||||
|
buildSettings = {
|
||||||
|
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||||
|
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
|
||||||
|
CLANG_ENABLE_MODULES = YES;
|
||||||
|
CODE_SIGN_STYLE = Automatic;
|
||||||
|
CURRENT_PROJECT_VERSION = 1;
|
||||||
|
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
|
||||||
|
DEVELOPMENT_TEAM = STLSG3FG8Q;
|
||||||
|
ENABLE_PREVIEWS = YES;
|
||||||
|
GENERATE_INFOPLIST_FILE = YES;
|
||||||
|
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
|
||||||
|
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||||
|
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
|
||||||
|
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||||
|
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||||
|
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||||
|
LD_RUNPATH_SEARCH_PATHS = (
|
||||||
|
"$(inherited)",
|
||||||
|
"@executable_path/Frameworks",
|
||||||
|
);
|
||||||
|
MARKETING_VERSION = 1.0;
|
||||||
|
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
|
||||||
|
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||||
|
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||||
|
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
|
||||||
|
SWIFT_VERSION = 5.0;
|
||||||
|
TARGETED_DEVICE_FAMILY = "1,2";
|
||||||
|
};
|
||||||
|
name = Release;
|
||||||
|
};
|
||||||
|
/* End XCBuildConfiguration section */
|
||||||
|
|
||||||
|
/* Begin XCConfigurationList section */
|
||||||
|
8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
|
||||||
|
isa = XCConfigurationList;
|
||||||
|
buildConfigurations = (
|
||||||
|
8A1C837F2AC328BE0096AF73 /* Debug */,
|
||||||
|
8A1C83802AC328BE0096AF73 /* Release */,
|
||||||
|
);
|
||||||
|
defaultConfigurationIsVisible = 0;
|
||||||
|
defaultConfigurationName = Release;
|
||||||
|
};
|
||||||
|
8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
|
||||||
|
isa = XCConfigurationList;
|
||||||
|
buildConfigurations = (
|
||||||
|
8A1C83822AC328BE0096AF73 /* Debug */,
|
||||||
|
8A1C83832AC328BE0096AF73 /* Release */,
|
||||||
|
);
|
||||||
|
defaultConfigurationIsVisible = 0;
|
||||||
|
defaultConfigurationName = Release;
|
||||||
|
};
|
||||||
|
/* End XCConfigurationList section */
|
||||||
|
};
|
||||||
|
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
|
||||||
|
}
|
7
examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata
generated
Normal file
7
examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata
generated
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<Workspace
|
||||||
|
version = "1.0">
|
||||||
|
<FileRef
|
||||||
|
location = "self:">
|
||||||
|
</FileRef>
|
||||||
|
</Workspace>
|
|
@ -0,0 +1,8 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||||
|
<plist version="1.0">
|
||||||
|
<dict>
|
||||||
|
<key>IDEDidComputeMac32BitWarning</key>
|
||||||
|
<true/>
|
||||||
|
</dict>
|
||||||
|
</plist>
|
|
@ -0,0 +1,11 @@
|
||||||
|
{
|
||||||
|
"colors" : [
|
||||||
|
{
|
||||||
|
"idiom" : "universal"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"info" : {
|
||||||
|
"author" : "xcode",
|
||||||
|
"version" : 1
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,13 @@
|
||||||
|
{
|
||||||
|
"images" : [
|
||||||
|
{
|
||||||
|
"idiom" : "universal",
|
||||||
|
"platform" : "ios",
|
||||||
|
"size" : "1024x1024"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"info" : {
|
||||||
|
"author" : "xcode",
|
||||||
|
"version" : 1
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,6 @@
|
||||||
|
{
|
||||||
|
"info" : {
|
||||||
|
"author" : "xcode",
|
||||||
|
"version" : 1
|
||||||
|
}
|
||||||
|
}
|
45
examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
Normal file
45
examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
import Foundation
|
||||||
|
|
||||||
|
@MainActor
|
||||||
|
class LlamaState: ObservableObject {
|
||||||
|
@Published var messageLog = ""
|
||||||
|
|
||||||
|
private var llamaContext: LlamaContext?
|
||||||
|
private var modelUrl: URL? {
|
||||||
|
Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models")
|
||||||
|
// Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
|
||||||
|
}
|
||||||
|
init() {
|
||||||
|
do {
|
||||||
|
try loadModel()
|
||||||
|
} catch {
|
||||||
|
messageLog += "Error!\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private func loadModel() throws {
|
||||||
|
messageLog += "Loading model...\n"
|
||||||
|
if let modelUrl {
|
||||||
|
llamaContext = try LlamaContext.createContext(path: modelUrl.path())
|
||||||
|
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
|
||||||
|
} else {
|
||||||
|
messageLog += "Could not locate model\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func complete(text: String) async {
|
||||||
|
guard let llamaContext else {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
messageLog += "Attempting to complete text...\n"
|
||||||
|
await llamaContext.completion_init(text: text)
|
||||||
|
messageLog += "\(text)"
|
||||||
|
|
||||||
|
while await llamaContext.n_cur <= llamaContext.n_len {
|
||||||
|
let result = await llamaContext.completion_loop()
|
||||||
|
messageLog += "\(result)"
|
||||||
|
}
|
||||||
|
await llamaContext.clear()
|
||||||
|
messageLog += "\n\ndone\n"
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,6 @@
|
||||||
|
{
|
||||||
|
"info" : {
|
||||||
|
"author" : "xcode",
|
||||||
|
"version" : 1
|
||||||
|
}
|
||||||
|
}
|
0
examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore
vendored
Normal file
0
examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore
vendored
Normal file
42
examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
Normal file
42
examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
import SwiftUI
|
||||||
|
|
||||||
|
struct ContentView: View {
|
||||||
|
@StateObject var llamaState = LlamaState()
|
||||||
|
|
||||||
|
@State private var multiLineText = ""
|
||||||
|
|
||||||
|
var body: some View {
|
||||||
|
VStack {
|
||||||
|
ScrollView(.vertical) {
|
||||||
|
Text(llamaState.messageLog)
|
||||||
|
}
|
||||||
|
|
||||||
|
TextEditor(text: $multiLineText)
|
||||||
|
.frame(height: 200)
|
||||||
|
.padding()
|
||||||
|
.border(Color.gray, width: 0.5)
|
||||||
|
Button(action: {
|
||||||
|
sendText()
|
||||||
|
}) {
|
||||||
|
Text("Send")
|
||||||
|
.padding()
|
||||||
|
.background(Color.blue)
|
||||||
|
.foregroundColor(.white)
|
||||||
|
.cornerRadius(8)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
.padding()
|
||||||
|
}
|
||||||
|
|
||||||
|
func sendText() {
|
||||||
|
Task {
|
||||||
|
await llamaState.complete(text: multiLineText)
|
||||||
|
multiLineText = ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
#Preview {
|
||||||
|
ContentView()
|
||||||
|
}
|
||||||
|
*/
|
10
examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift
Normal file
10
examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
import SwiftUI
|
||||||
|
|
||||||
|
@main
|
||||||
|
struct llama_swiftuiApp: App {
|
||||||
|
var body: some Scene {
|
||||||
|
WindowGroup {
|
||||||
|
ContentView()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -5,7 +5,7 @@ import json
|
||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from gguf import *
|
from gguf import *
|
||||||
from transformers import CLIPModel, CLIPProcessor
|
from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
|
||||||
|
|
||||||
TEXT = "clip.text"
|
TEXT = "clip.text"
|
||||||
VISION = "clip.vision"
|
VISION = "clip.vision"
|
||||||
|
@ -78,11 +78,19 @@ ap.add_argument("--text-only", action="store_true", required=False,
|
||||||
help="Save a text-only model. It can't be used to encode images")
|
help="Save a text-only model. It can't be used to encode images")
|
||||||
ap.add_argument("--vision-only", action="store_true", required=False,
|
ap.add_argument("--vision-only", action="store_true", required=False,
|
||||||
help="Save a vision-only model. It can't be used to encode texts")
|
help="Save a vision-only model. It can't be used to encode texts")
|
||||||
|
ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
|
||||||
|
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
|
||||||
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
|
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
|
||||||
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
|
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
|
||||||
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
|
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
|
||||||
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
||||||
|
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
|
||||||
|
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
|
||||||
|
default_image_std = [0.26862954, 0.26130258, 0.27577711]
|
||||||
|
ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
|
||||||
|
ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
|
||||||
|
|
||||||
|
# with proper
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
@ -96,15 +104,22 @@ if args.use_f32:
|
||||||
# output in the same directory as the model if output_dir is None
|
# output in the same directory as the model if output_dir is None
|
||||||
dir_model = args.model_dir
|
dir_model = args.model_dir
|
||||||
|
|
||||||
|
if args.clip_model_is_vision:
|
||||||
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
|
vocab = None
|
||||||
vocab = json.load(f)
|
tokens = None
|
||||||
tokens = [key for key in vocab]
|
else:
|
||||||
|
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
|
||||||
|
vocab = json.load(f)
|
||||||
|
tokens = [key for key in vocab]
|
||||||
|
|
||||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||||
config = json.load(f)
|
config = json.load(f)
|
||||||
v_hparams = config["vision_config"]
|
if args.clip_model_is_vision:
|
||||||
t_hparams = config["text_config"]
|
v_hparams = config
|
||||||
|
t_hparams = None
|
||||||
|
else:
|
||||||
|
v_hparams = config["vision_config"]
|
||||||
|
t_hparams = config["text_config"]
|
||||||
|
|
||||||
# possible data types
|
# possible data types
|
||||||
# ftype == 0 -> float32
|
# ftype == 0 -> float32
|
||||||
|
@ -117,9 +132,12 @@ ftype = 1
|
||||||
if args.use_f32:
|
if args.use_f32:
|
||||||
ftype = 0
|
ftype = 0
|
||||||
|
|
||||||
|
if args.clip_model_is_vision:
|
||||||
model = CLIPModel.from_pretrained(dir_model)
|
model = CLIPVisionModel.from_pretrained(dir_model)
|
||||||
processor = CLIPProcessor.from_pretrained(dir_model)
|
processor = None
|
||||||
|
else:
|
||||||
|
model = CLIPModel.from_pretrained(dir_model)
|
||||||
|
processor = CLIPProcessor.from_pretrained(dir_model)
|
||||||
|
|
||||||
fname_middle = None
|
fname_middle = None
|
||||||
has_text_encoder = True
|
has_text_encoder = True
|
||||||
|
@ -128,13 +146,13 @@ has_llava_projector = False
|
||||||
if args.text_only:
|
if args.text_only:
|
||||||
fname_middle = "text-"
|
fname_middle = "text-"
|
||||||
has_vision_encoder = False
|
has_vision_encoder = False
|
||||||
elif args.vision_only:
|
|
||||||
fname_middle = "vision-"
|
|
||||||
has_text_encoder = False
|
|
||||||
elif args.llava_projector is not None:
|
elif args.llava_projector is not None:
|
||||||
fname_middle = "mmproj-"
|
fname_middle = "mmproj-"
|
||||||
has_text_encoder = False
|
has_text_encoder = False
|
||||||
has_llava_projector = True
|
has_llava_projector = True
|
||||||
|
elif args.vision_only:
|
||||||
|
fname_middle = "vision-"
|
||||||
|
has_text_encoder = False
|
||||||
else:
|
else:
|
||||||
fname_middle = ""
|
fname_middle = ""
|
||||||
|
|
||||||
|
@ -182,8 +200,12 @@ if has_vision_encoder:
|
||||||
block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
|
block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
|
||||||
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
|
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
|
||||||
|
|
||||||
image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
|
if processor is not None:
|
||||||
image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
|
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
|
||||||
|
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
|
||||||
|
else:
|
||||||
|
image_mean = args.image_mean if args.image_mean is not None else default_image_mean
|
||||||
|
image_std = args.image_std if args.image_std is not None else default_image_std
|
||||||
fout.add_array("clip.vision.image_mean", image_mean)
|
fout.add_array("clip.vision.image_mean", image_mean)
|
||||||
fout.add_array("clip.vision.image_std", image_std)
|
fout.add_array("clip.vision.image_std", image_std)
|
||||||
|
|
||||||
|
|
5
examples/lookahead/CMakeLists.txt
Normal file
5
examples/lookahead/CMakeLists.txt
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
set(TARGET lookahead)
|
||||||
|
add_executable(${TARGET} lookahead.cpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
7
examples/lookahead/README.md
Normal file
7
examples/lookahead/README.md
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
# llama.cpp/examples/lookahead
|
||||||
|
|
||||||
|
Demonstartion of lookahead decoding technique:
|
||||||
|
|
||||||
|
https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||||
|
|
||||||
|
More info: https://github.com/ggerganov/llama.cpp/pull/4207
|
487
examples/lookahead/lookahead.cpp
Normal file
487
examples/lookahead/lookahead.cpp
Normal file
|
@ -0,0 +1,487 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
struct ngram_data {
|
||||||
|
bool active = false;
|
||||||
|
|
||||||
|
llama_seq_id seq_id = -1;
|
||||||
|
|
||||||
|
std::vector<int> i_batch;
|
||||||
|
|
||||||
|
std::vector<llama_token> tokens;
|
||||||
|
};
|
||||||
|
|
||||||
|
// n-gram container
|
||||||
|
struct ngram_container {
|
||||||
|
ngram_container(int n_vocab, int N, int G) {
|
||||||
|
cnt.resize(n_vocab);
|
||||||
|
head.resize(n_vocab);
|
||||||
|
tokens.resize(n_vocab * G * (N - 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
int n_total = 0;
|
||||||
|
|
||||||
|
std::vector<int> cnt;
|
||||||
|
std::vector<int> head;
|
||||||
|
|
||||||
|
// [n_vocab][G][N - 1]
|
||||||
|
// for each token of the vocab, keep a ring-buffer of capacity G of n-grams of size N - 1
|
||||||
|
std::vector<llama_token> tokens;
|
||||||
|
};
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
gpt_params params;
|
||||||
|
|
||||||
|
if (gpt_params_parse(argc, argv, params) == false) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int W = 15; // lookahead window
|
||||||
|
const int N = 5; // n-gram size
|
||||||
|
const int G = 15; // max verification n-grams
|
||||||
|
|
||||||
|
const bool dump_kv_cache = params.dump_kv_cache;
|
||||||
|
|
||||||
|
#ifndef LOG_DISABLE_LOGS
|
||||||
|
log_set_target(log_filename_generator("lookahead", "log"));
|
||||||
|
LOG_TEE("Log start\n");
|
||||||
|
log_dump_cmdline(argc, argv);
|
||||||
|
#endif // LOG_DISABLE_LOGS
|
||||||
|
|
||||||
|
// init llama.cpp
|
||||||
|
llama_backend_init(params.numa);
|
||||||
|
|
||||||
|
llama_model * model = NULL;
|
||||||
|
llama_context * ctx = NULL;
|
||||||
|
|
||||||
|
// load the target model
|
||||||
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
|
// Tokenize the prompt
|
||||||
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
|
LOG("add_bos tgt: %d\n", add_bos);
|
||||||
|
|
||||||
|
std::vector<llama_token> inp;
|
||||||
|
std::vector<llama_token> all;
|
||||||
|
|
||||||
|
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||||
|
all = inp;
|
||||||
|
|
||||||
|
const int max_context_size = llama_n_ctx(ctx);
|
||||||
|
const int max_tokens_list_size = max_context_size - 4;
|
||||||
|
|
||||||
|
if ((int) inp.size() > max_tokens_list_size) {
|
||||||
|
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "\n\n");
|
||||||
|
|
||||||
|
for (auto id : inp) {
|
||||||
|
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
fflush(stderr);
|
||||||
|
|
||||||
|
const int n_input = inp.size();
|
||||||
|
|
||||||
|
const auto t_enc_start = ggml_time_us();
|
||||||
|
|
||||||
|
// eval the prompt
|
||||||
|
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
|
||||||
|
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
|
||||||
|
|
||||||
|
for (int s = 1; s < W + G + 1; ++s) {
|
||||||
|
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto t_enc_end = ggml_time_us();
|
||||||
|
|
||||||
|
int n_predict = 0;
|
||||||
|
int n_accept = 0;
|
||||||
|
|
||||||
|
int n_past = inp.size();
|
||||||
|
|
||||||
|
llama_token id = 0;
|
||||||
|
|
||||||
|
// used to determine end of generation
|
||||||
|
bool has_eos = false;
|
||||||
|
|
||||||
|
// for each decoded batch, we have at most W + G + 1 distinct sequences:
|
||||||
|
// seq_id == 0 : the current input token
|
||||||
|
// seq_id [1, W] : tokens from the past N - 1 Jacobi iterations
|
||||||
|
// seq_id [W + 1, W + G] : verification n-grams
|
||||||
|
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
||||||
|
|
||||||
|
// target model sampling context
|
||||||
|
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
||||||
|
|
||||||
|
// verification n-grams
|
||||||
|
std::vector<ngram_data> ngrams_cur(G);
|
||||||
|
|
||||||
|
// tokens for the past N - 1 Jacobi iterations
|
||||||
|
std::vector<llama_token> tokens_j_prev(W);
|
||||||
|
std::vector<std::vector<llama_token>> tokens_j(N - 1);
|
||||||
|
for (int j = 0; j < N - 1; j++) {
|
||||||
|
tokens_j[j].resize(W);
|
||||||
|
|
||||||
|
for (int i = 0; i < W; i++) {
|
||||||
|
// there are different ways to init these tokens
|
||||||
|
if (0) {
|
||||||
|
// initialize randomly from the prompt tokens
|
||||||
|
tokens_j[j][i] = all[1 + rand() % (all.size() - 1)];
|
||||||
|
} else {
|
||||||
|
// initialize with a sequence of increasing numbers
|
||||||
|
tokens_j[j][i] = 100 + i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_seq_id> seq_id_look;
|
||||||
|
|
||||||
|
// the input token belongs both to all sequences
|
||||||
|
std::vector<llama_seq_id> seq_id_all(W + G + 1);
|
||||||
|
for (int i = 0; i < W + G + 1; i++) {
|
||||||
|
seq_id_all[i] = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
// here we keep adding new n-grams as we go
|
||||||
|
ngram_container ngrams_observed(llama_n_vocab(model), N, G);
|
||||||
|
|
||||||
|
// debug
|
||||||
|
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
|
||||||
|
|
||||||
|
const auto t_dec_start = ggml_time_us();
|
||||||
|
|
||||||
|
// sample first token
|
||||||
|
{
|
||||||
|
id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
|
||||||
|
|
||||||
|
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||||
|
|
||||||
|
{
|
||||||
|
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||||
|
|
||||||
|
printf("%s", token_str.c_str());
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
// debug
|
||||||
|
if (dump_kv_cache) {
|
||||||
|
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||||
|
dump_kv_cache_view_seqs(kvc_view, 40);
|
||||||
|
}
|
||||||
|
|
||||||
|
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||||
|
//
|
||||||
|
// Example for W = 5, N = 4, G = 2:
|
||||||
|
// (I = input, L = lookahead, V = verification)
|
||||||
|
//
|
||||||
|
// Batch: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
|
||||||
|
// T: -2 -2 -2 -2 -1 -1 -1 -1 -1 0 0 0 0 0 0
|
||||||
|
// Info: I L L L L L L L L L L L L L L V V V V V V
|
||||||
|
// Pos: 0 1 2 3 4 1 2 3 4 5 2 3 4 5 6 1 2 3 1 2 3 (+ n_past)
|
||||||
|
// Logits: 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
||||||
|
// ---------------------------------------------------------------------
|
||||||
|
// Seq: 0
|
||||||
|
// 1 1 1
|
||||||
|
// 2 2 2 2
|
||||||
|
// 3 3 3 3 3
|
||||||
|
// 4 4 4 4 4 4
|
||||||
|
// 5 5 5 5 5 5 5
|
||||||
|
// 6 6 6 6
|
||||||
|
// 7 7 7 7
|
||||||
|
// ---------------------------------------------------------------------
|
||||||
|
// | | | | | | | | | | |
|
||||||
|
// V V V V V | | | | | |
|
||||||
|
// j_tokens | | | | | |
|
||||||
|
// V V V V V V
|
||||||
|
// id
|
||||||
|
{
|
||||||
|
llama_batch_clear(batch);
|
||||||
|
|
||||||
|
// current token - first token of the first level
|
||||||
|
llama_batch_add(batch, id, n_past, seq_id_all, true);
|
||||||
|
|
||||||
|
// verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
|
||||||
|
{
|
||||||
|
const int g_cur = ngrams_observed.cnt[id];
|
||||||
|
|
||||||
|
ngrams_cur.resize(g_cur);
|
||||||
|
for (int g = 0; g < g_cur; g++) {
|
||||||
|
ngrams_cur[g].active = true;
|
||||||
|
ngrams_cur[g].tokens.resize(N);
|
||||||
|
ngrams_cur[g].i_batch.resize(N);
|
||||||
|
ngrams_cur[g].seq_id = W + 1 + g;
|
||||||
|
ngrams_cur[g].i_batch[0] = 0;
|
||||||
|
ngrams_cur[g].tokens [0] = id;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j = 0; j < N - 1; j++) {
|
||||||
|
for (int g = 0; g < g_cur; g++) {
|
||||||
|
const int idx = id*(N - 1)*G + g*(N - 1);
|
||||||
|
|
||||||
|
const llama_token t = ngrams_observed.tokens[idx + j];
|
||||||
|
|
||||||
|
ngrams_cur[g].tokens [j + 1] = t;
|
||||||
|
ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
|
||||||
|
|
||||||
|
llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// fill the remaining W - 1 tokens for the first level
|
||||||
|
for (int i = 1; i < W; i++) {
|
||||||
|
seq_id_look.resize(W - i);
|
||||||
|
for (int j = 0; j < W - i; j++) {
|
||||||
|
seq_id_look[j] = i + j + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// fill the rest of the levels
|
||||||
|
for (int j = 1; j < N - 1; j++) {
|
||||||
|
for (int i = 0; i < W; i++) {
|
||||||
|
llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (llama_decode(ctx, batch) != 0) {
|
||||||
|
fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int seq_id_best = 0;
|
||||||
|
|
||||||
|
for (int v = 0; v < N; ++v) {
|
||||||
|
int i_batch = 0;
|
||||||
|
|
||||||
|
// if no active ngrams are left, it means the sampled token does not pass the verification
|
||||||
|
if (v > 0) {
|
||||||
|
for (int g = 0; g < (int) ngrams_cur.size(); g++) {
|
||||||
|
if (ngrams_cur[g].active) {
|
||||||
|
i_batch = ngrams_cur[g].i_batch[v];
|
||||||
|
seq_id_best = ngrams_cur[g].seq_id;
|
||||||
|
|
||||||
|
++n_accept;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// no more matches -> create a new batch
|
||||||
|
if (i_batch == 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sample the next token
|
||||||
|
id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
|
||||||
|
|
||||||
|
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||||
|
|
||||||
|
// print
|
||||||
|
{
|
||||||
|
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||||
|
|
||||||
|
if (v == 0) {
|
||||||
|
printf("%s", token_str.c_str());
|
||||||
|
} else {
|
||||||
|
// print light cyan
|
||||||
|
printf("\033[0;96m%s\033[0m", token_str.c_str());
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
if (id == llama_token_eos(model)) {
|
||||||
|
has_eos = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
all.push_back(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
++n_predict;
|
||||||
|
++n_past;
|
||||||
|
|
||||||
|
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// verify across active n-grams
|
||||||
|
for (int g = 0; g < (int) ngrams_cur.size(); g++) {
|
||||||
|
if (ngrams_cur[g].active) {
|
||||||
|
if (v == N - 1) {
|
||||||
|
ngrams_cur[g].active = false;
|
||||||
|
} else {
|
||||||
|
if (id != ngrams_cur[g].tokens[v + 1]) {
|
||||||
|
ngrams_cur[g].active = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// print known n-grams starting with token id (debug)
|
||||||
|
if (0 && v == 0) {
|
||||||
|
if (ngrams_observed.cnt[id] > 0) {
|
||||||
|
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
|
||||||
|
printf(" - ngram %2d: ", i);
|
||||||
|
|
||||||
|
const int idx = id*(N - 1)*G + i*(N - 1);
|
||||||
|
|
||||||
|
for (int j = 0; j < N - 1; j++) {
|
||||||
|
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
|
||||||
|
|
||||||
|
printf("%s", token_str.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// update lookahead tokens
|
||||||
|
{
|
||||||
|
for (int i = 0; i < W; i++) {
|
||||||
|
tokens_j_prev[i] = tokens_j[0][i];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j = 0; j < N - 2; j++) {
|
||||||
|
tokens_j[j] = tokens_j[j + 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (v == 0) {
|
||||||
|
// sample from the last level
|
||||||
|
for (int i = 0; i < W; i++) {
|
||||||
|
tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int i = 0; i < W; i++) {
|
||||||
|
// there are different ways to init these tokens
|
||||||
|
if (0) {
|
||||||
|
// random init
|
||||||
|
tokens_j[N - 2][i] = all[1 + rand() % (all.size() - 1)];
|
||||||
|
} else {
|
||||||
|
// init from the previous level
|
||||||
|
tokens_j[N - 2][i] = tokens_j[0][i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// update observed ngrams
|
||||||
|
if (v == 0) {
|
||||||
|
// the first token of the n-gram is determined by the index in the container so it is not stored
|
||||||
|
std::vector<llama_token> ngram(N - 1);
|
||||||
|
|
||||||
|
// n-gram generation
|
||||||
|
// ref: https://github.com/hao-ai-lab/LookaheadDecoding/issues/14#issuecomment-1826198518
|
||||||
|
for (int f = 0; f < W; ++f) {
|
||||||
|
const int ft = tokens_j_prev[f]; // first token of the n-gram
|
||||||
|
|
||||||
|
for (int j = 0; j < N - 1; ++j) {
|
||||||
|
ngram[j] = tokens_j[j][f];
|
||||||
|
}
|
||||||
|
|
||||||
|
// filter-out repeating n-grams
|
||||||
|
{
|
||||||
|
bool is_unique = true;
|
||||||
|
|
||||||
|
for (int k = 0; k < ngrams_observed.cnt[ft]; ++k) {
|
||||||
|
const int idx = ft*(N - 1)*G + k*(N - 1);
|
||||||
|
|
||||||
|
bool is_match = true;
|
||||||
|
for (int j = 0; j < N - 1; ++j) {
|
||||||
|
if (ngrams_observed.tokens[idx + j] != ngram[j]) {
|
||||||
|
is_match = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is_match) {
|
||||||
|
is_unique = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!is_unique) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const int head = ngrams_observed.head[ft];
|
||||||
|
const int idx = ft*(N - 1)*G + head*(N - 1);
|
||||||
|
|
||||||
|
for (int i = 0; i < N - 1; i++) {
|
||||||
|
ngrams_observed.tokens[idx + i] = ngram[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
ngrams_observed.cnt[ft] = std::min(G, ngrams_observed.cnt[ft] + 1);
|
||||||
|
ngrams_observed.head[ft] = (head + 1) % G;
|
||||||
|
|
||||||
|
ngrams_observed.n_total++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// KV cache management
|
||||||
|
// if no verification token matched, we simply remove all cells from this batch -> no fragmentation
|
||||||
|
llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
|
||||||
|
|
||||||
|
if (seq_id_best != 0) {
|
||||||
|
// if a verification token matched, we keep the best sequence and remove the rest
|
||||||
|
// this leads to some KV cache fragmentation
|
||||||
|
llama_kv_cache_seq_keep(ctx, seq_id_best);
|
||||||
|
llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1);
|
||||||
|
llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1);
|
||||||
|
|
||||||
|
for (int s = 1; s < W + G + 1; ++s) {
|
||||||
|
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t_dec_end = ggml_time_us();
|
||||||
|
|
||||||
|
LOG_TEE("\n\n");
|
||||||
|
|
||||||
|
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||||
|
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||||
|
|
||||||
|
LOG_TEE("\n");
|
||||||
|
LOG_TEE("W = %2d\n", W);
|
||||||
|
LOG_TEE("N = %2d\n", N);
|
||||||
|
LOG_TEE("G = %2d\n", G);
|
||||||
|
LOG_TEE("\n");
|
||||||
|
LOG_TEE("n_predict = %d\n", n_predict);
|
||||||
|
LOG_TEE("n_accept = %d\n", n_accept);
|
||||||
|
|
||||||
|
llama_print_timings(ctx);
|
||||||
|
|
||||||
|
llama_kv_cache_view_free(&kvc_view);
|
||||||
|
llama_sampling_free(ctx_sampling);
|
||||||
|
|
||||||
|
llama_batch_free(batch);
|
||||||
|
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
fprintf(stderr, "\n\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -100,6 +100,12 @@ static void sigint_handler(int signo) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
||||||
|
(void) level;
|
||||||
|
(void) user_data;
|
||||||
|
LOG_TEE("%s", text);
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
g_params = ¶ms;
|
g_params = ¶ms;
|
||||||
|
@ -113,6 +119,7 @@ int main(int argc, char ** argv) {
|
||||||
log_set_target(log_filename_generator("main", "log"));
|
log_set_target(log_filename_generator("main", "log"));
|
||||||
LOG_TEE("Log start\n");
|
LOG_TEE("Log start\n");
|
||||||
log_dump_cmdline(argc, argv);
|
log_dump_cmdline(argc, argv);
|
||||||
|
llama_log_set(llama_log_callback_logTee, nullptr);
|
||||||
#endif // LOG_DISABLE_LOGS
|
#endif // LOG_DISABLE_LOGS
|
||||||
|
|
||||||
// TODO: Dump params ?
|
// TODO: Dump params ?
|
||||||
|
@ -234,8 +241,11 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
|
|
||||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
|
||||||
LOG("tokenize the prompt\n");
|
LOG("tokenize the prompt\n");
|
||||||
|
if (params.chatml) {
|
||||||
|
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
|
||||||
|
}
|
||||||
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||||
} else {
|
} else {
|
||||||
LOG("use session tokens\n");
|
LOG("use session tokens\n");
|
||||||
|
@ -313,7 +323,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// number of tokens to keep when resetting context
|
// number of tokens to keep when resetting context
|
||||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
|
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
|
||||||
params.n_keep = (int)embd_inp.size();
|
params.n_keep = (int)embd_inp.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -324,11 +334,23 @@ int main(int argc, char ** argv) {
|
||||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
||||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
||||||
|
|
||||||
|
// chatml prefix & suffix
|
||||||
|
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
|
||||||
|
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
|
||||||
|
|
||||||
|
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
|
||||||
|
LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
|
||||||
|
|
||||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||||
if (params.instruct) {
|
if (params.instruct) {
|
||||||
params.interactive_first = true;
|
params.interactive_first = true;
|
||||||
params.antiprompt.push_back("### Instruction:\n\n");
|
params.antiprompt.push_back("### Instruction:\n\n");
|
||||||
}
|
}
|
||||||
|
// similar for chatml mode
|
||||||
|
else if (params.chatml) {
|
||||||
|
params.interactive_first = true;
|
||||||
|
params.antiprompt.push_back("<|im_start|>user\n");
|
||||||
|
}
|
||||||
|
|
||||||
// enable interactive mode if interactive start is specified
|
// enable interactive mode if interactive start is specified
|
||||||
if (params.interactive_first) {
|
if (params.interactive_first) {
|
||||||
|
@ -415,6 +437,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
||||||
|
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
|
||||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||||
LOG_TEE("\n\n");
|
LOG_TEE("\n\n");
|
||||||
|
|
||||||
|
@ -705,7 +728,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
printf("\n");
|
printf("\n");
|
||||||
} else if (params.instruct) {
|
} else if (params.instruct || params.chatml) {
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -713,7 +736,7 @@ int main(int argc, char ** argv) {
|
||||||
if (n_past > 0 && is_interacting) {
|
if (n_past > 0 && is_interacting) {
|
||||||
LOG("waiting for user input\n");
|
LOG("waiting for user input\n");
|
||||||
|
|
||||||
if (params.instruct) {
|
if (params.instruct || params.chatml) {
|
||||||
printf("\n> ");
|
printf("\n> ");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -760,6 +783,12 @@ int main(int argc, char ** argv) {
|
||||||
n_consumed = embd_inp.size();
|
n_consumed = embd_inp.size();
|
||||||
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
||||||
}
|
}
|
||||||
|
// chatml mode: insert user chat prefix
|
||||||
|
if (params.chatml && !is_antiprompt) {
|
||||||
|
LOG("inserting chatml prefix\n");
|
||||||
|
n_consumed = embd_inp.size();
|
||||||
|
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
|
||||||
|
}
|
||||||
if (params.escape) {
|
if (params.escape) {
|
||||||
process_escapes(buffer);
|
process_escapes(buffer);
|
||||||
}
|
}
|
||||||
|
@ -778,6 +807,11 @@ int main(int argc, char ** argv) {
|
||||||
LOG("inserting instruction suffix\n");
|
LOG("inserting instruction suffix\n");
|
||||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||||
}
|
}
|
||||||
|
// chatml mode: insert assistant chat suffix
|
||||||
|
if (params.chatml) {
|
||||||
|
LOG("inserting chatml suffix\n");
|
||||||
|
embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
|
||||||
|
}
|
||||||
|
|
||||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||||
const llama_token token = embd_inp[i];
|
const llama_token token = embd_inp[i];
|
||||||
|
@ -803,7 +837,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// end of text token
|
// end of text token
|
||||||
if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
|
if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
|
||||||
LOG_TEE(" [end of text]\n");
|
LOG_TEE(" [end of text]\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
// A basic application simulating a server with multiple clients.
|
// A basic application simulating a server with multiple clients.
|
||||||
// The clients submite requests to the server and they are processed in parallel.
|
// The clients submit requests to the server and they are processed in parallel.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
@ -113,6 +113,8 @@ int main(int argc, char ** argv) {
|
||||||
// insert new requests as soon as the previous one is done
|
// insert new requests as soon as the previous one is done
|
||||||
const bool cont_batching = params.cont_batching;
|
const bool cont_batching = params.cont_batching;
|
||||||
|
|
||||||
|
const bool dump_kv_cache = params.dump_kv_cache;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
log_set_target(log_filename_generator("parallel", "log"));
|
log_set_target(log_filename_generator("parallel", "log"));
|
||||||
LOG_TEE("Log start\n");
|
LOG_TEE("Log start\n");
|
||||||
|
@ -172,6 +174,8 @@ int main(int argc, char ** argv) {
|
||||||
int32_t n_total_gen = 0;
|
int32_t n_total_gen = 0;
|
||||||
int32_t n_cache_miss = 0;
|
int32_t n_cache_miss = 0;
|
||||||
|
|
||||||
|
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
|
||||||
|
|
||||||
const auto t_main_start = ggml_time_us();
|
const auto t_main_start = ggml_time_us();
|
||||||
|
|
||||||
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
|
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
|
||||||
|
@ -201,6 +205,11 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("Processing requests ...\n\n");
|
LOG_TEE("Processing requests ...\n\n");
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
|
if (dump_kv_cache) {
|
||||||
|
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||||
|
dump_kv_cache_view_seqs(kvc_view, 40);
|
||||||
|
}
|
||||||
|
|
||||||
llama_batch_clear(batch);
|
llama_batch_clear(batch);
|
||||||
|
|
||||||
// decode any currently ongoing sequences
|
// decode any currently ongoing sequences
|
||||||
|
|
|
@ -321,7 +321,6 @@ int main(int argc, char ** argv) {
|
||||||
auto cparams = llama_context_default_params();
|
auto cparams = llama_context_default_params();
|
||||||
cparams.n_ctx = 256;
|
cparams.n_ctx = 256;
|
||||||
cparams.seed = 1;
|
cparams.seed = 1;
|
||||||
cparams.f16_kv = false;
|
|
||||||
|
|
||||||
ctx = llama_new_context_with_model(model, cparams);
|
ctx = llama_new_context_with_model(model, cparams);
|
||||||
|
|
||||||
|
|
|
@ -234,6 +234,55 @@ node index.js
|
||||||
|
|
||||||
- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
|
- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
|
||||||
|
|
||||||
|
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.
|
||||||
|
|
||||||
|
*Options:*
|
||||||
|
|
||||||
|
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported.
|
||||||
|
|
||||||
|
*Examples:*
|
||||||
|
|
||||||
|
You can use either Python `openai` library with appropriate checkpoints:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import openai
|
||||||
|
|
||||||
|
client = openai.OpenAI(
|
||||||
|
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
|
||||||
|
api_key = "sk-no-key-required"
|
||||||
|
)
|
||||||
|
|
||||||
|
completion = client.chat.completions.create(
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
messages=[
|
||||||
|
{"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
|
||||||
|
{"role": "user", "content": "Write a limerick about python exceptions"}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
print(completion.choices[0].message)
|
||||||
|
```
|
||||||
|
... or raw HTTP requests:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl http://localhost:8080/v1/chat/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-H "Authorization: Bearer no-key" \
|
||||||
|
-d '{
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Write a limerick about python exceptions"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
## More examples
|
## More examples
|
||||||
|
|
||||||
### Change system prompt on runtime
|
### Change system prompt on runtime
|
||||||
|
|
|
@ -11,10 +11,10 @@ app = Flask(__name__)
|
||||||
slot_id = -1
|
slot_id = -1
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
|
parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
|
||||||
parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
|
parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')
|
||||||
parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ")
|
parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: 'USER: ')", default="USER: ")
|
||||||
parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ")
|
parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: 'ASSISTANT: ')", default="ASSISTANT: ")
|
||||||
parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ")
|
parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: 'ASSISTANT's RULE: ')", default="ASSISTANT's RULE: ")
|
||||||
parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '</s>')", default="</s>")
|
parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '</s>')", default="</s>")
|
||||||
parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080')
|
parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080')
|
||||||
parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="")
|
parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="")
|
||||||
|
@ -34,19 +34,19 @@ def is_present(json, key):
|
||||||
|
|
||||||
#convert chat to prompt
|
#convert chat to prompt
|
||||||
def convert_chat(messages):
|
def convert_chat(messages):
|
||||||
prompt = "" + args.chat_prompt.replace("\\n", "\n")
|
|
||||||
|
|
||||||
system_n = args.system_name.replace("\\n", "\n")
|
system_n = args.system_name
|
||||||
user_n = args.user_name.replace("\\n", "\n")
|
user_n = args.user_name
|
||||||
ai_n = args.ai_name.replace("\\n", "\n")
|
ai_n = args.ai_name
|
||||||
stop = args.stop.replace("\\n", "\n")
|
stop = args.stop
|
||||||
|
|
||||||
|
prompt = "" + args.chat_prompt + stop
|
||||||
|
|
||||||
for line in messages:
|
for line in messages:
|
||||||
if (line["role"] == "system"):
|
if (line["role"] == "system"):
|
||||||
prompt += f"{system_n}{line['content']}"
|
prompt += f"{system_n}{line['content']}{stop}"
|
||||||
if (line["role"] == "user"):
|
if (line["role"] == "user"):
|
||||||
prompt += f"{user_n}{line['content']}"
|
prompt += f"{user_n}{line['content']}{stop}"
|
||||||
if (line["role"] == "assistant"):
|
if (line["role"] == "assistant"):
|
||||||
prompt += f"{ai_n}{line['content']}{stop}"
|
prompt += f"{ai_n}{line['content']}{stop}"
|
||||||
prompt += ai_n.rstrip()
|
prompt += ai_n.rstrip()
|
||||||
|
@ -70,6 +70,7 @@ def make_postData(body, chat=False, stream=False):
|
||||||
if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"]
|
if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"]
|
||||||
if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"]
|
if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"]
|
||||||
if(is_present(body, "seed")): postData["seed"] = body["seed"]
|
if(is_present(body, "seed")): postData["seed"] = body["seed"]
|
||||||
|
if(is_present(body, "grammar")): postData["grammar"] = body["grammar"]
|
||||||
if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()]
|
if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()]
|
||||||
if (args.stop != ""):
|
if (args.stop != ""):
|
||||||
postData["stop"] = [args.stop]
|
postData["stop"] = [args.stop]
|
||||||
|
@ -130,7 +131,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
slot_id = data["slot_id"]
|
slot_id = data.get("slot_id")
|
||||||
if (chat):
|
if (chat):
|
||||||
if (start):
|
if (start):
|
||||||
resData["choices"][0]["delta"] = {
|
resData["choices"][0]["delta"] = {
|
||||||
|
@ -150,11 +151,13 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
|
||||||
return resData
|
return resData
|
||||||
|
|
||||||
|
|
||||||
@app.route('/chat/completions', methods=['POST'])
|
@app.route('/chat/completions', methods=['POST', 'OPTIONS'])
|
||||||
@app.route('/v1/chat/completions', methods=['POST'])
|
@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS'])
|
||||||
def chat_completions():
|
def chat_completions():
|
||||||
if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
|
if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
|
||||||
return Response(status=403)
|
return Response(status=403)
|
||||||
|
if request.method == 'OPTIONS':
|
||||||
|
return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
|
||||||
body = request.get_json()
|
body = request.get_json()
|
||||||
stream = False
|
stream = False
|
||||||
tokenize = False
|
tokenize = False
|
||||||
|
@ -177,20 +180,22 @@ def chat_completions():
|
||||||
data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
|
data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
|
||||||
time_now = int(time.time())
|
time_now = int(time.time())
|
||||||
resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
|
resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
|
||||||
yield 'data: {}\n'.format(json.dumps(resData))
|
yield 'data: {}\n\n'.format(json.dumps(resData))
|
||||||
for line in data.iter_lines():
|
for line in data.iter_lines():
|
||||||
if line:
|
if line:
|
||||||
decoded_line = line.decode('utf-8')
|
decoded_line = line.decode('utf-8')
|
||||||
resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
|
resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
|
||||||
yield 'data: {}\n'.format(json.dumps(resData))
|
yield 'data: {}\n\n'.format(json.dumps(resData))
|
||||||
return Response(generate(), mimetype='text/event-stream')
|
return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
|
||||||
|
|
||||||
|
|
||||||
@app.route('/completions', methods=['POST'])
|
@app.route('/completions', methods=['POST', 'OPTIONS'])
|
||||||
@app.route('/v1/completions', methods=['POST'])
|
@app.route('/v1/completions', methods=['POST', 'OPTIONS'])
|
||||||
def completion():
|
def completion():
|
||||||
if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
|
if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
|
||||||
return Response(status=403)
|
return Response(status=403)
|
||||||
|
if request.method == 'OPTIONS':
|
||||||
|
return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
|
||||||
body = request.get_json()
|
body = request.get_json()
|
||||||
stream = False
|
stream = False
|
||||||
tokenize = False
|
tokenize = False
|
||||||
|
@ -216,8 +221,8 @@ def completion():
|
||||||
if line:
|
if line:
|
||||||
decoded_line = line.decode('utf-8')
|
decoded_line = line.decode('utf-8')
|
||||||
resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
|
resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
|
||||||
yield 'data: {}\n'.format(json.dumps(resData))
|
yield 'data: {}\n\n'.format(json.dumps(resData))
|
||||||
return Response(generate(), mimetype='text/event-stream')
|
return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
app.run(args.host, port=args.port)
|
app.run(args.host, port=args.port)
|
||||||
|
|
|
@ -94,6 +94,10 @@ export async function* llama(prompt, params = {}, config = {}) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (result.error) {
|
||||||
|
result.error = JSON.parse(result.error);
|
||||||
|
console.error(`llama.cpp error: ${result.error.content}`);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,6 +29,8 @@
|
||||||
#define SERVER_VERBOSE 1
|
#define SERVER_VERBOSE 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
||||||
|
|
||||||
using json = nlohmann::json;
|
using json = nlohmann::json;
|
||||||
|
|
||||||
struct server_params
|
struct server_params
|
||||||
|
@ -59,6 +61,10 @@ static bool server_verbose = false;
|
||||||
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
|
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||||
|
|
||||||
|
json oaicompat_completion_params_parse(const json &body);
|
||||||
|
std::string format_chatml(std::vector<json> messages);
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// base64 utils (TODO: move to common in the future)
|
// base64 utils (TODO: move to common in the future)
|
||||||
//
|
//
|
||||||
|
@ -149,15 +155,23 @@ struct task_server {
|
||||||
json data;
|
json data;
|
||||||
bool infill_mode = false;
|
bool infill_mode = false;
|
||||||
bool embedding_mode = false;
|
bool embedding_mode = false;
|
||||||
|
int multitask_id = -1;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct task_result {
|
struct task_result {
|
||||||
int id;
|
int id;
|
||||||
|
int multitask_id = -1;
|
||||||
bool stop;
|
bool stop;
|
||||||
bool error;
|
bool error;
|
||||||
json result_json;
|
json result_json;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct task_multi {
|
||||||
|
int id;
|
||||||
|
std::set<int> subtasks_remaining{};
|
||||||
|
std::vector<task_result> results{};
|
||||||
|
};
|
||||||
|
|
||||||
// TODO: can become bool if we can't find use of more states
|
// TODO: can become bool if we can't find use of more states
|
||||||
enum slot_state
|
enum slot_state
|
||||||
{
|
{
|
||||||
|
@ -378,6 +392,9 @@ struct llama_client_slot
|
||||||
bool stopped_word = false;
|
bool stopped_word = false;
|
||||||
bool stopped_limit = false;
|
bool stopped_limit = false;
|
||||||
|
|
||||||
|
bool oaicompat = false;
|
||||||
|
std::string oaicompat_model;
|
||||||
|
|
||||||
std::string stopping_word;
|
std::string stopping_word;
|
||||||
|
|
||||||
// sampling
|
// sampling
|
||||||
|
@ -397,6 +414,9 @@ struct llama_client_slot
|
||||||
double t_prompt_processing; // ms
|
double t_prompt_processing; // ms
|
||||||
double t_token_generation; // ms
|
double t_token_generation; // ms
|
||||||
|
|
||||||
|
// multitasks
|
||||||
|
int multitask_id = -1;
|
||||||
|
|
||||||
void reset() {
|
void reset() {
|
||||||
num_prompt_tokens = 0;
|
num_prompt_tokens = 0;
|
||||||
generated_text = "";
|
generated_text = "";
|
||||||
|
@ -477,7 +497,7 @@ struct llama_client_slot
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_timings() {
|
void print_timings() const {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||||
__func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
|
__func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
|
||||||
|
@ -520,7 +540,8 @@ struct llama_server_context
|
||||||
|
|
||||||
std::vector<task_server> queue_tasks;
|
std::vector<task_server> queue_tasks;
|
||||||
std::vector<task_result> queue_results;
|
std::vector<task_result> queue_results;
|
||||||
std::mutex mutex_tasks;
|
std::vector<task_multi> queue_multitasks;
|
||||||
|
std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
|
||||||
std::mutex mutex_results;
|
std::mutex mutex_results;
|
||||||
|
|
||||||
~llama_server_context()
|
~llama_server_context()
|
||||||
|
@ -609,6 +630,11 @@ struct llama_server_context
|
||||||
|
|
||||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
|
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
|
||||||
{
|
{
|
||||||
|
// TODO: currently, we tokenize using special tokens by default
|
||||||
|
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
|
||||||
|
// but it's better compared to completely ignoring ChatML and other chat templates
|
||||||
|
const bool TMP_FORCE_SPECIAL = true;
|
||||||
|
|
||||||
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
||||||
// or the first element of the json_prompt array is a string.
|
// or the first element of the json_prompt array is a string.
|
||||||
std::vector<llama_token> prompt_tokens;
|
std::vector<llama_token> prompt_tokens;
|
||||||
|
@ -624,12 +650,12 @@ struct llama_server_context
|
||||||
std::vector<llama_token> p;
|
std::vector<llama_token> p;
|
||||||
if (first)
|
if (first)
|
||||||
{
|
{
|
||||||
p = ::llama_tokenize(ctx, s, add_bos);
|
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||||
first = false;
|
first = false;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
p = ::llama_tokenize(ctx, s, false);
|
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
||||||
}
|
}
|
||||||
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
||||||
}
|
}
|
||||||
|
@ -646,7 +672,7 @@ struct llama_server_context
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
auto s = json_prompt.template get<std::string>();
|
auto s = json_prompt.template get<std::string>();
|
||||||
prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
|
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
return prompt_tokens;
|
return prompt_tokens;
|
||||||
|
@ -677,6 +703,14 @@ struct llama_server_context
|
||||||
slot_params default_params;
|
slot_params default_params;
|
||||||
llama_sampling_params default_sparams;
|
llama_sampling_params default_sparams;
|
||||||
|
|
||||||
|
if (data.count("__oaicompat") != 0) {
|
||||||
|
slot->oaicompat = true;
|
||||||
|
slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
|
||||||
|
} else {
|
||||||
|
slot->oaicompat = false;
|
||||||
|
slot->oaicompat_model = "";
|
||||||
|
}
|
||||||
|
|
||||||
slot->params.stream = json_value(data, "stream", false);
|
slot->params.stream = json_value(data, "stream", false);
|
||||||
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||||
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
||||||
|
@ -1090,16 +1124,40 @@ struct llama_server_context
|
||||||
return slot.images.size() > 0;
|
return slot.images.size() > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void send_error(int id, std::string error)
|
void send_error(task_server& task, std::string error)
|
||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> lock(mutex_results);
|
std::lock_guard<std::mutex> lock(mutex_results);
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = id;
|
res.id = task.id;
|
||||||
|
res.multitask_id = task.multitask_id;
|
||||||
|
res.stop = false;
|
||||||
res.error = true;
|
res.error = true;
|
||||||
res.result_json = { { "content", error } };
|
res.result_json = { { "content", error } };
|
||||||
queue_results.push_back(res);
|
queue_results.push_back(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void add_multi_task(int id, std::vector<int>& sub_ids)
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||||
|
task_multi multi;
|
||||||
|
multi.id = id;
|
||||||
|
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
||||||
|
queue_multitasks.push_back(multi);
|
||||||
|
}
|
||||||
|
|
||||||
|
void update_multi_task(int multitask_id, int subtask_id, task_result& result)
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||||
|
for (auto& multitask : queue_multitasks)
|
||||||
|
{
|
||||||
|
if (multitask.id == multitask_id)
|
||||||
|
{
|
||||||
|
multitask.subtasks_remaining.erase(subtask_id);
|
||||||
|
multitask.results.push_back(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
json get_model_props()
|
json get_model_props()
|
||||||
{
|
{
|
||||||
return get_formated_generation(slots[0]);
|
return get_formated_generation(slots[0]);
|
||||||
|
@ -1144,6 +1202,7 @@ struct llama_server_context
|
||||||
std::lock_guard<std::mutex> lock(mutex_results);
|
std::lock_guard<std::mutex> lock(mutex_results);
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = slot.task_id;
|
res.id = slot.task_id;
|
||||||
|
res.multitask_id = slot.multitask_id;
|
||||||
res.error = false;
|
res.error = false;
|
||||||
res.stop = false;
|
res.stop = false;
|
||||||
|
|
||||||
|
@ -1169,6 +1228,12 @@ struct llama_server_context
|
||||||
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
|
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (slot.oaicompat)
|
||||||
|
{
|
||||||
|
res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
|
||||||
|
res.result_json["model"] = slot.oaicompat_model;
|
||||||
|
}
|
||||||
|
|
||||||
queue_results.push_back(res);
|
queue_results.push_back(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1177,6 +1242,7 @@ struct llama_server_context
|
||||||
std::lock_guard<std::mutex> lock(mutex_results);
|
std::lock_guard<std::mutex> lock(mutex_results);
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = slot.task_id;
|
res.id = slot.task_id;
|
||||||
|
res.multitask_id = slot.multitask_id;
|
||||||
res.error = false;
|
res.error = false;
|
||||||
res.stop = true;
|
res.stop = true;
|
||||||
|
|
||||||
|
@ -1216,6 +1282,18 @@ struct llama_server_context
|
||||||
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
|
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (slot.oaicompat)
|
||||||
|
{
|
||||||
|
res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
|
||||||
|
res.result_json["model"] = slot.oaicompat_model;
|
||||||
|
}
|
||||||
|
|
||||||
|
// parent multitask, if any, needs to be updated
|
||||||
|
if (slot.multitask_id != -1)
|
||||||
|
{
|
||||||
|
update_multi_task(slot.multitask_id, slot.task_id, res);
|
||||||
|
}
|
||||||
|
|
||||||
queue_results.push_back(res);
|
queue_results.push_back(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1224,6 +1302,7 @@ struct llama_server_context
|
||||||
std::lock_guard<std::mutex> lock(mutex_results);
|
std::lock_guard<std::mutex> lock(mutex_results);
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = slot.task_id;
|
res.id = slot.task_id;
|
||||||
|
res.multitask_id = slot.multitask_id;
|
||||||
res.error = false;
|
res.error = false;
|
||||||
res.stop = true;
|
res.stop = true;
|
||||||
|
|
||||||
|
@ -1250,15 +1329,26 @@ struct llama_server_context
|
||||||
queue_results.push_back(res);
|
queue_results.push_back(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
int request_completion(json data, bool infill, bool embedding)
|
int request_completion(json data, bool infill, bool embedding, int multitask_id)
|
||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||||
task_server task;
|
task_server task;
|
||||||
task.id = id_gen++;
|
task.id = id_gen++;
|
||||||
task.data = data;
|
task.target_id = 0;
|
||||||
|
task.data = std::move(data);
|
||||||
task.infill_mode = infill;
|
task.infill_mode = infill;
|
||||||
task.embedding_mode = embedding;
|
task.embedding_mode = embedding;
|
||||||
task.type = COMPLETION_TASK;
|
task.type = COMPLETION_TASK;
|
||||||
|
task.multitask_id = multitask_id;
|
||||||
|
|
||||||
|
// when a completion task's prompt array is not a singleton, we split it into multiple requests
|
||||||
|
if (task.data.at("prompt").size() > 1)
|
||||||
|
{
|
||||||
|
lock.unlock(); // entering new func scope
|
||||||
|
return split_multiprompt_task(task);
|
||||||
|
}
|
||||||
|
|
||||||
|
// otherwise, it's a single-prompt task, we actually queue it
|
||||||
queue_tasks.push_back(task);
|
queue_tasks.push_back(task);
|
||||||
return task.id;
|
return task.id;
|
||||||
}
|
}
|
||||||
|
@ -1277,8 +1367,17 @@ struct llama_server_context
|
||||||
|
|
||||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||||
{
|
{
|
||||||
|
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
||||||
|
if (queue_results[i].multitask_id == task_id)
|
||||||
|
{
|
||||||
|
update_multi_task(task_id, queue_results[i].id, queue_results[i]);
|
||||||
|
queue_results.erase(queue_results.begin() + i);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (queue_results[i].id == task_id)
|
if (queue_results[i].id == task_id)
|
||||||
{
|
{
|
||||||
|
assert(queue_results[i].multitask_id == -1);
|
||||||
task_result res = queue_results[i];
|
task_result res = queue_results[i];
|
||||||
queue_results.erase(queue_results.begin() + i);
|
queue_results.erase(queue_results.begin() + i);
|
||||||
return res;
|
return res;
|
||||||
|
@ -1368,6 +1467,27 @@ struct llama_server_context
|
||||||
queue_tasks.push_back(task);
|
queue_tasks.push_back(task);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int split_multiprompt_task(task_server& multiprompt_task)
|
||||||
|
{
|
||||||
|
int prompt_count = multiprompt_task.data.at("prompt").size();
|
||||||
|
assert(prompt_count > 1);
|
||||||
|
|
||||||
|
int multitask_id = id_gen++;
|
||||||
|
std::vector<int> subtask_ids(prompt_count);
|
||||||
|
for (int i = 0; i < prompt_count; i++)
|
||||||
|
{
|
||||||
|
json subtask_data = multiprompt_task.data;
|
||||||
|
subtask_data["prompt"] = subtask_data["prompt"][i];
|
||||||
|
|
||||||
|
// subtasks inherit everything else (infill mode, embedding mode, etc.)
|
||||||
|
subtask_ids[i] = request_completion(subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
// queue up the multitask so we can track its subtask progression
|
||||||
|
add_multi_task(multitask_id, subtask_ids);
|
||||||
|
return multitask_id;
|
||||||
|
}
|
||||||
|
|
||||||
void process_tasks()
|
void process_tasks()
|
||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||||
|
@ -1383,7 +1503,7 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
LOG_TEE("slot unavailable\n");
|
LOG_TEE("slot unavailable\n");
|
||||||
// send error result
|
// send error result
|
||||||
send_error(task.id, "slot unavailable");
|
send_error(task, "slot unavailable");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1397,11 +1517,12 @@ struct llama_server_context
|
||||||
slot->infill = task.infill_mode;
|
slot->infill = task.infill_mode;
|
||||||
slot->embedding = task.embedding_mode;
|
slot->embedding = task.embedding_mode;
|
||||||
slot->task_id = task.id;
|
slot->task_id = task.id;
|
||||||
|
slot->multitask_id = task.multitask_id;
|
||||||
|
|
||||||
if (!launch_slot_with_data(slot, task.data))
|
if (!launch_slot_with_data(slot, task.data))
|
||||||
{
|
{
|
||||||
// send error result
|
// send error result
|
||||||
send_error(task.id, "internal_error");
|
send_error(task, "internal_error");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
@ -1417,6 +1538,38 @@ struct llama_server_context
|
||||||
} break;
|
} break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
|
||||||
|
auto queue_iterator = queue_multitasks.begin();
|
||||||
|
while (queue_iterator != queue_multitasks.end())
|
||||||
|
{
|
||||||
|
if (queue_iterator->subtasks_remaining.empty())
|
||||||
|
{
|
||||||
|
// all subtasks done == multitask is done
|
||||||
|
task_result aggregate_result;
|
||||||
|
aggregate_result.id = queue_iterator->id;
|
||||||
|
aggregate_result.stop = true;
|
||||||
|
aggregate_result.error = false;
|
||||||
|
|
||||||
|
// collect json results into one json result
|
||||||
|
std::vector<json> result_jsons;
|
||||||
|
for (auto& subres : queue_iterator->results)
|
||||||
|
{
|
||||||
|
result_jsons.push_back(subres.result_json);
|
||||||
|
aggregate_result.error = aggregate_result.error && subres.error;
|
||||||
|
}
|
||||||
|
aggregate_result.result_json = json{ "results", result_jsons };
|
||||||
|
|
||||||
|
std::lock_guard<std::mutex> lock(mutex_results);
|
||||||
|
queue_results.push_back(aggregate_result);
|
||||||
|
|
||||||
|
queue_iterator = queue_multitasks.erase(queue_iterator);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
++queue_iterator;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool update_slots() {
|
bool update_slots() {
|
||||||
|
@ -1808,6 +1961,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||||
printf(" -spf FNAME, --system-prompt-file FNAME\n");
|
printf(" -spf FNAME, --system-prompt-file FNAME\n");
|
||||||
printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||||
|
printf(" --log-disable disables logging to a file.\n");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1954,10 +2108,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
}
|
}
|
||||||
params.yarn_beta_slow = std::stof(argv[i]);
|
params.yarn_beta_slow = std::stof(argv[i]);
|
||||||
}
|
}
|
||||||
else if (arg == "--memory-f32" || arg == "--memory_f32")
|
|
||||||
{
|
|
||||||
params.memory_f16 = false;
|
|
||||||
}
|
|
||||||
else if (arg == "--threads" || arg == "-t")
|
else if (arg == "--threads" || arg == "-t")
|
||||||
{
|
{
|
||||||
if (++i >= argc)
|
if (++i >= argc)
|
||||||
|
@ -2162,6 +2312,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
}
|
}
|
||||||
params.mmproj = argv[i];
|
params.mmproj = argv[i];
|
||||||
}
|
}
|
||||||
|
else if (arg == "--log-disable")
|
||||||
|
{
|
||||||
|
log_set_target(stdout);
|
||||||
|
LOG_INFO("logging to file is disabled.", {});
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||||
|
@ -2178,6 +2333,232 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static std::string random_string()
|
||||||
|
{
|
||||||
|
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
|
||||||
|
|
||||||
|
std::random_device rd;
|
||||||
|
std::mt19937 generator(rd());
|
||||||
|
|
||||||
|
std::string result(32, ' ');
|
||||||
|
|
||||||
|
for (int i = 0; i < 32; ++i) {
|
||||||
|
result[i] = str[generator() % str.size()];
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string gen_chatcmplid()
|
||||||
|
{
|
||||||
|
std::stringstream chatcmplid;
|
||||||
|
chatcmplid << "chatcmpl-" << random_string();
|
||||||
|
return chatcmplid.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string format_chatml(std::vector<json> messages)
|
||||||
|
{
|
||||||
|
std::ostringstream chatml_msgs;
|
||||||
|
|
||||||
|
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
||||||
|
chatml_msgs << "<|im_start|>"
|
||||||
|
<< json_value(*it, "role", std::string("user")) << '\n';
|
||||||
|
chatml_msgs << json_value(*it, "content", std::string(""))
|
||||||
|
<< "<|im_end|>\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
chatml_msgs << "<|im_start|>assistant" << '\n';
|
||||||
|
|
||||||
|
return chatml_msgs.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* llama.cpp completion api semantics */
|
||||||
|
json oaicompat_completion_params_parse(
|
||||||
|
const json &body /* openai api json semantics */)
|
||||||
|
{
|
||||||
|
json llama_params;
|
||||||
|
|
||||||
|
llama_params["__oaicompat"] = true;
|
||||||
|
|
||||||
|
// Map OpenAI parameters to llama.cpp parameters
|
||||||
|
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
|
||||||
|
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
||||||
|
llama_params["temperature"] = json_value(body, "temperature", 0.8);
|
||||||
|
llama_params["top_k"] = json_value(body, "top_k", 40);
|
||||||
|
llama_params["top_p"] = json_value(body, "top_p", 0.95);
|
||||||
|
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
|
||||||
|
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
|
||||||
|
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
|
||||||
|
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
|
||||||
|
llama_params["seed"] = json_value(body, "seed", 0);
|
||||||
|
llama_params["stream"] = json_value(body, "stream", false);
|
||||||
|
llama_params["mirostat"] = json_value(body, "mirostat", false);
|
||||||
|
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0);
|
||||||
|
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0);
|
||||||
|
llama_params["penalize_nl"] = json_value(body, "penalize_nl", false);
|
||||||
|
llama_params["typical_p"] = json_value(body, "typical_p", 0.0);
|
||||||
|
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0);
|
||||||
|
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
|
||||||
|
llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
|
||||||
|
|
||||||
|
if (llama_params.count("grammar") != 0) {
|
||||||
|
llama_params["grammar"] = json_value(body, "grammar", json::object());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle 'stop' field
|
||||||
|
if (body.contains("stop") && body["stop"].is_string()) {
|
||||||
|
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
|
||||||
|
} else {
|
||||||
|
llama_params["stop"] = json_value(body, "stop", json::array());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure there is ChatML-specific end sequence among stop words
|
||||||
|
llama_params["stop"].push_back("<|im_end|>");
|
||||||
|
|
||||||
|
return llama_params;
|
||||||
|
}
|
||||||
|
|
||||||
|
static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
|
||||||
|
{
|
||||||
|
json result = response.result_json;
|
||||||
|
|
||||||
|
bool stopped_word = result.count("stopped_word") != 0;
|
||||||
|
bool stopped_eos = json_value(result, "stopped_eos", false);
|
||||||
|
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
|
||||||
|
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
|
||||||
|
std::string content = json_value(result, "content", std::string(""));
|
||||||
|
|
||||||
|
std::string finish_reason = "length";
|
||||||
|
if (stopped_word || stopped_eos) {
|
||||||
|
finish_reason = "stop";
|
||||||
|
}
|
||||||
|
|
||||||
|
json choices =
|
||||||
|
streaming ? json::array({json{{"finish_reason", finish_reason},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json::object()}}})
|
||||||
|
: json::array({json{{"finish_reason", finish_reason},
|
||||||
|
{"index", 0},
|
||||||
|
{"message", json{{"content", content},
|
||||||
|
{"role", "assistant"}}}}});
|
||||||
|
|
||||||
|
std::time_t t = std::time(0);
|
||||||
|
|
||||||
|
json res =
|
||||||
|
json{{"choices", choices},
|
||||||
|
{"created", t},
|
||||||
|
{"model",
|
||||||
|
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
|
||||||
|
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
|
||||||
|
{"usage",
|
||||||
|
json{{"completion_tokens", num_tokens_predicted},
|
||||||
|
{"prompt_tokens", num_prompt_tokens},
|
||||||
|
{"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
|
||||||
|
{"id", gen_chatcmplid()}};
|
||||||
|
|
||||||
|
if (server_verbose) {
|
||||||
|
res["__verbose"] = result;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (result.contains("completion_probabilities")) {
|
||||||
|
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
// return value is vector as there is one case where we might need to generate two responses
|
||||||
|
static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
|
||||||
|
json result = response.result_json;
|
||||||
|
|
||||||
|
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
|
||||||
|
return std::vector<json>({response.result_json});
|
||||||
|
}
|
||||||
|
|
||||||
|
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
|
||||||
|
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
|
||||||
|
|
||||||
|
bool stopped_word = json_value(result, "stopped_word", false);
|
||||||
|
bool stopped_eos = json_value(result, "stopped_eos", false);
|
||||||
|
bool stopped_limit = json_value(result, "stopped_limit", false);
|
||||||
|
std::string content = json_value(result, "content", std::string(""));
|
||||||
|
|
||||||
|
std::string finish_reason;
|
||||||
|
if (stopped_word || stopped_eos) {
|
||||||
|
finish_reason = "stop";
|
||||||
|
}
|
||||||
|
if (stopped_limit) {
|
||||||
|
finish_reason = "length";
|
||||||
|
}
|
||||||
|
|
||||||
|
std::time_t t = std::time(0);
|
||||||
|
|
||||||
|
json choices;
|
||||||
|
|
||||||
|
if (!finish_reason.empty()) {
|
||||||
|
choices = json::array({json{{"finish_reason", finish_reason},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json::object()}}});
|
||||||
|
} else {
|
||||||
|
if (first) {
|
||||||
|
if (content.empty()) {
|
||||||
|
choices = json::array({json{{"finish_reason", nullptr},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json{{"role", "assistant"}}}}});
|
||||||
|
} else {
|
||||||
|
// We have to send this as two updates to conform to openai behavior
|
||||||
|
json initial_ret = json{{"choices", json::array({json{
|
||||||
|
{"finish_reason", nullptr},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json{
|
||||||
|
{"role", "assistant"}
|
||||||
|
}}}})},
|
||||||
|
{"created", t},
|
||||||
|
{"id", gen_chatcmplid()},
|
||||||
|
{"model", modelname},
|
||||||
|
{"object", "chat.completion.chunk"}};
|
||||||
|
|
||||||
|
json second_ret = json{
|
||||||
|
{"choices", json::array({json{{"finish_reason", nullptr},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json{
|
||||||
|
{"content", content}}}
|
||||||
|
}})},
|
||||||
|
{"created", t},
|
||||||
|
{"id", gen_chatcmplid()},
|
||||||
|
{"model", modelname},
|
||||||
|
{"object", "chat.completion.chunk"}};
|
||||||
|
|
||||||
|
return std::vector<json>({initial_ret, second_ret});
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Some idiosyncrasy in task processing logic makes several trailing calls
|
||||||
|
// with empty content, we ignore these at the calee site.
|
||||||
|
if (content.empty()) {
|
||||||
|
return std::vector<json>({json::object()});
|
||||||
|
}
|
||||||
|
|
||||||
|
choices = json::array({json{
|
||||||
|
{"finish_reason", nullptr},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta",
|
||||||
|
json{
|
||||||
|
{"content", content},
|
||||||
|
}},
|
||||||
|
}});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
json ret = json{{"choices", choices},
|
||||||
|
{"created", t},
|
||||||
|
{"id", gen_chatcmplid()},
|
||||||
|
{"model", modelname},
|
||||||
|
{"object", "chat.completion.chunk"}};
|
||||||
|
|
||||||
|
return std::vector<json>({ret});
|
||||||
|
}
|
||||||
|
|
||||||
static json format_partial_response(
|
static json format_partial_response(
|
||||||
llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
|
llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
|
||||||
) {
|
) {
|
||||||
|
@ -2333,7 +2714,7 @@ int main(int argc, char **argv)
|
||||||
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
|
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||||
{
|
{
|
||||||
json data = json::parse(req.body);
|
json data = json::parse(req.body);
|
||||||
const int task_id = llama.request_completion(data, false, false);
|
const int task_id = llama.request_completion(data, false, false, -1);
|
||||||
if (!json_value(data, "stream", false)) {
|
if (!json_value(data, "stream", false)) {
|
||||||
std::string completion_text;
|
std::string completion_text;
|
||||||
task_result result = llama.next_result(task_id);
|
task_result result = llama.next_result(task_id);
|
||||||
|
@ -2354,9 +2735,9 @@ int main(int argc, char **argv)
|
||||||
task_result result = llama.next_result(task_id);
|
task_result result = llama.next_result(task_id);
|
||||||
if (!result.error) {
|
if (!result.error) {
|
||||||
const std::string str =
|
const std::string str =
|
||||||
"data: " +
|
"data: " +
|
||||||
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
|
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||||
"\n\n";
|
"\n\n";
|
||||||
LOG_VERBOSE("data stream", {
|
LOG_VERBOSE("data stream", {
|
||||||
{ "to_send", str }
|
{ "to_send", str }
|
||||||
});
|
});
|
||||||
|
@ -2368,6 +2749,17 @@ int main(int argc, char **argv)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
const std::string str =
|
||||||
|
"error: " +
|
||||||
|
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||||
|
"\n\n";
|
||||||
|
LOG_VERBOSE("data stream", {
|
||||||
|
{ "to_send", str }
|
||||||
|
});
|
||||||
|
if (!sink.write(str.c_str(), str.size()))
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2385,10 +2777,102 @@ int main(int argc, char **argv)
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
svr.Get("/v1/models", [¶ms](const httplib::Request&, httplib::Response& res)
|
||||||
|
{
|
||||||
|
std::time_t t = std::time(0);
|
||||||
|
|
||||||
|
json models = {
|
||||||
|
{"object", "list"},
|
||||||
|
{"data", {
|
||||||
|
{
|
||||||
|
{"id", params.model_alias},
|
||||||
|
{"object", "model"},
|
||||||
|
{"created", t},
|
||||||
|
{"owned_by", "llamacpp"}
|
||||||
|
},
|
||||||
|
}}
|
||||||
|
};
|
||||||
|
|
||||||
|
res.set_content(models.dump(), "application/json");
|
||||||
|
});
|
||||||
|
|
||||||
|
// TODO: add mount point without "/v1" prefix -- how?
|
||||||
|
svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||||
|
{
|
||||||
|
json data = oaicompat_completion_params_parse(json::parse(req.body));
|
||||||
|
|
||||||
|
const int task_id = llama.request_completion(data, false, false, -1);
|
||||||
|
|
||||||
|
if (!json_value(data, "stream", false)) {
|
||||||
|
std::string completion_text;
|
||||||
|
task_result result = llama.next_result(task_id);
|
||||||
|
|
||||||
|
if (!result.error && result.stop) {
|
||||||
|
json oaicompat_result = format_final_response_oaicompat(data, result);
|
||||||
|
|
||||||
|
res.set_content(oaicompat_result.dump(-1, ' ', false,
|
||||||
|
json::error_handler_t::replace),
|
||||||
|
"application/json");
|
||||||
|
} else {
|
||||||
|
res.status = 500;
|
||||||
|
res.set_content(result.result_json["content"], "text/plain");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
|
||||||
|
while (true) {
|
||||||
|
task_result llama_result = llama.next_result(task_id);
|
||||||
|
if (!llama_result.error) {
|
||||||
|
std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
|
||||||
|
|
||||||
|
for (auto it = result_array.begin(); it != result_array.end(); ++it)
|
||||||
|
{
|
||||||
|
if (!it->empty()) {
|
||||||
|
const std::string str =
|
||||||
|
"data: " +
|
||||||
|
it->dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||||
|
"\n\n";
|
||||||
|
LOG_VERBOSE("data stream", {{"to_send", str}});
|
||||||
|
if (!sink.write(str.c_str(), str.size())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (llama_result.stop) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const std::string str =
|
||||||
|
"error: " +
|
||||||
|
llama_result.result_json.dump(-1, ' ', false,
|
||||||
|
json::error_handler_t::replace) +
|
||||||
|
"\n\n";
|
||||||
|
LOG_VERBOSE("data stream", {{"to_send", str}});
|
||||||
|
if (!sink.write(str.c_str(), str.size())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sink.done();
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto on_complete = [task_id, &llama](bool) {
|
||||||
|
// cancel request
|
||||||
|
llama.request_cancel(task_id);
|
||||||
|
};
|
||||||
|
|
||||||
|
res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
|
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||||
{
|
{
|
||||||
json data = json::parse(req.body);
|
json data = json::parse(req.body);
|
||||||
const int task_id = llama.request_completion(data, true, false);
|
const int task_id = llama.request_completion(data, true, false, -1);
|
||||||
if (!json_value(data, "stream", false)) {
|
if (!json_value(data, "stream", false)) {
|
||||||
std::string completion_text;
|
std::string completion_text;
|
||||||
task_result result = llama.next_result(task_id);
|
task_result result = llama.next_result(task_id);
|
||||||
|
@ -2492,7 +2976,7 @@ int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
prompt = "";
|
prompt = "";
|
||||||
}
|
}
|
||||||
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
|
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
|
||||||
task_result result = llama.next_result(task_id);
|
task_result result = llama.next_result(task_id);
|
||||||
return res.set_content(result.result_json.dump(), "application/json");
|
return res.set_content(result.result_json.dump(), "application/json");
|
||||||
});
|
});
|
||||||
|
|
|
@ -75,7 +75,7 @@ int main(int argc, char ** argv) {
|
||||||
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
||||||
if (n_kv_req > n_ctx) {
|
if (n_kv_req > n_ctx) {
|
||||||
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
|
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
|
||||||
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
|
LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
8
examples/speculative/README.md
Normal file
8
examples/speculative/README.md
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
# llama.cpp/examples/speculative
|
||||||
|
|
||||||
|
Demonstartion of speculative decoding and tree-based speculative decoding techniques
|
||||||
|
|
||||||
|
More info:
|
||||||
|
|
||||||
|
- https://github.com/ggerganov/llama.cpp/pull/2926
|
||||||
|
- https://github.com/ggerganov/llama.cpp/pull/3624
|
|
@ -94,9 +94,22 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// tokenize the prompt
|
|
||||||
|
// Tokenize the prompt
|
||||||
|
const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
|
||||||
|
LOG("add_bos tgt: %d\n", add_bos_tgt);
|
||||||
|
|
||||||
|
const bool add_bos_dft = llama_should_add_bos_token(model_dft);
|
||||||
|
LOG("add_bos dft: %d\n", add_bos_dft);
|
||||||
|
|
||||||
|
if (add_bos_tgt != add_bos_dft) {
|
||||||
|
fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
|
||||||
|
fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
|
inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
|
||||||
|
|
||||||
const int max_context_size = llama_n_ctx(ctx_tgt);
|
const int max_context_size = llama_n_ctx(ctx_tgt);
|
||||||
const int max_tokens_list_size = max_context_size - 4;
|
const int max_tokens_list_size = max_context_size - 4;
|
||||||
|
@ -190,8 +203,9 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
|
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
|
||||||
|
|
||||||
printf("%s", token_str.c_str());
|
if (!params.use_color) {
|
||||||
fflush(stdout);
|
printf("%s", token_str.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
if (id == llama_token_eos(model_tgt)) {
|
if (id == llama_token_eos(model_tgt)) {
|
||||||
has_eos = true;
|
has_eos = true;
|
||||||
|
@ -223,10 +237,18 @@ int main(int argc, char ** argv) {
|
||||||
++n_past_tgt;
|
++n_past_tgt;
|
||||||
++n_past_dft;
|
++n_past_dft;
|
||||||
++i_dft;
|
++i_dft;
|
||||||
|
if (params.use_color) {
|
||||||
|
// Color token according to its origin sequence
|
||||||
|
printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (params.use_color) {
|
||||||
|
printf("%s", token_str.c_str());
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
|
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,7 @@ int main(int argc, char ** argv) {
|
||||||
llama_context_params ctx_params = llama_context_default_params();
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
const bool add_bos = true;
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
|
|
||||||
std::vector<llama_token> tokens;
|
std::vector<llama_token> tokens;
|
||||||
|
|
||||||
|
|
|
@ -1295,10 +1295,6 @@ int main(int argc, char ** argv) {
|
||||||
opt_cb_data.last_save_iter = opt->iter;
|
opt_cb_data.last_save_iter = opt->iter;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (alloc) {
|
|
||||||
ggml_allocr_free(alloc);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_free(opt->ctx);
|
ggml_free(opt->ctx);
|
||||||
free_train_state(train);
|
free_train_state(train);
|
||||||
ggml_free(model.ctx);
|
ggml_free(model.ctx);
|
||||||
|
|
51
ggml-alloc.c
51
ggml-alloc.c
|
@ -137,7 +137,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
add_allocated_tensor(alloc, tensor);
|
add_allocated_tensor(alloc, tensor);
|
||||||
size_t cur_max = (char*)addr - (char*)alloc->data + size;
|
size_t cur_max = (char*)addr - (char*)alloc->base + size;
|
||||||
if (cur_max > alloc->max_size) {
|
if (cur_max > alloc->max_size) {
|
||||||
printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
||||||
for (int i = 0; i < 1024; i++) {
|
for (int i = 0; i < 1024; i++) {
|
||||||
|
@ -168,10 +168,6 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
|
||||||
size = aligned_offset(NULL, size, alloc->alignment);
|
size = aligned_offset(NULL, size, alloc->alignment);
|
||||||
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
|
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
|
||||||
|
|
||||||
if (!alloc->measure) {
|
|
||||||
ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
remove_allocated_tensor(alloc, tensor);
|
remove_allocated_tensor(alloc, tensor);
|
||||||
#endif
|
#endif
|
||||||
|
@ -237,7 +233,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
|
ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
|
||||||
struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
|
struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
|
||||||
|
|
||||||
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
|
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
|
||||||
|
|
||||||
|
@ -449,7 +445,6 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
|
||||||
static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
|
static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
|
||||||
ggml_tallocr_t alloc = node_tallocr(galloc, view);
|
ggml_tallocr_t alloc = node_tallocr(galloc, view);
|
||||||
|
|
||||||
//printf("init_view: %s from src %s\n", view->name, view->view_src->name);
|
|
||||||
GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
|
GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
|
||||||
if (update_backend) {
|
if (update_backend) {
|
||||||
view->backend = view->view_src->backend;
|
view->backend = view->view_src->backend;
|
||||||
|
@ -459,7 +454,7 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
|
||||||
|
|
||||||
// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
|
// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
|
||||||
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
|
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
|
||||||
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
|
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
|
||||||
|
|
||||||
if (!alloc->measure) {
|
if (!alloc->measure) {
|
||||||
ggml_backend_buffer_init_tensor(alloc->buffer, view);
|
ggml_backend_buffer_init_tensor(alloc->buffer, view);
|
||||||
|
@ -765,3 +760,43 @@ size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
|
||||||
size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
|
size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
|
||||||
return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
|
return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// utils
|
||||||
|
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
||||||
|
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
|
||||||
|
|
||||||
|
size_t alignment = ggml_backend_buft_get_alignment(buft);
|
||||||
|
|
||||||
|
size_t nbytes = 0;
|
||||||
|
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||||
|
if (t->data == NULL && t->view_src == NULL) {
|
||||||
|
nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nbytes == 0) {
|
||||||
|
fprintf(stderr, "%s: no tensors to allocate\n", __func__);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
|
||||||
|
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
|
||||||
|
|
||||||
|
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||||
|
if (t->data == NULL) {
|
||||||
|
if (t->view_src == NULL) {
|
||||||
|
ggml_tallocr_alloc(tallocr, t);
|
||||||
|
} else {
|
||||||
|
ggml_backend_view_init(buffer, t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tallocr_free(tallocr);
|
||||||
|
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
|
||||||
|
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
|
||||||
|
}
|
||||||
|
|
|
@ -8,6 +8,7 @@ extern "C" {
|
||||||
|
|
||||||
struct ggml_backend;
|
struct ggml_backend;
|
||||||
struct ggml_backend_buffer;
|
struct ggml_backend_buffer;
|
||||||
|
struct ggml_backend_buffer_type;
|
||||||
|
|
||||||
//
|
//
|
||||||
// Legacy API
|
// Legacy API
|
||||||
|
@ -80,6 +81,12 @@ GGML_API void ggml_gallocr_alloc_graph_n(
|
||||||
struct ggml_hash_set hash_set,
|
struct ggml_hash_set hash_set,
|
||||||
ggml_tallocr_t * hash_node_talloc);
|
ggml_tallocr_t * hash_node_talloc);
|
||||||
|
|
||||||
|
|
||||||
|
// Utils
|
||||||
|
// Create a buffer and allocate all the tensors in a ggml_context
|
||||||
|
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
|
||||||
|
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -12,31 +12,50 @@ extern "C" {
|
||||||
// Backend buffer
|
// Backend buffer
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// buffer type
|
||||||
|
typedef void * ggml_backend_buffer_type_context_t;
|
||||||
|
|
||||||
|
struct ggml_backend_buffer_type_i {
|
||||||
|
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
||||||
|
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
||||||
|
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
||||||
|
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_backend_buffer_type {
|
||||||
|
struct ggml_backend_buffer_type_i iface;
|
||||||
|
ggml_backend_buffer_type_context_t context;
|
||||||
|
};
|
||||||
|
|
||||||
|
// buffer
|
||||||
typedef void * ggml_backend_buffer_context_t;
|
typedef void * ggml_backend_buffer_context_t;
|
||||||
|
|
||||||
struct ggml_backend_buffer_i {
|
struct ggml_backend_buffer_i {
|
||||||
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
void (*free_buffer)(ggml_backend_buffer_t buffer);
|
||||||
void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer
|
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
||||||
size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
|
void * (*get_base) (ggml_backend_buffer_t buffer);
|
||||||
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
|
void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
|
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
|
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
|
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
|
||||||
|
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
|
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_buffer {
|
struct ggml_backend_buffer {
|
||||||
struct ggml_backend_buffer_i iface;
|
struct ggml_backend_buffer_i iface;
|
||||||
|
ggml_backend_buffer_type_t buft;
|
||||||
ggml_backend_t backend;
|
|
||||||
ggml_backend_buffer_context_t context;
|
ggml_backend_buffer_context_t context;
|
||||||
|
|
||||||
size_t size;
|
size_t size;
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
|
ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||||
struct ggml_backend * backend,
|
ggml_backend_buffer_type_t buft,
|
||||||
struct ggml_backend_buffer_i iface,
|
struct ggml_backend_buffer_i iface,
|
||||||
ggml_backend_buffer_context_t context,
|
ggml_backend_buffer_context_t context,
|
||||||
size_t size);
|
size_t size);
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Backend
|
// Backend
|
||||||
//
|
//
|
||||||
|
@ -49,20 +68,17 @@ extern "C" {
|
||||||
void (*free)(ggml_backend_t backend);
|
void (*free)(ggml_backend_t backend);
|
||||||
|
|
||||||
// buffer allocation
|
// buffer allocation
|
||||||
ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
|
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
|
||||||
|
|
||||||
// get buffer alignment
|
// (optional) asynchroneous tensor data access
|
||||||
size_t (*get_alignment)(ggml_backend_t backend);
|
|
||||||
|
|
||||||
// tensor data access
|
|
||||||
// these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
|
|
||||||
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
void (*synchronize) (ggml_backend_t backend);
|
|
||||||
|
|
||||||
// (optional) copy tensor between different backends, allow for single-copy tranfers
|
// (optional) asynchroneous tensor copy
|
||||||
void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
|
|
||||||
|
void (*synchronize) (ggml_backend_t backend);
|
||||||
|
|
||||||
// compute graph with a plan
|
// compute graph with a plan
|
||||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
|
@ -82,6 +98,15 @@ extern "C" {
|
||||||
ggml_backend_context_t context;
|
ggml_backend_context_t context;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// Backend registry
|
||||||
|
//
|
||||||
|
|
||||||
|
typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
|
||||||
|
|
||||||
|
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
771
ggml-backend.c
771
ggml-backend.c
File diff suppressed because it is too large
Load diff
|
@ -7,41 +7,44 @@
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
|
||||||
|
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
||||||
|
typedef struct ggml_backend * ggml_backend_t;
|
||||||
|
typedef void * ggml_backend_graph_plan_t;
|
||||||
|
|
||||||
//
|
//
|
||||||
// Backend buffer
|
// Backend buffer
|
||||||
//
|
//
|
||||||
|
|
||||||
struct ggml_backend_buffer;
|
// buffer type
|
||||||
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
|
||||||
|
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||||
|
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||||
|
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
||||||
|
|
||||||
// backend buffer functions
|
// buffer
|
||||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
|
||||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
|
||||||
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
GGML_API void ggml_backend_buffer_free_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||||
|
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Backend
|
// Backend
|
||||||
//
|
//
|
||||||
|
|
||||||
struct ggml_backend;
|
|
||||||
typedef struct ggml_backend * ggml_backend_t;
|
|
||||||
typedef void * ggml_backend_graph_plan_t;
|
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
|
|
||||||
|
|
||||||
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
|
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
|
||||||
GGML_API void ggml_backend_free(ggml_backend_t backend);
|
GGML_API void ggml_backend_free(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
|
||||||
|
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
|
||||||
|
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
|
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
|
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
GGML_API void ggml_backend_tensor_set_async( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
|
||||||
GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
|
||||||
|
|
||||||
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
|
@ -57,6 +60,7 @@ extern "C" {
|
||||||
|
|
||||||
// tensor copy between different backends
|
// tensor copy between different backends
|
||||||
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
|
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
|
||||||
|
|
||||||
//
|
//
|
||||||
// CPU backend
|
// CPU backend
|
||||||
|
@ -68,8 +72,23 @@ extern "C" {
|
||||||
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
|
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
|
||||||
|
|
||||||
// Create a backend buffer from an existing pointer
|
// Create a backend buffer from an existing pointer
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
|
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
||||||
|
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Backend registry
|
||||||
|
//
|
||||||
|
|
||||||
|
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
|
||||||
|
|
||||||
|
GGML_API size_t ggml_backend_reg_get_count(void);
|
||||||
|
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
|
||||||
|
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
|
||||||
|
GGML_API const char * ggml_backend_reg_get_name(size_t i);
|
||||||
|
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
|
||||||
|
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Backend scheduler
|
// Backend scheduler
|
||||||
|
@ -131,6 +150,32 @@ extern "C" {
|
||||||
ggml_backend_sched_t sched,
|
ggml_backend_sched_t sched,
|
||||||
struct ggml_cgraph * graph);
|
struct ggml_cgraph * graph);
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// Utils
|
||||||
|
//
|
||||||
|
|
||||||
|
struct ggml_backend_graph_copy {
|
||||||
|
ggml_backend_buffer_t buffer;
|
||||||
|
struct ggml_context * ctx_allocated;
|
||||||
|
struct ggml_context * ctx_unallocated;
|
||||||
|
struct ggml_cgraph * graph;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Copy a graph to a different backend
|
||||||
|
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
|
||||||
|
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
|
||||||
|
|
||||||
|
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
|
||||||
|
|
||||||
|
// Compare the output of two backends
|
||||||
|
GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
|
||||||
|
|
||||||
|
// Tensor initialization
|
||||||
|
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
||||||
|
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
1720
ggml-cuda.cu
1720
ggml-cuda.cu
File diff suppressed because it is too large
Load diff
10
ggml-cuda.h
10
ggml-cuda.h
|
@ -49,7 +49,15 @@ GGML_API int ggml_cuda_get_device_count(void);
|
||||||
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
|
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
|
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
|
||||||
|
|
||||||
|
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||||
|
GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend);
|
||||||
|
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||||
|
|
||||||
|
// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -232,7 +232,7 @@ bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml
|
||||||
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
|
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
|
||||||
size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
|
size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
|
||||||
|
|
||||||
// returns GGML_HAHSHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
|
// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
|
||||||
size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
|
size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
|
||||||
|
|
||||||
// return index, asserts if table is full
|
// return index, asserts if table is full
|
||||||
|
|
|
@ -99,6 +99,12 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
||||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||||
|
|
||||||
|
// helper to check if the device supports a specific family
|
||||||
|
// ideally, the user code should be doing these checks
|
||||||
|
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
||||||
|
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
758
ggml-metal.m
758
ggml-metal.m
File diff suppressed because it is too large
Load diff
1157
ggml-metal.metal
1157
ggml-metal.metal
File diff suppressed because it is too large
Load diff
|
@ -1,20 +1,18 @@
|
||||||
|
#include "ggml.h"
|
||||||
#include "ggml-opencl.h"
|
#include "ggml-opencl.h"
|
||||||
|
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
#include <limits>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <limits>
|
|
||||||
|
|
||||||
#define CL_TARGET_OPENCL_VERSION 110
|
#define CL_TARGET_OPENCL_VERSION 110
|
||||||
#include <clblast.h>
|
#include <clblast.h>
|
||||||
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
520
ggml.c
520
ggml.c
|
@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
||||||
#define UNUSED GGML_UNUSED
|
#define UNUSED GGML_UNUSED
|
||||||
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
||||||
|
|
||||||
//
|
|
||||||
// tensor access macros
|
|
||||||
//
|
|
||||||
|
|
||||||
#define GGML_TENSOR_UNARY_OP_LOCALS \
|
|
||||||
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
|
||||||
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
|
||||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
|
||||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
|
||||||
|
|
||||||
#define GGML_TENSOR_BINARY_OP_LOCALS \
|
|
||||||
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
|
||||||
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
|
||||||
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
|
|
||||||
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
|
|
||||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
|
||||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE)
|
#if defined(GGML_USE_ACCELERATE)
|
||||||
#include <Accelerate/Accelerate.h>
|
#include <Accelerate/Accelerate.h>
|
||||||
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
||||||
|
@ -1615,6 +1597,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
"GROUP_NORM",
|
"GROUP_NORM",
|
||||||
|
|
||||||
"MUL_MAT",
|
"MUL_MAT",
|
||||||
|
"MUL_MAT_ID",
|
||||||
"OUT_PROD",
|
"OUT_PROD",
|
||||||
|
|
||||||
"SCALE",
|
"SCALE",
|
||||||
|
@ -1642,6 +1625,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
"POOL_1D",
|
"POOL_1D",
|
||||||
"POOL_2D",
|
"POOL_2D",
|
||||||
"UPSCALE",
|
"UPSCALE",
|
||||||
|
"ARGSORT",
|
||||||
|
|
||||||
"FLASH_ATTN",
|
"FLASH_ATTN",
|
||||||
"FLASH_FF",
|
"FLASH_FF",
|
||||||
|
@ -1668,7 +1652,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
"CROSS_ENTROPY_LOSS_BACK",
|
"CROSS_ENTROPY_LOSS_BACK",
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
|
static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
|
||||||
|
|
||||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"none",
|
"none",
|
||||||
|
@ -1697,6 +1681,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"group_norm(x)",
|
"group_norm(x)",
|
||||||
|
|
||||||
"X*Y",
|
"X*Y",
|
||||||
|
"X[i]*Y",
|
||||||
"X*Y",
|
"X*Y",
|
||||||
|
|
||||||
"x*v",
|
"x*v",
|
||||||
|
@ -1724,6 +1709,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"pool_1d(x)",
|
"pool_1d(x)",
|
||||||
"pool_2d(x)",
|
"pool_2d(x)",
|
||||||
"upscale(x)",
|
"upscale(x)",
|
||||||
|
"argsort(x)",
|
||||||
|
|
||||||
"flash_attn(x)",
|
"flash_attn(x)",
|
||||||
"flash_ff(x)",
|
"flash_ff(x)",
|
||||||
|
@ -1750,10 +1736,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"cross_entropy_loss_back(x,y)",
|
"cross_entropy_loss_back(x,y)",
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
|
static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
|
||||||
|
|
||||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||||
|
|
||||||
|
|
||||||
|
static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
||||||
|
"ABS",
|
||||||
|
"SGN",
|
||||||
|
"NEG",
|
||||||
|
"STEP",
|
||||||
|
"TANH",
|
||||||
|
"ELU",
|
||||||
|
"RELU",
|
||||||
|
"GELU",
|
||||||
|
"GELU_QUICK",
|
||||||
|
"SILU",
|
||||||
|
"LEAKY",
|
||||||
|
};
|
||||||
|
|
||||||
|
static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
|
||||||
|
|
||||||
|
|
||||||
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
||||||
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
||||||
|
|
||||||
|
@ -1773,6 +1777,7 @@ static void ggml_setup_op_has_task_pass(void) {
|
||||||
|
|
||||||
p[GGML_OP_ACC ] = true;
|
p[GGML_OP_ACC ] = true;
|
||||||
p[GGML_OP_MUL_MAT ] = true;
|
p[GGML_OP_MUL_MAT ] = true;
|
||||||
|
p[GGML_OP_MUL_MAT_ID ] = true;
|
||||||
p[GGML_OP_OUT_PROD ] = true;
|
p[GGML_OP_OUT_PROD ] = true;
|
||||||
p[GGML_OP_SET ] = true;
|
p[GGML_OP_SET ] = true;
|
||||||
p[GGML_OP_GET_ROWS_BACK ] = true;
|
p[GGML_OP_GET_ROWS_BACK ] = true;
|
||||||
|
@ -2025,6 +2030,20 @@ const char * ggml_op_symbol(enum ggml_op op) {
|
||||||
return GGML_OP_SYMBOL[op];
|
return GGML_OP_SYMBOL[op];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const char * ggml_unary_op_name(enum ggml_unary_op op) {
|
||||||
|
return GGML_UNARY_OP_NAME[op];
|
||||||
|
}
|
||||||
|
|
||||||
|
const char * ggml_op_desc(const struct ggml_tensor * t) {
|
||||||
|
if (t->op == GGML_OP_UNARY) {
|
||||||
|
enum ggml_unary_op uop = ggml_get_unary_op(t);
|
||||||
|
return ggml_unary_op_name(uop);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return ggml_op_name(t->op);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
||||||
return ggml_type_size(tensor->type);
|
return ggml_type_size(tensor->type);
|
||||||
}
|
}
|
||||||
|
@ -3158,9 +3177,7 @@ static struct ggml_tensor * ggml_add_impl(
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
bool inplace) {
|
bool inplace) {
|
||||||
// TODO: support less-strict constraint
|
GGML_ASSERT(ggml_can_repeat(b, a));
|
||||||
// GGML_ASSERT(ggml_can_repeat(b, a));
|
|
||||||
GGML_ASSERT(ggml_can_repeat_rows(b, a));
|
|
||||||
|
|
||||||
bool is_node = false;
|
bool is_node = false;
|
||||||
|
|
||||||
|
@ -3375,9 +3392,7 @@ static struct ggml_tensor * ggml_mul_impl(
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
bool inplace) {
|
bool inplace) {
|
||||||
// TODO: support less-strict constraint
|
GGML_ASSERT(ggml_can_repeat(b, a));
|
||||||
// GGML_ASSERT(ggml_can_repeat(b, a));
|
|
||||||
GGML_ASSERT(ggml_can_repeat_rows(b, a));
|
|
||||||
|
|
||||||
bool is_node = false;
|
bool is_node = false;
|
||||||
|
|
||||||
|
@ -3422,7 +3437,7 @@ static struct ggml_tensor * ggml_div_impl(
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
bool inplace) {
|
bool inplace) {
|
||||||
GGML_ASSERT(ggml_are_same_shape(a, b));
|
GGML_ASSERT(ggml_can_repeat(b, a));
|
||||||
|
|
||||||
bool is_node = false;
|
bool is_node = false;
|
||||||
|
|
||||||
|
@ -4060,6 +4075,49 @@ struct ggml_tensor * ggml_mul_mat(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ggml_mul_mat_id
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_mul_mat_id(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * as[],
|
||||||
|
struct ggml_tensor * ids,
|
||||||
|
int id,
|
||||||
|
struct ggml_tensor * b) {
|
||||||
|
|
||||||
|
int64_t n_as = ids->ne[0];
|
||||||
|
|
||||||
|
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
||||||
|
GGML_ASSERT(ggml_is_vector(ids));
|
||||||
|
GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
|
||||||
|
GGML_ASSERT(id >= 0 && id < n_as);
|
||||||
|
|
||||||
|
bool is_node = false;
|
||||||
|
|
||||||
|
if (as[0]->grad || b->grad) {
|
||||||
|
is_node = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
|
||||||
|
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
|
||||||
|
|
||||||
|
ggml_set_op_params_i32(result, 0, id);
|
||||||
|
|
||||||
|
result->op = GGML_OP_MUL_MAT_ID;
|
||||||
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||||
|
result->src[0] = ids;
|
||||||
|
result->src[1] = b;
|
||||||
|
|
||||||
|
for (int64_t i = 0; i < n_as; i++) {
|
||||||
|
struct ggml_tensor * a = as[i];
|
||||||
|
GGML_ASSERT(ggml_are_same_shape(as[0], a));
|
||||||
|
GGML_ASSERT(ggml_can_mul_mat(a, b));
|
||||||
|
GGML_ASSERT(!ggml_is_transposed(a));
|
||||||
|
result->src[i + 2] = a;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
// ggml_out_prod
|
// ggml_out_prod
|
||||||
|
|
||||||
struct ggml_tensor * ggml_out_prod(
|
struct ggml_tensor * ggml_out_prod(
|
||||||
|
@ -4213,7 +4271,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
size_t nb1,
|
size_t nb1,
|
||||||
size_t offset) {
|
size_t offset) {
|
||||||
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
|
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ggml_cpy
|
// ggml_cpy
|
||||||
|
@ -4830,7 +4888,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
||||||
static struct ggml_tensor * ggml_soft_max_impl(
|
static struct ggml_tensor * ggml_soft_max_impl(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * mask,
|
||||||
|
float scale,
|
||||||
bool inplace) {
|
bool inplace) {
|
||||||
|
GGML_ASSERT(ggml_is_contiguous(a));
|
||||||
|
if (mask) {
|
||||||
|
GGML_ASSERT(ggml_is_contiguous(mask));
|
||||||
|
GGML_ASSERT(mask->ne[2] == 1);
|
||||||
|
GGML_ASSERT(mask->ne[3] == 1);
|
||||||
|
GGML_ASSERT(ggml_can_repeat_rows(mask, a));
|
||||||
|
}
|
||||||
|
|
||||||
bool is_node = false;
|
bool is_node = false;
|
||||||
|
|
||||||
if (a->grad) {
|
if (a->grad) {
|
||||||
|
@ -4839,9 +4907,13 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
||||||
|
|
||||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||||
|
|
||||||
|
float params[] = { scale };
|
||||||
|
ggml_set_op_params(result, params, sizeof(params));
|
||||||
|
|
||||||
result->op = GGML_OP_SOFT_MAX;
|
result->op = GGML_OP_SOFT_MAX;
|
||||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||||
result->src[0] = a;
|
result->src[0] = a;
|
||||||
|
result->src[1] = mask;
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -4849,13 +4921,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
||||||
struct ggml_tensor * ggml_soft_max(
|
struct ggml_tensor * ggml_soft_max(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a) {
|
struct ggml_tensor * a) {
|
||||||
return ggml_soft_max_impl(ctx, a, false);
|
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * ggml_soft_max_inplace(
|
struct ggml_tensor * ggml_soft_max_inplace(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a) {
|
struct ggml_tensor * a) {
|
||||||
return ggml_soft_max_impl(ctx, a, true);
|
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_soft_max_ext(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * mask,
|
||||||
|
float scale) {
|
||||||
|
return ggml_soft_max_impl(ctx, a, mask, scale, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ggml_soft_max_back
|
// ggml_soft_max_back
|
||||||
|
@ -5450,6 +5530,43 @@ struct ggml_tensor * ggml_upscale(
|
||||||
return ggml_upscale_impl(ctx, a, scale_factor);
|
return ggml_upscale_impl(ctx, a, scale_factor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ggml_argsort
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_argsort(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
enum ggml_sort_order order) {
|
||||||
|
bool is_node = false;
|
||||||
|
|
||||||
|
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
|
||||||
|
|
||||||
|
ggml_set_op_params_i32(result, 0, (int32_t) order);
|
||||||
|
|
||||||
|
result->op = GGML_OP_ARGSORT;
|
||||||
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||||
|
result->src[0] = a;
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ggml_top_k
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_top_k(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
int k) {
|
||||||
|
GGML_ASSERT(a->ne[0] >= k);
|
||||||
|
|
||||||
|
struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
|
||||||
|
|
||||||
|
result = ggml_view_4d(ctx, result,
|
||||||
|
k, result->ne[1], result->ne[2], result->ne[3],
|
||||||
|
result->nb[1], result->nb[2], result->nb[3],
|
||||||
|
0);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
// ggml_flash_attn
|
// ggml_flash_attn
|
||||||
|
|
||||||
struct ggml_tensor * ggml_flash_attn(
|
struct ggml_tensor * ggml_flash_attn(
|
||||||
|
@ -6809,7 +6926,7 @@ static void ggml_compute_forward_add_f32(
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
|
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
|
||||||
|
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
return;
|
return;
|
||||||
|
@ -6842,16 +6959,19 @@ static void ggml_compute_forward_add_f32(
|
||||||
const int64_t i13 = i03 % ne13;
|
const int64_t i13 = i03 % ne13;
|
||||||
const int64_t i12 = i02 % ne12;
|
const int64_t i12 = i02 % ne12;
|
||||||
const int64_t i11 = i01 % ne11;
|
const int64_t i11 = i01 % ne11;
|
||||||
|
const int64_t nr0 = ne00 / ne10;
|
||||||
|
|
||||||
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||||
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||||
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
||||||
|
|
||||||
|
for (int64_t r = 0; r < nr0; ++r) {
|
||||||
#ifdef GGML_USE_ACCELERATE
|
#ifdef GGML_USE_ACCELERATE
|
||||||
vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
|
vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
|
||||||
#else
|
#else
|
||||||
ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
|
ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// src1 is not contiguous
|
// src1 is not contiguous
|
||||||
|
@ -6868,8 +6988,9 @@ static void ggml_compute_forward_add_f32(
|
||||||
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||||
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||||
|
|
||||||
for (int i0 = 0; i0 < ne0; i0++) {
|
for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
||||||
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
|
const int64_t i10 = i0 % ne10;
|
||||||
|
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
|
||||||
|
|
||||||
dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
|
dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
|
||||||
}
|
}
|
||||||
|
@ -7589,7 +7710,7 @@ static void ggml_compute_forward_mul_f32(
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
|
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
|
||||||
|
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
return;
|
return;
|
||||||
|
@ -7612,7 +7733,6 @@ static void ggml_compute_forward_mul_f32(
|
||||||
|
|
||||||
GGML_ASSERT( nb0 == sizeof(float));
|
GGML_ASSERT( nb0 == sizeof(float));
|
||||||
GGML_ASSERT(nb00 == sizeof(float));
|
GGML_ASSERT(nb00 == sizeof(float));
|
||||||
GGML_ASSERT(ne00 == ne10);
|
|
||||||
|
|
||||||
if (nb10 == sizeof(float)) {
|
if (nb10 == sizeof(float)) {
|
||||||
for (int64_t ir = ith; ir < nr; ir += nth) {
|
for (int64_t ir = ith; ir < nr; ir += nth) {
|
||||||
|
@ -7624,20 +7744,21 @@ static void ggml_compute_forward_mul_f32(
|
||||||
const int64_t i13 = i03 % ne13;
|
const int64_t i13 = i03 % ne13;
|
||||||
const int64_t i12 = i02 % ne12;
|
const int64_t i12 = i02 % ne12;
|
||||||
const int64_t i11 = i01 % ne11;
|
const int64_t i11 = i01 % ne11;
|
||||||
|
const int64_t nr0 = ne00 / ne10;
|
||||||
|
|
||||||
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||||
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||||
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
||||||
|
|
||||||
|
for (int64_t r = 0 ; r < nr0; ++r) {
|
||||||
#ifdef GGML_USE_ACCELERATE
|
#ifdef GGML_USE_ACCELERATE
|
||||||
UNUSED(ggml_vec_mul_f32);
|
UNUSED(ggml_vec_mul_f32);
|
||||||
|
|
||||||
vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
|
vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
|
||||||
#else
|
#else
|
||||||
ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
|
ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
|
||||||
#endif
|
#endif
|
||||||
// }
|
}
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// src1 is not contiguous
|
// src1 is not contiguous
|
||||||
|
@ -7655,8 +7776,9 @@ static void ggml_compute_forward_mul_f32(
|
||||||
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||||
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||||
|
|
||||||
for (int64_t i0 = 0; i0 < ne00; i0++) {
|
for (int64_t i0 = 0; i0 < ne00; ++i0) {
|
||||||
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
|
const int64_t i10 = i0 % ne10;
|
||||||
|
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
|
||||||
|
|
||||||
dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
|
dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
|
||||||
}
|
}
|
||||||
|
@ -7690,14 +7812,16 @@ static void ggml_compute_forward_div_f32(
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
assert(params->ith == 0);
|
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
|
||||||
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
|
||||||
|
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int nr = ggml_nrows(src0);
|
const int ith = params->ith;
|
||||||
|
const int nth = params->nth;
|
||||||
|
|
||||||
|
const int64_t nr = ggml_nrows(src0);
|
||||||
|
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS
|
GGML_TENSOR_BINARY_OP_LOCALS
|
||||||
|
|
||||||
|
@ -7705,41 +7829,50 @@ static void ggml_compute_forward_div_f32(
|
||||||
GGML_ASSERT(nb00 == sizeof(float));
|
GGML_ASSERT(nb00 == sizeof(float));
|
||||||
|
|
||||||
if (nb10 == sizeof(float)) {
|
if (nb10 == sizeof(float)) {
|
||||||
for (int ir = 0; ir < nr; ++ir) {
|
for (int64_t ir = ith; ir < nr; ir += nth) {
|
||||||
// src0, src1 and dst are same shape => same indices
|
// src0 and dst are same shape => same indices
|
||||||
const int i3 = ir/(ne2*ne1);
|
const int64_t i03 = ir/(ne02*ne01);
|
||||||
const int i2 = (ir - i3*ne2*ne1)/ne1;
|
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
||||||
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
|
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||||
|
|
||||||
|
const int64_t i13 = i03 % ne13;
|
||||||
|
const int64_t i12 = i02 % ne12;
|
||||||
|
const int64_t i11 = i01 % ne11;
|
||||||
|
const int64_t nr0 = ne00 / ne10;
|
||||||
|
|
||||||
|
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||||
|
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||||
|
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
||||||
|
|
||||||
|
for (int64_t r = 0; r < nr0; ++r) {
|
||||||
#ifdef GGML_USE_ACCELERATE
|
#ifdef GGML_USE_ACCELERATE
|
||||||
UNUSED(ggml_vec_div_f32);
|
UNUSED(ggml_vec_div_f32);
|
||||||
|
|
||||||
vDSP_vdiv(
|
vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
|
||||||
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
|
|
||||||
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
|
|
||||||
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
|
|
||||||
ne0);
|
|
||||||
#else
|
#else
|
||||||
ggml_vec_div_f32(ne0,
|
ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
|
||||||
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
|
|
||||||
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
|
|
||||||
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
|
|
||||||
#endif
|
#endif
|
||||||
// }
|
}
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// src1 is not contiguous
|
// src1 is not contiguous
|
||||||
for (int ir = 0; ir < nr; ++ir) {
|
for (int64_t ir = ith; ir < nr; ir += nth) {
|
||||||
// src0, src1 and dst are same shape => same indices
|
// src0 and dst are same shape => same indices
|
||||||
const int i3 = ir/(ne2*ne1);
|
// src1 is broadcastable across src0 and dst in i1, i2, i3
|
||||||
const int i2 = (ir - i3*ne2*ne1)/ne1;
|
const int64_t i03 = ir/(ne02*ne01);
|
||||||
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
|
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
||||||
|
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||||
|
|
||||||
float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
|
const int64_t i13 = i03 % ne13;
|
||||||
float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
|
const int64_t i12 = i02 % ne12;
|
||||||
for (int i0 = 0; i0 < ne0; i0++) {
|
const int64_t i11 = i01 % ne11;
|
||||||
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
|
|
||||||
|
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||||
|
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||||
|
|
||||||
|
for (int64_t i0 = 0; i0 < ne00; ++i0) {
|
||||||
|
const int64_t i10 = i0 % ne10;
|
||||||
|
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
|
||||||
|
|
||||||
dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
|
dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
|
||||||
}
|
}
|
||||||
|
@ -8185,7 +8318,7 @@ static void ggml_compute_forward_repeat_f16(
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_TENSOR_UNARY_OP_LOCALS;
|
GGML_TENSOR_UNARY_OP_LOCALS
|
||||||
|
|
||||||
// guaranteed to be an integer due to the check in ggml_can_repeat
|
// guaranteed to be an integer due to the check in ggml_can_repeat
|
||||||
const int nr0 = (int)(ne0/ne00);
|
const int nr0 = (int)(ne0/ne00);
|
||||||
|
@ -8330,6 +8463,7 @@ static void ggml_compute_forward_concat_f32(
|
||||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||||
|
|
||||||
const int ith = params->ith;
|
const int ith = params->ith;
|
||||||
|
const int nth = params->nth;
|
||||||
|
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS
|
GGML_TENSOR_BINARY_OP_LOCALS
|
||||||
|
|
||||||
|
@ -8339,7 +8473,7 @@ static void ggml_compute_forward_concat_f32(
|
||||||
GGML_ASSERT(nb10 == sizeof(float));
|
GGML_ASSERT(nb10 == sizeof(float));
|
||||||
|
|
||||||
for (int i3 = 0; i3 < ne3; i3++) {
|
for (int i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int i2 = ith; i2 < ne2; i2++) {
|
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
||||||
if (i2 < ne02) { // src0
|
if (i2 < ne02) { // src0
|
||||||
for (int i1 = 0; i1 < ne1; i1++) {
|
for (int i1 = 0; i1 < ne1; i1++) {
|
||||||
for (int i0 = 0; i0 < ne0; i0++) {
|
for (int i0 = 0; i0 < ne0; i0++) {
|
||||||
|
@ -9377,7 +9511,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
||||||
// TODO: find the optimal values for these
|
// TODO: find the optimal values for these
|
||||||
if (ggml_is_contiguous(src0) &&
|
if (ggml_is_contiguous(src0) &&
|
||||||
ggml_is_contiguous(src1) &&
|
ggml_is_contiguous(src1) &&
|
||||||
src0->type == GGML_TYPE_F32 &&
|
//src0->type == GGML_TYPE_F32 &&
|
||||||
src1->type == GGML_TYPE_F32 &&
|
src1->type == GGML_TYPE_F32 &&
|
||||||
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
||||||
|
|
||||||
|
@ -9499,6 +9633,8 @@ static void ggml_compute_forward_mul_mat(
|
||||||
char * wdata = params->wdata;
|
char * wdata = params->wdata;
|
||||||
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
|
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
|
||||||
|
|
||||||
|
assert(params->wsize >= ne11*ne12*ne13*row_size);
|
||||||
|
|
||||||
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
||||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||||
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
||||||
|
@ -9600,6 +9736,26 @@ static void ggml_compute_forward_mul_mat(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ggml_compute_forward_mul_mat_id
|
||||||
|
|
||||||
|
static void ggml_compute_forward_mul_mat_id(
|
||||||
|
const struct ggml_compute_params * params,
|
||||||
|
struct ggml_tensor * dst) {
|
||||||
|
|
||||||
|
const struct ggml_tensor * ids = dst->src[0];
|
||||||
|
const struct ggml_tensor * src1 = dst->src[1];
|
||||||
|
|
||||||
|
const int id = ggml_get_op_params_i32(dst, 0);
|
||||||
|
|
||||||
|
const int a_id = ((int32_t *)ids->data)[id];
|
||||||
|
|
||||||
|
GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
|
||||||
|
|
||||||
|
const struct ggml_tensor * src0 = dst->src[a_id + 2];
|
||||||
|
|
||||||
|
ggml_compute_forward_mul_mat(params, src0, src1, dst);
|
||||||
|
}
|
||||||
|
|
||||||
// ggml_compute_forward_out_prod
|
// ggml_compute_forward_out_prod
|
||||||
|
|
||||||
static void ggml_compute_forward_out_prod_f32(
|
static void ggml_compute_forward_out_prod_f32(
|
||||||
|
@ -10555,20 +10711,25 @@ static void ggml_compute_forward_diag_mask_zero(
|
||||||
static void ggml_compute_forward_soft_max_f32(
|
static void ggml_compute_forward_soft_max_f32(
|
||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
struct ggml_tensor * dst) {
|
const struct ggml_tensor * src1,
|
||||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
struct ggml_tensor * dst) {
|
||||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
assert(ggml_is_contiguous(dst));
|
||||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
assert(ggml_are_same_shape(src0, dst));
|
||||||
|
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
float scale = 1.0f;
|
||||||
|
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
||||||
|
|
||||||
// TODO: handle transposed/permuted matrices
|
// TODO: handle transposed/permuted matrices
|
||||||
|
|
||||||
const int ith = params->ith;
|
const int ith = params->ith;
|
||||||
const int nth = params->nth;
|
const int nth = params->nth;
|
||||||
|
|
||||||
|
const int64_t ne11 = src1 ? src1->ne[1] : 1;
|
||||||
|
|
||||||
const int nc = src0->ne[0];
|
const int nc = src0->ne[0];
|
||||||
const int nr = ggml_nrows(src0);
|
const int nr = ggml_nrows(src0);
|
||||||
|
|
||||||
|
@ -10579,29 +10740,40 @@ static void ggml_compute_forward_soft_max_f32(
|
||||||
const int ir0 = dr*ith;
|
const int ir0 = dr*ith;
|
||||||
const int ir1 = MIN(ir0 + dr, nr);
|
const int ir1 = MIN(ir0 + dr, nr);
|
||||||
|
|
||||||
|
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
|
||||||
|
|
||||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||||
float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
||||||
float *dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
||||||
|
|
||||||
|
// broadcast the mask across rows
|
||||||
|
float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
|
||||||
|
|
||||||
|
ggml_vec_cpy_f32 (nc, wp, sp);
|
||||||
|
ggml_vec_scale_f32(nc, wp, scale);
|
||||||
|
if (mp) {
|
||||||
|
ggml_vec_acc_f32(nc, wp, mp);
|
||||||
|
}
|
||||||
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
for (int i = 0; i < nc; ++i) {
|
for (int i = 0; i < nc; ++i) {
|
||||||
//printf("p[%d] = %f\n", i, p[i]);
|
//printf("p[%d] = %f\n", i, p[i]);
|
||||||
assert(!isnan(sp[i]));
|
assert(!isnan(wp[i]));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
float max = -INFINITY;
|
float max = -INFINITY;
|
||||||
ggml_vec_max_f32(nc, &max, sp);
|
ggml_vec_max_f32(nc, &max, wp);
|
||||||
|
|
||||||
ggml_float sum = 0.0;
|
ggml_float sum = 0.0;
|
||||||
|
|
||||||
uint16_t scvt;
|
uint16_t scvt;
|
||||||
for (int i = 0; i < nc; i++) {
|
for (int i = 0; i < nc; i++) {
|
||||||
if (sp[i] == -INFINITY) {
|
if (wp[i] == -INFINITY) {
|
||||||
dp[i] = 0.0f;
|
dp[i] = 0.0f;
|
||||||
} else {
|
} else {
|
||||||
// const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
|
// const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
|
||||||
ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
|
ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
|
||||||
memcpy(&scvt, &s, sizeof(scvt));
|
memcpy(&scvt, &s, sizeof(scvt));
|
||||||
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
|
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
|
||||||
sum += (ggml_float)val;
|
sum += (ggml_float)val;
|
||||||
|
@ -10626,11 +10798,12 @@ static void ggml_compute_forward_soft_max_f32(
|
||||||
static void ggml_compute_forward_soft_max(
|
static void ggml_compute_forward_soft_max(
|
||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
struct ggml_tensor * dst) {
|
const struct ggml_tensor * src1,
|
||||||
|
struct ggml_tensor * dst) {
|
||||||
switch (src0->type) {
|
switch (src0->type) {
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_soft_max_f32(params, src0, dst);
|
ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
|
@ -11986,6 +12159,67 @@ static void ggml_compute_forward_upscale(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ggml_compute_forward_argsort
|
||||||
|
|
||||||
|
static void ggml_compute_forward_argsort_f32(
|
||||||
|
const struct ggml_compute_params * params,
|
||||||
|
const struct ggml_tensor * src0,
|
||||||
|
struct ggml_tensor * dst) {
|
||||||
|
|
||||||
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_TENSOR_UNARY_OP_LOCALS
|
||||||
|
|
||||||
|
GGML_ASSERT(nb0 == sizeof(float));
|
||||||
|
|
||||||
|
const int ith = params->ith;
|
||||||
|
const int nth = params->nth;
|
||||||
|
|
||||||
|
const int64_t nr = ggml_nrows(src0);
|
||||||
|
|
||||||
|
enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
|
||||||
|
|
||||||
|
for (int64_t i = ith; i < nr; i += nth) {
|
||||||
|
int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
|
||||||
|
const float * src_data = (float *)((char *) src0->data + i*nb01);
|
||||||
|
|
||||||
|
for (int64_t j = 0; j < ne0; j++) {
|
||||||
|
dst_data[j] = j;
|
||||||
|
}
|
||||||
|
|
||||||
|
// C doesn't have a functional sort, so we do a bubble sort instead
|
||||||
|
for (int64_t j = 0; j < ne0; j++) {
|
||||||
|
for (int64_t k = j + 1; k < ne0; k++) {
|
||||||
|
if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
|
||||||
|
(order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
|
||||||
|
int32_t tmp = dst_data[j];
|
||||||
|
dst_data[j] = dst_data[k];
|
||||||
|
dst_data[k] = tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_compute_forward_argsort(
|
||||||
|
const struct ggml_compute_params * params,
|
||||||
|
const struct ggml_tensor * src0,
|
||||||
|
struct ggml_tensor * dst) {
|
||||||
|
|
||||||
|
switch (src0->type) {
|
||||||
|
case GGML_TYPE_F32:
|
||||||
|
{
|
||||||
|
ggml_compute_forward_argsort_f32(params, src0, dst);
|
||||||
|
} break;
|
||||||
|
default:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
} break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ggml_compute_forward_flash_attn
|
// ggml_compute_forward_flash_attn
|
||||||
|
|
||||||
static void ggml_compute_forward_flash_attn_f32(
|
static void ggml_compute_forward_flash_attn_f32(
|
||||||
|
@ -13821,6 +14055,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||||
{
|
{
|
||||||
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
|
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_MUL_MAT_ID:
|
||||||
|
{
|
||||||
|
ggml_compute_forward_mul_mat_id(params, tensor);
|
||||||
|
} break;
|
||||||
case GGML_OP_OUT_PROD:
|
case GGML_OP_OUT_PROD:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
|
ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
|
||||||
|
@ -13879,7 +14117,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_SOFT_MAX:
|
case GGML_OP_SOFT_MAX:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
|
ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_SOFT_MAX_BACK:
|
case GGML_OP_SOFT_MAX_BACK:
|
||||||
{
|
{
|
||||||
|
@ -13925,6 +14163,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||||
{
|
{
|
||||||
ggml_compute_forward_upscale(params, tensor->src[0], tensor);
|
ggml_compute_forward_upscale(params, tensor->src[0], tensor);
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_ARGSORT:
|
||||||
|
{
|
||||||
|
ggml_compute_forward_argsort(params, tensor->src[0], tensor);
|
||||||
|
} break;
|
||||||
case GGML_OP_FLASH_ATTN:
|
case GGML_OP_FLASH_ATTN:
|
||||||
{
|
{
|
||||||
const int32_t t = ggml_get_op_params_i32(tensor, 0);
|
const int32_t t = ggml_get_op_params_i32(tensor, 0);
|
||||||
|
@ -14575,6 +14817,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||||
zero_table);
|
zero_table);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_MUL_MAT_ID:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(false); // TODO: not implemented
|
||||||
|
} break;
|
||||||
case GGML_OP_OUT_PROD:
|
case GGML_OP_OUT_PROD:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(false); // TODO: not implemented
|
GGML_ASSERT(false); // TODO: not implemented
|
||||||
|
@ -14913,6 +15159,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||||
{
|
{
|
||||||
GGML_ASSERT(false); // TODO: not implemented
|
GGML_ASSERT(false); // TODO: not implemented
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_ARGSORT:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(false); // TODO: not implemented
|
||||||
|
} break;
|
||||||
case GGML_OP_FLASH_ATTN:
|
case GGML_OP_FLASH_ATTN:
|
||||||
{
|
{
|
||||||
struct ggml_tensor * flash_grad = NULL;
|
struct ggml_tensor * flash_grad = NULL;
|
||||||
|
@ -15273,12 +15523,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
|
||||||
return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
|
return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
|
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
|
||||||
const size_t obj_size = sizeof(struct ggml_cgraph);
|
struct ggml_cgraph cgraph = {
|
||||||
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
|
|
||||||
struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
|
|
||||||
|
|
||||||
*cgraph = (struct ggml_cgraph) {
|
|
||||||
/*.size =*/ 0,
|
/*.size =*/ 0,
|
||||||
/*.n_nodes =*/ i1 - i0,
|
/*.n_nodes =*/ i1 - i0,
|
||||||
/*.n_leafs =*/ 0,
|
/*.n_leafs =*/ 0,
|
||||||
|
@ -15513,7 +15759,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
n_tasks = n_threads;
|
n_tasks = n_threads;
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_SUB:
|
case GGML_OP_SUB:
|
||||||
case GGML_OP_DIV:
|
|
||||||
case GGML_OP_SQR:
|
case GGML_OP_SQR:
|
||||||
case GGML_OP_SQRT:
|
case GGML_OP_SQRT:
|
||||||
case GGML_OP_LOG:
|
case GGML_OP_LOG:
|
||||||
|
@ -15546,10 +15791,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
n_tasks = n_threads;
|
||||||
} break;
|
} break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case GGML_OP_SILU_BACK:
|
case GGML_OP_SILU_BACK:
|
||||||
case GGML_OP_MUL:
|
case GGML_OP_MUL:
|
||||||
|
case GGML_OP_DIV:
|
||||||
case GGML_OP_NORM:
|
case GGML_OP_NORM:
|
||||||
case GGML_OP_RMS_NORM:
|
case GGML_OP_RMS_NORM:
|
||||||
case GGML_OP_RMS_NORM_BACK:
|
case GGML_OP_RMS_NORM_BACK:
|
||||||
|
@ -15592,6 +15840,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_MUL_MAT_ID:
|
||||||
|
{
|
||||||
|
// FIXME: blas
|
||||||
|
n_tasks = n_threads;
|
||||||
|
} break;
|
||||||
case GGML_OP_OUT_PROD:
|
case GGML_OP_OUT_PROD:
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
n_tasks = n_threads;
|
||||||
|
@ -15611,7 +15864,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_DIAG_MASK_ZERO:
|
case GGML_OP_DIAG_MASK_ZERO:
|
||||||
case GGML_OP_DIAG_MASK_INF:
|
case GGML_OP_DIAG_MASK_INF:
|
||||||
case GGML_OP_SOFT_MAX:
|
|
||||||
case GGML_OP_SOFT_MAX_BACK:
|
case GGML_OP_SOFT_MAX_BACK:
|
||||||
case GGML_OP_ROPE:
|
case GGML_OP_ROPE:
|
||||||
case GGML_OP_ROPE_BACK:
|
case GGML_OP_ROPE_BACK:
|
||||||
|
@ -15627,6 +15879,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
{
|
{
|
||||||
n_tasks = 1; //TODO
|
n_tasks = 1; //TODO
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_SOFT_MAX:
|
||||||
|
{
|
||||||
|
n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
|
||||||
|
} break;
|
||||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
n_tasks = n_threads;
|
||||||
|
@ -15648,6 +15904,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
n_tasks = n_threads;
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_ARGSORT:
|
||||||
|
{
|
||||||
|
n_tasks = n_threads;
|
||||||
|
} break;
|
||||||
case GGML_OP_FLASH_ATTN:
|
case GGML_OP_FLASH_ATTN:
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
n_tasks = n_threads;
|
||||||
|
@ -15716,7 +15976,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
|
fprintf(stderr, "%s: op not implemented: ", __func__);
|
||||||
|
if (node->op < GGML_OP_COUNT) {
|
||||||
|
fprintf(stderr, "%s\n", ggml_op_name(node->op));
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "%d\n", node->op);
|
||||||
|
}
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
} break;
|
} break;
|
||||||
}
|
}
|
||||||
|
@ -15857,18 +16122,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||||
|
|
||||||
// thread scheduling for the different operations + work buffer size estimation
|
// thread scheduling for the different operations + work buffer size estimation
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
int n_tasks = 1;
|
|
||||||
|
|
||||||
struct ggml_tensor * node = cgraph->nodes[i];
|
struct ggml_tensor * node = cgraph->nodes[i];
|
||||||
|
|
||||||
|
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
||||||
|
|
||||||
size_t cur = 0;
|
size_t cur = 0;
|
||||||
|
|
||||||
switch (node->op) {
|
switch (node->op) {
|
||||||
case GGML_OP_CPY:
|
case GGML_OP_CPY:
|
||||||
case GGML_OP_DUP:
|
case GGML_OP_DUP:
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
|
||||||
|
|
||||||
if (ggml_is_quantized(node->type)) {
|
if (ggml_is_quantized(node->type)) {
|
||||||
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
||||||
}
|
}
|
||||||
|
@ -15876,16 +16139,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
case GGML_OP_ADD1:
|
case GGML_OP_ADD1:
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
|
||||||
|
|
||||||
if (ggml_is_quantized(node->src[0]->type)) {
|
if (ggml_is_quantized(node->src[0]->type)) {
|
||||||
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_ACC:
|
case GGML_OP_ACC:
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
|
||||||
|
|
||||||
if (ggml_is_quantized(node->src[0]->type)) {
|
if (ggml_is_quantized(node->src[0]->type)) {
|
||||||
cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
|
||||||
}
|
}
|
||||||
|
@ -15911,14 +16170,33 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||||
cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
|
cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_MUL_MAT_ID:
|
||||||
|
{
|
||||||
|
const struct ggml_tensor * a = node->src[2];
|
||||||
|
const struct ggml_tensor * b = node->src[1];
|
||||||
|
const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
|
||||||
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||||
|
if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
|
||||||
|
if (a->type != GGML_TYPE_F32) {
|
||||||
|
// here we need memory just for single 2D matrix from src0
|
||||||
|
cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
|
||||||
|
}
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
if (b->type != vec_dot_type) {
|
||||||
|
cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case GGML_OP_OUT_PROD:
|
case GGML_OP_OUT_PROD:
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
|
||||||
|
|
||||||
if (ggml_is_quantized(node->src[0]->type)) {
|
if (ggml_is_quantized(node->src[0]->type)) {
|
||||||
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_SOFT_MAX:
|
||||||
|
{
|
||||||
|
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
||||||
|
} break;
|
||||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(node->src[0]->ne[3] == 1);
|
GGML_ASSERT(node->src[0]->ne[3] == 1);
|
||||||
|
@ -15944,10 +16222,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_IM2COL:
|
|
||||||
{
|
|
||||||
n_tasks = n_threads;
|
|
||||||
} break;
|
|
||||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||||
{
|
{
|
||||||
const int64_t ne00 = node->src[0]->ne[0]; // W
|
const int64_t ne00 = node->src[0]->ne[0]; // W
|
||||||
|
@ -15964,8 +16238,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_FLASH_ATTN:
|
case GGML_OP_FLASH_ATTN:
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
|
||||||
|
|
||||||
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
|
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
|
||||||
|
|
||||||
if (node->src[1]->type == GGML_TYPE_F32) {
|
if (node->src[1]->type == GGML_TYPE_F32) {
|
||||||
|
@ -15978,8 +16250,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_FLASH_FF:
|
case GGML_OP_FLASH_FF:
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
|
||||||
|
|
||||||
if (node->src[1]->type == GGML_TYPE_F32) {
|
if (node->src[1]->type == GGML_TYPE_F32) {
|
||||||
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
||||||
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
||||||
|
@ -15990,8 +16260,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_FLASH_ATTN_BACK:
|
case GGML_OP_FLASH_ATTN_BACK:
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
|
||||||
|
|
||||||
const int64_t D = node->src[0]->ne[0];
|
const int64_t D = node->src[0]->ne[0];
|
||||||
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
|
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
|
||||||
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
|
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
|
||||||
|
@ -16006,8 +16274,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||||
|
|
||||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||||
{
|
{
|
||||||
n_tasks = n_threads;
|
|
||||||
|
|
||||||
cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
|
cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_COUNT:
|
case GGML_OP_COUNT:
|
||||||
|
@ -17794,8 +18060,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
|
||||||
memcpy(&qh, &y[i].qh, sizeof(qh));
|
memcpy(&qh, &y[i].qh, sizeof(qh));
|
||||||
|
|
||||||
for (int j = 0; j < QK5_0; j += 2) {
|
for (int j = 0; j < QK5_0; j += 2) {
|
||||||
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
||||||
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
||||||
|
|
||||||
// cast to 16 bins
|
// cast to 16 bins
|
||||||
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
||||||
|
@ -17824,8 +18090,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
|
||||||
memcpy(&qh, &y[i].qh, sizeof(qh));
|
memcpy(&qh, &y[i].qh, sizeof(qh));
|
||||||
|
|
||||||
for (int j = 0; j < QK5_1; j += 2) {
|
for (int j = 0; j < QK5_1; j += 2) {
|
||||||
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
||||||
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
||||||
|
|
||||||
// cast to 16 bins
|
// cast to 16 bins
|
||||||
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
||||||
|
@ -18015,6 +18281,7 @@ struct gguf_kv {
|
||||||
|
|
||||||
struct gguf_header {
|
struct gguf_header {
|
||||||
char magic[4];
|
char magic[4];
|
||||||
|
|
||||||
uint32_t version;
|
uint32_t version;
|
||||||
uint64_t n_tensors; // GGUFv2
|
uint64_t n_tensors; // GGUFv2
|
||||||
uint64_t n_kv; // GGUFv2
|
uint64_t n_kv; // GGUFv2
|
||||||
|
@ -18104,7 +18371,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
|
|
||||||
for (uint32_t i = 0; i < sizeof(magic); i++) {
|
for (uint32_t i = 0; i < sizeof(magic); i++) {
|
||||||
if (magic[i] != GGUF_MAGIC[i]) {
|
if (magic[i] != GGUF_MAGIC[i]) {
|
||||||
fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
|
fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
|
||||||
fclose(file);
|
fclose(file);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
@ -18119,7 +18386,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
{
|
{
|
||||||
strncpy(ctx->header.magic, magic, 4);
|
strncpy(ctx->header.magic, magic, 4);
|
||||||
|
|
||||||
|
|
||||||
ctx->kv = NULL;
|
ctx->kv = NULL;
|
||||||
ctx->infos = NULL;
|
ctx->infos = NULL;
|
||||||
ctx->data = NULL;
|
ctx->data = NULL;
|
||||||
|
|
66
ggml.h
66
ggml.h
|
@ -244,11 +244,10 @@
|
||||||
#define GGML_ASSERT(x) \
|
#define GGML_ASSERT(x) \
|
||||||
do { \
|
do { \
|
||||||
if (!(x)) { \
|
if (!(x)) { \
|
||||||
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
|
||||||
fflush(stderr); \
|
|
||||||
fflush(stdout); \
|
fflush(stdout); \
|
||||||
|
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
||||||
ggml_print_backtrace(); \
|
ggml_print_backtrace(); \
|
||||||
exit(1); \
|
abort(); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
@ -284,6 +283,20 @@
|
||||||
const type prefix##3 = (pointer)->array[3]; \
|
const type prefix##3 = (pointer)->array[3]; \
|
||||||
GGML_UNUSED(prefix##3);
|
GGML_UNUSED(prefix##3);
|
||||||
|
|
||||||
|
#define GGML_TENSOR_UNARY_OP_LOCALS \
|
||||||
|
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
||||||
|
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
||||||
|
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
||||||
|
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||||
|
|
||||||
|
#define GGML_TENSOR_BINARY_OP_LOCALS \
|
||||||
|
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
||||||
|
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
||||||
|
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
|
||||||
|
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
|
||||||
|
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
||||||
|
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
@ -382,6 +395,7 @@ extern "C" {
|
||||||
GGML_OP_GROUP_NORM,
|
GGML_OP_GROUP_NORM,
|
||||||
|
|
||||||
GGML_OP_MUL_MAT,
|
GGML_OP_MUL_MAT,
|
||||||
|
GGML_OP_MUL_MAT_ID,
|
||||||
GGML_OP_OUT_PROD,
|
GGML_OP_OUT_PROD,
|
||||||
|
|
||||||
GGML_OP_SCALE,
|
GGML_OP_SCALE,
|
||||||
|
@ -408,8 +422,8 @@ extern "C" {
|
||||||
GGML_OP_CONV_TRANSPOSE_2D,
|
GGML_OP_CONV_TRANSPOSE_2D,
|
||||||
GGML_OP_POOL_1D,
|
GGML_OP_POOL_1D,
|
||||||
GGML_OP_POOL_2D,
|
GGML_OP_POOL_2D,
|
||||||
|
|
||||||
GGML_OP_UPSCALE, // nearest interpolate
|
GGML_OP_UPSCALE, // nearest interpolate
|
||||||
|
GGML_OP_ARGSORT,
|
||||||
|
|
||||||
GGML_OP_FLASH_ATTN,
|
GGML_OP_FLASH_ATTN,
|
||||||
GGML_OP_FLASH_FF,
|
GGML_OP_FLASH_FF,
|
||||||
|
@ -449,7 +463,9 @@ extern "C" {
|
||||||
GGML_UNARY_OP_GELU,
|
GGML_UNARY_OP_GELU,
|
||||||
GGML_UNARY_OP_GELU_QUICK,
|
GGML_UNARY_OP_GELU_QUICK,
|
||||||
GGML_UNARY_OP_SILU,
|
GGML_UNARY_OP_SILU,
|
||||||
GGML_UNARY_OP_LEAKY
|
GGML_UNARY_OP_LEAKY,
|
||||||
|
|
||||||
|
GGML_UNARY_OP_COUNT,
|
||||||
};
|
};
|
||||||
|
|
||||||
enum ggml_object_type {
|
enum ggml_object_type {
|
||||||
|
@ -632,6 +648,9 @@ extern "C" {
|
||||||
GGML_API const char * ggml_op_name (enum ggml_op op);
|
GGML_API const char * ggml_op_name (enum ggml_op op);
|
||||||
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
||||||
|
|
||||||
|
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
||||||
|
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
||||||
|
|
||||||
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||||
|
|
||||||
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
||||||
|
@ -1028,6 +1047,15 @@ extern "C" {
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
|
// indirect matrix multiplication
|
||||||
|
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
|
||||||
|
GGML_API struct ggml_tensor * ggml_mul_mat_id(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * as[],
|
||||||
|
struct ggml_tensor * ids,
|
||||||
|
int id,
|
||||||
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
// A: m columns, n rows,
|
// A: m columns, n rows,
|
||||||
// B: p columns, n rows,
|
// B: p columns, n rows,
|
||||||
// result is m columns, p rows
|
// result is m columns, p rows
|
||||||
|
@ -1283,6 +1311,14 @@ extern "C" {
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
// fused soft_max(a*scale + mask)
|
||||||
|
// mask is optional
|
||||||
|
GGML_API struct ggml_tensor * ggml_soft_max_ext(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * mask,
|
||||||
|
float scale);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
@ -1513,6 +1549,23 @@ extern "C" {
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
int scale_factor);
|
int scale_factor);
|
||||||
|
|
||||||
|
// sort rows
|
||||||
|
enum ggml_sort_order {
|
||||||
|
GGML_SORT_ASC,
|
||||||
|
GGML_SORT_DESC,
|
||||||
|
};
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_argsort(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
enum ggml_sort_order order);
|
||||||
|
|
||||||
|
// top k elements per row
|
||||||
|
GGML_API struct ggml_tensor * ggml_top_k(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
int k);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_flash_attn(
|
GGML_API struct ggml_tensor * ggml_flash_attn(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * q,
|
struct ggml_tensor * q,
|
||||||
|
@ -1574,7 +1627,6 @@ extern "C" {
|
||||||
int kh);
|
int kh);
|
||||||
|
|
||||||
// used in sam
|
// used in sam
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_add_rel_pos(
|
GGML_API struct ggml_tensor * ggml_add_rel_pos(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
@ -1749,7 +1801,7 @@ extern "C" {
|
||||||
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
||||||
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
||||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||||
GGML_API struct ggml_cgraph * ggml_graph_view (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
|
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
|
||||||
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
||||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
||||||
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
||||||
|
|
|
@ -56,20 +56,21 @@ class Keys:
|
||||||
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
||||||
|
|
||||||
class Tokenizer:
|
class Tokenizer:
|
||||||
MODEL = "tokenizer.ggml.model"
|
MODEL = "tokenizer.ggml.model"
|
||||||
LIST = "tokenizer.ggml.tokens"
|
LIST = "tokenizer.ggml.tokens"
|
||||||
TOKEN_TYPE = "tokenizer.ggml.token_type"
|
TOKEN_TYPE = "tokenizer.ggml.token_type"
|
||||||
SCORES = "tokenizer.ggml.scores"
|
SCORES = "tokenizer.ggml.scores"
|
||||||
MERGES = "tokenizer.ggml.merges"
|
MERGES = "tokenizer.ggml.merges"
|
||||||
BOS_ID = "tokenizer.ggml.bos_token_id"
|
BOS_ID = "tokenizer.ggml.bos_token_id"
|
||||||
EOS_ID = "tokenizer.ggml.eos_token_id"
|
EOS_ID = "tokenizer.ggml.eos_token_id"
|
||||||
UNK_ID = "tokenizer.ggml.unknown_token_id"
|
UNK_ID = "tokenizer.ggml.unknown_token_id"
|
||||||
SEP_ID = "tokenizer.ggml.seperator_token_id"
|
SEP_ID = "tokenizer.ggml.seperator_token_id"
|
||||||
PAD_ID = "tokenizer.ggml.padding_token_id"
|
PAD_ID = "tokenizer.ggml.padding_token_id"
|
||||||
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
||||||
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
||||||
HF_JSON = "tokenizer.huggingface.json"
|
HF_JSON = "tokenizer.huggingface.json"
|
||||||
RWKV = "tokenizer.rwkv.world"
|
RWKV = "tokenizer.rwkv.world"
|
||||||
|
CHAT_TEMPLATE = "tokenizer.chat_template"
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
|
@ -91,6 +92,7 @@ class MODEL_ARCH(IntEnum):
|
||||||
BERT = auto()
|
BERT = auto()
|
||||||
BLOOM = auto()
|
BLOOM = auto()
|
||||||
STABLELM = auto()
|
STABLELM = auto()
|
||||||
|
QWEN = auto()
|
||||||
|
|
||||||
|
|
||||||
class MODEL_TENSOR(IntEnum):
|
class MODEL_TENSOR(IntEnum):
|
||||||
|
@ -131,6 +133,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.BERT: "bert",
|
MODEL_ARCH.BERT: "bert",
|
||||||
MODEL_ARCH.BLOOM: "bloom",
|
MODEL_ARCH.BLOOM: "bloom",
|
||||||
MODEL_ARCH.STABLELM: "stablelm",
|
MODEL_ARCH.STABLELM: "stablelm",
|
||||||
|
MODEL_ARCH.QWEN: "qwen",
|
||||||
}
|
}
|
||||||
|
|
||||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
|
@ -316,6 +319,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.QWEN: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_QKV,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
],
|
||||||
MODEL_ARCH.GPT2: [
|
MODEL_ARCH.GPT2: [
|
||||||
# TODO
|
# TODO
|
||||||
],
|
],
|
||||||
|
@ -335,6 +352,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_ARCH.PERSIMMON: [
|
MODEL_ARCH.PERSIMMON: [
|
||||||
MODEL_TENSOR.ROPE_FREQS,
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.QWEN: [
|
||||||
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
#
|
#
|
||||||
|
|
|
@ -221,7 +221,7 @@ class GGUFWriter:
|
||||||
if self.endianess == GGUFEndian.BIG:
|
if self.endianess == GGUFEndian.BIG:
|
||||||
tensor.byteswap(inplace=True)
|
tensor.byteswap(inplace=True)
|
||||||
if self.use_temp_file and self.temp_file is None:
|
if self.use_temp_file and self.temp_file is None:
|
||||||
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
|
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024)
|
||||||
fp.seek(0)
|
fp.seek(0)
|
||||||
self.temp_file = fp
|
self.temp_file = fp
|
||||||
|
|
||||||
|
@ -399,6 +399,9 @@ class GGUFWriter:
|
||||||
def add_add_eos_token(self, value: bool) -> None:
|
def add_add_eos_token(self, value: bool) -> None:
|
||||||
self.add_bool(Keys.Tokenizer.ADD_EOS, value)
|
self.add_bool(Keys.Tokenizer.ADD_EOS, value)
|
||||||
|
|
||||||
|
def add_chat_template(self, value: str) -> None:
|
||||||
|
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
|
||||||
|
|
||||||
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
||||||
pack_prefix = ''
|
pack_prefix = ''
|
||||||
if not skip_pack_prefix:
|
if not skip_pack_prefix:
|
||||||
|
|
|
@ -10,7 +10,7 @@ class TensorNameMap:
|
||||||
# Token embeddings
|
# Token embeddings
|
||||||
MODEL_TENSOR.TOKEN_EMBD: (
|
MODEL_TENSOR.TOKEN_EMBD: (
|
||||||
"gpt_neox.embed_in", # gptneox
|
"gpt_neox.embed_in", # gptneox
|
||||||
"transformer.wte", # gpt2 gpt-j mpt refact
|
"transformer.wte", # gpt2 gpt-j mpt refact qwen
|
||||||
"transformer.word_embeddings", # falcon
|
"transformer.word_embeddings", # falcon
|
||||||
"word_embeddings", # bloom
|
"word_embeddings", # bloom
|
||||||
"model.embed_tokens", # llama-hf
|
"model.embed_tokens", # llama-hf
|
||||||
|
@ -38,7 +38,7 @@ class TensorNameMap:
|
||||||
# Output
|
# Output
|
||||||
MODEL_TENSOR.OUTPUT: (
|
MODEL_TENSOR.OUTPUT: (
|
||||||
"embed_out", # gptneox
|
"embed_out", # gptneox
|
||||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan
|
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen
|
||||||
"output", # llama-pth bloom
|
"output", # llama-pth bloom
|
||||||
"word_embeddings_for_head", # persimmon
|
"word_embeddings_for_head", # persimmon
|
||||||
),
|
),
|
||||||
|
@ -51,7 +51,7 @@ class TensorNameMap:
|
||||||
"norm", # llama-pth
|
"norm", # llama-pth
|
||||||
"embeddings.LayerNorm", # bert
|
"embeddings.LayerNorm", # bert
|
||||||
"transformer.norm_f", # mpt
|
"transformer.norm_f", # mpt
|
||||||
"ln_f", # refact bloom
|
"ln_f", # refact bloom qwen
|
||||||
"language_model.encoder.final_layernorm", # persimmon
|
"language_model.encoder.final_layernorm", # persimmon
|
||||||
),
|
),
|
||||||
|
|
||||||
|
@ -65,7 +65,7 @@ class TensorNameMap:
|
||||||
# Attention norm
|
# Attention norm
|
||||||
MODEL_TENSOR.ATTN_NORM: (
|
MODEL_TENSOR.ATTN_NORM: (
|
||||||
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
||||||
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
|
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen
|
||||||
"transformer.blocks.{bid}.norm_1", # mpt
|
"transformer.blocks.{bid}.norm_1", # mpt
|
||||||
"transformer.h.{bid}.input_layernorm", # falcon7b
|
"transformer.h.{bid}.input_layernorm", # falcon7b
|
||||||
"h.{bid}.input_layernorm", # bloom
|
"h.{bid}.input_layernorm", # bloom
|
||||||
|
@ -85,7 +85,7 @@ class TensorNameMap:
|
||||||
# Attention query-key-value
|
# Attention query-key-value
|
||||||
MODEL_TENSOR.ATTN_QKV: (
|
MODEL_TENSOR.ATTN_QKV: (
|
||||||
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
|
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
|
||||||
"transformer.h.{bid}.attn.c_attn", # gpt2
|
"transformer.h.{bid}.attn.c_attn", # gpt2 qwen
|
||||||
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
||||||
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||||
"h.{bid}.self_attention.query_key_value", # bloom
|
"h.{bid}.self_attention.query_key_value", # bloom
|
||||||
|
@ -119,7 +119,7 @@ class TensorNameMap:
|
||||||
# Attention output
|
# Attention output
|
||||||
MODEL_TENSOR.ATTN_OUT: (
|
MODEL_TENSOR.ATTN_OUT: (
|
||||||
"gpt_neox.layers.{bid}.attention.dense", # gptneox
|
"gpt_neox.layers.{bid}.attention.dense", # gptneox
|
||||||
"transformer.h.{bid}.attn.c_proj", # gpt2 refact
|
"transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen
|
||||||
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
||||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||||
"h.{bid}.self_attention.dense", # bloom
|
"h.{bid}.self_attention.dense", # bloom
|
||||||
|
@ -139,7 +139,7 @@ class TensorNameMap:
|
||||||
# Feed-forward norm
|
# Feed-forward norm
|
||||||
MODEL_TENSOR.FFN_NORM: (
|
MODEL_TENSOR.FFN_NORM: (
|
||||||
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
||||||
"transformer.h.{bid}.ln_2", # gpt2 refact
|
"transformer.h.{bid}.ln_2", # gpt2 refact qwen
|
||||||
"h.{bid}.post_attention_layernorm", # bloom
|
"h.{bid}.post_attention_layernorm", # bloom
|
||||||
"transformer.blocks.{bid}.norm_2", # mpt
|
"transformer.blocks.{bid}.norm_2", # mpt
|
||||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
||||||
|
@ -161,18 +161,20 @@ class TensorNameMap:
|
||||||
"encoder.layer.{bid}.intermediate.dense", # bert
|
"encoder.layer.{bid}.intermediate.dense", # bert
|
||||||
"transformer.h.{bid}.mlp.fc_in", # gpt-j
|
"transformer.h.{bid}.mlp.fc_in", # gpt-j
|
||||||
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
|
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
|
||||||
|
"transformer.h.{bid}.mlp.w1", # qwen
|
||||||
),
|
),
|
||||||
|
|
||||||
# Feed-forward gate
|
# Feed-forward gate
|
||||||
MODEL_TENSOR.FFN_GATE: (
|
MODEL_TENSOR.FFN_GATE: (
|
||||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
||||||
"layers.{bid}.feed_forward.w1", # llama-pth
|
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||||
|
"transformer.h.{bid}.mlp.w2", # qwen
|
||||||
),
|
),
|
||||||
|
|
||||||
# Feed-forward down
|
# Feed-forward down
|
||||||
MODEL_TENSOR.FFN_DOWN: (
|
MODEL_TENSOR.FFN_DOWN: (
|
||||||
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
|
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
|
||||||
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact
|
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen
|
||||||
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
||||||
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
||||||
"h.{bid}.mlp.dense_4h_to_h", # bloom
|
"h.{bid}.mlp.dense_4h_to_h", # bloom
|
||||||
|
|
|
@ -13,6 +13,7 @@ class SpecialVocab:
|
||||||
merges: list[str]
|
merges: list[str]
|
||||||
add_special_token: dict[str, bool]
|
add_special_token: dict[str, bool]
|
||||||
special_token_ids: dict[str, int]
|
special_token_ids: dict[str, int]
|
||||||
|
chat_template: str | None
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, path: str | os.PathLike[str], load_merges: bool = False,
|
self, path: str | os.PathLike[str], load_merges: bool = False,
|
||||||
|
@ -24,6 +25,7 @@ class SpecialVocab:
|
||||||
self.n_vocab = n_vocab
|
self.n_vocab = n_vocab
|
||||||
self.load_merges = load_merges
|
self.load_merges = load_merges
|
||||||
self.merges = []
|
self.merges = []
|
||||||
|
self.chat_template = None
|
||||||
if special_token_types is not None:
|
if special_token_types is not None:
|
||||||
self.special_token_types = special_token_types
|
self.special_token_types = special_token_types
|
||||||
else:
|
else:
|
||||||
|
@ -67,6 +69,10 @@ class SpecialVocab:
|
||||||
if not quiet:
|
if not quiet:
|
||||||
print(f'gguf: Setting add_{typ}_token to {value}')
|
print(f'gguf: Setting add_{typ}_token to {value}')
|
||||||
add_handler(value)
|
add_handler(value)
|
||||||
|
if self.chat_template is not None:
|
||||||
|
if not quiet:
|
||||||
|
print(f'gguf: Setting chat_template to {self.chat_template}')
|
||||||
|
gw.add_chat_template(self.chat_template)
|
||||||
|
|
||||||
def _load(self, path: Path) -> None:
|
def _load(self, path: Path) -> None:
|
||||||
self._try_load_from_tokenizer_json(path)
|
self._try_load_from_tokenizer_json(path)
|
||||||
|
@ -132,6 +138,14 @@ class SpecialVocab:
|
||||||
return True
|
return True
|
||||||
with open(tokenizer_config_file, encoding = 'utf-8') as f:
|
with open(tokenizer_config_file, encoding = 'utf-8') as f:
|
||||||
tokenizer_config = json.load(f)
|
tokenizer_config = json.load(f)
|
||||||
|
chat_template = tokenizer_config.get('chat_template')
|
||||||
|
if chat_template is None or isinstance(chat_template, str):
|
||||||
|
self.chat_template = chat_template
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring',
|
||||||
|
file = sys.stderr
|
||||||
|
)
|
||||||
for typ in self.special_token_types:
|
for typ in self.special_token_types:
|
||||||
add_entry = tokenizer_config.get(f'add_{typ}_token')
|
add_entry = tokenizer_config.get(f'add_{typ}_token')
|
||||||
if isinstance(add_entry, bool):
|
if isinstance(add_entry, bool):
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "gguf"
|
name = "gguf"
|
||||||
version = "0.5.3"
|
version = "0.6.0"
|
||||||
description = "Read and write ML models in GGUF for GGML"
|
description = "Read and write ML models in GGUF for GGML"
|
||||||
authors = ["GGML <ggml@ggml.ai>"]
|
authors = ["GGML <ggml@ggml.ai>"]
|
||||||
packages = [
|
packages = [
|
||||||
|
|
92
llama.h
92
llama.h
|
@ -42,7 +42,7 @@
|
||||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||||
|
|
||||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||||
#define LLAMA_SESSION_VERSION 2
|
#define LLAMA_SESSION_VERSION 3
|
||||||
|
|
||||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN)
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN)
|
||||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||||
|
@ -158,6 +158,22 @@ extern "C" {
|
||||||
llama_seq_id all_seq_id; // used if seq_id == NULL
|
llama_seq_id all_seq_id; // used if seq_id == NULL
|
||||||
} llama_batch;
|
} llama_batch;
|
||||||
|
|
||||||
|
enum llama_model_kv_override_type {
|
||||||
|
LLAMA_KV_OVERRIDE_INT,
|
||||||
|
LLAMA_KV_OVERRIDE_FLOAT,
|
||||||
|
LLAMA_KV_OVERRIDE_BOOL,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_model_kv_override {
|
||||||
|
char key[128];
|
||||||
|
enum llama_model_kv_override_type tag;
|
||||||
|
union {
|
||||||
|
int64_t int_value;
|
||||||
|
double float_value;
|
||||||
|
bool bool_value;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
struct llama_model_params {
|
struct llama_model_params {
|
||||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||||
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
||||||
|
@ -165,9 +181,13 @@ extern "C" {
|
||||||
|
|
||||||
// called with a progress value between 0 and 1, pass NULL to disable
|
// called with a progress value between 0 and 1, pass NULL to disable
|
||||||
llama_progress_callback progress_callback;
|
llama_progress_callback progress_callback;
|
||||||
|
|
||||||
// context pointer passed to the progress callback
|
// context pointer passed to the progress callback
|
||||||
void * progress_callback_user_data;
|
void * progress_callback_user_data;
|
||||||
|
|
||||||
|
// override key-value pairs of the model meta data
|
||||||
|
const struct llama_model_kv_override * kv_overrides;
|
||||||
|
|
||||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||||
bool vocab_only; // only load the vocabulary, no weights
|
bool vocab_only; // only load the vocabulary, no weights
|
||||||
bool use_mmap; // use mmap if possible
|
bool use_mmap; // use mmap if possible
|
||||||
|
@ -185,17 +205,20 @@ extern "C" {
|
||||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
||||||
float rope_freq_base; // RoPE base frequency, 0 = from model
|
float rope_freq_base; // RoPE base frequency, 0 = from model
|
||||||
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
|
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
|
||||||
float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model
|
float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
|
||||||
float yarn_attn_factor; // YaRN magnitude scaling factor
|
float yarn_attn_factor; // YaRN magnitude scaling factor
|
||||||
float yarn_beta_fast; // YaRN low correction dim
|
float yarn_beta_fast; // YaRN low correction dim
|
||||||
float yarn_beta_slow; // YaRN high correction dim
|
float yarn_beta_slow; // YaRN high correction dim
|
||||||
uint32_t yarn_orig_ctx; // YaRN original context size
|
uint32_t yarn_orig_ctx; // YaRN original context size
|
||||||
|
|
||||||
|
enum ggml_type type_k; // data type for K cache
|
||||||
|
enum ggml_type type_v; // data type for V cache
|
||||||
|
|
||||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||||
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
||||||
bool f16_kv; // use fp16 for KV cache, fp32 otherwise
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
||||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
bool embedding; // embedding mode only
|
||||||
bool embedding; // embedding mode only
|
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||||
};
|
};
|
||||||
|
|
||||||
// model quantization parameters
|
// model quantization parameters
|
||||||
|
@ -361,9 +384,60 @@ extern "C" {
|
||||||
// KV cache
|
// KV cache
|
||||||
//
|
//
|
||||||
|
|
||||||
// Returns the number of tokens in the KV cache
|
// Information associated with an individual cell in the KV cache view.
|
||||||
LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
|
struct llama_kv_cache_view_cell {
|
||||||
"avoid using this, it will be removed in the future, instead - count the tokens in user code");
|
// The position for this cell. Takes KV cache shifts into account.
|
||||||
|
// May be negative if the cell is not populated.
|
||||||
|
llama_pos pos;
|
||||||
|
};
|
||||||
|
|
||||||
|
// An updateable view of the KV cache.
|
||||||
|
struct llama_kv_cache_view {
|
||||||
|
// Number of KV cache cells. This will be the same as the context size.
|
||||||
|
int32_t n_cells;
|
||||||
|
|
||||||
|
// Maximum number of sequences that can exist in a cell. It's not an error
|
||||||
|
// if there are more sequences in a cell than this value, however they will
|
||||||
|
// not be visible in the view cells_sequences.
|
||||||
|
int32_t n_max_seq;
|
||||||
|
|
||||||
|
// Number of tokens in the cache. For example, if there are two populated
|
||||||
|
// cells, the first with 1 sequence id in it and the second with 2 sequence
|
||||||
|
// ids then you'll have 3 tokens.
|
||||||
|
int32_t token_count;
|
||||||
|
|
||||||
|
// Number of populated cache cells.
|
||||||
|
int32_t used_cells;
|
||||||
|
|
||||||
|
// Maximum contiguous empty slots in the cache.
|
||||||
|
int32_t max_contiguous;
|
||||||
|
|
||||||
|
// Index to the start of the max_contiguous slot range. Can be negative
|
||||||
|
// when cache is full.
|
||||||
|
int32_t max_contiguous_idx;
|
||||||
|
|
||||||
|
// Information for an individual cell.
|
||||||
|
struct llama_kv_cache_view_cell * cells;
|
||||||
|
|
||||||
|
// The sequences for each cell. There will be n_max_seq items per cell.
|
||||||
|
llama_seq_id * cells_sequences;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Create an empty KV cache view. (use only for debugging purposes)
|
||||||
|
LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
|
||||||
|
|
||||||
|
// Free a KV cache view. (use only for debugging purposes)
|
||||||
|
LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
|
||||||
|
|
||||||
|
// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
|
||||||
|
LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
|
||||||
|
|
||||||
|
// Returns the number of tokens in the KV cache (slow, use only for debug)
|
||||||
|
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
|
||||||
|
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
|
||||||
|
|
||||||
|
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
|
||||||
|
LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
|
||||||
|
|
||||||
// Clear the KV cache
|
// Clear the KV cache
|
||||||
LLAMA_API void llama_kv_cache_clear(
|
LLAMA_API void llama_kv_cache_clear(
|
||||||
|
|
1
prompts/chat-with-qwen.txt
Normal file
1
prompts/chat-with-qwen.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
You are a helpful assistant.
|
3
requirements-hf-to-gguf.txt
Normal file
3
requirements-hf-to-gguf.txt
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
-r requirements.txt
|
||||||
|
torch==2.1.1
|
||||||
|
transformers==4.35.2
|
|
@ -1,5 +1,3 @@
|
||||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
|
||||||
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
|
||||||
set(BUILD_NUMBER 0)
|
set(BUILD_NUMBER 0)
|
||||||
set(BUILD_COMMIT "unknown")
|
set(BUILD_COMMIT "unknown")
|
||||||
set(BUILD_COMPILER "unknown")
|
set(BUILD_COMPILER "unknown")
|
||||||
|
@ -58,23 +56,3 @@ else()
|
||||||
)
|
)
|
||||||
set(BUILD_TARGET ${OUT})
|
set(BUILD_TARGET ${OUT})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Only write the build info if it changed
|
|
||||||
if(EXISTS ${OUTPUT_FILE})
|
|
||||||
file(READ ${OUTPUT_FILE} CONTENTS)
|
|
||||||
string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
|
|
||||||
set(OLD_COMMIT ${CMAKE_MATCH_1})
|
|
||||||
string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
|
|
||||||
set(OLD_COMPILER ${CMAKE_MATCH_1})
|
|
||||||
string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
|
|
||||||
set(OLD_TARGET ${CMAKE_MATCH_1})
|
|
||||||
if (
|
|
||||||
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
|
|
||||||
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
|
|
||||||
NOT OLD_TARGET STREQUAL BUILD_TARGET
|
|
||||||
)
|
|
||||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
|
||||||
endif()
|
|
||||||
else()
|
|
||||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
|
||||||
endif()
|
|
||||||
|
|
24
scripts/gen-build-info-cpp.cmake
Normal file
24
scripts/gen-build-info-cpp.cmake
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
||||||
|
|
||||||
|
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
||||||
|
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
||||||
|
|
||||||
|
# Only write the build info if it changed
|
||||||
|
if(EXISTS ${OUTPUT_FILE})
|
||||||
|
file(READ ${OUTPUT_FILE} CONTENTS)
|
||||||
|
string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
|
||||||
|
set(OLD_COMMIT ${CMAKE_MATCH_1})
|
||||||
|
string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
|
||||||
|
set(OLD_COMPILER ${CMAKE_MATCH_1})
|
||||||
|
string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
|
||||||
|
set(OLD_TARGET ${CMAKE_MATCH_1})
|
||||||
|
if (
|
||||||
|
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
|
||||||
|
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
|
||||||
|
NOT OLD_TARGET STREQUAL BUILD_TARGET
|
||||||
|
)
|
||||||
|
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||||
|
endif()
|
||||||
|
else()
|
||||||
|
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||||
|
endif()
|
|
@ -20,5 +20,6 @@ cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
|
||||||
cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h
|
cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h
|
||||||
cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
|
cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
|
||||||
|
|
||||||
cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
|
cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
|
||||||
cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp
|
cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp
|
||||||
|
cp -rpv ../ggml/tests/test-backend-ops.cpp ./tests/test-backend-ops.cpp
|
||||||
|
|
|
@ -22,26 +22,32 @@ endfunction()
|
||||||
llama_build_and_test_executable(test-quantize-fns.cpp)
|
llama_build_and_test_executable(test-quantize-fns.cpp)
|
||||||
llama_build_and_test_executable(test-quantize-perf.cpp)
|
llama_build_and_test_executable(test-quantize-perf.cpp)
|
||||||
llama_build_and_test_executable(test-sampling.cpp)
|
llama_build_and_test_executable(test-sampling.cpp)
|
||||||
|
|
||||||
llama_build_executable(test-tokenizer-0-llama.cpp)
|
llama_build_executable(test-tokenizer-0-llama.cpp)
|
||||||
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||||
|
|
||||||
llama_build_executable(test-tokenizer-0-falcon.cpp)
|
llama_build_executable(test-tokenizer-0-falcon.cpp)
|
||||||
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
|
|
||||||
llama_build_executable(test-tokenizer-1-llama.cpp)
|
llama_build_executable(test-tokenizer-1-llama.cpp)
|
||||||
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
llama_test_executable (test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
||||||
|
|
||||||
llama_build_executable(test-tokenizer-1-bpe.cpp)
|
llama_build_executable(test-tokenizer-1-bpe.cpp)
|
||||||
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
llama_test_executable (test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
llama_test_executable (test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
|
llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
llama_test_executable (test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
llama_test_executable (test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
llama_test_executable (test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||||
# llama_test_executable(test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
|
# llama_test_executable (test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
|
||||||
|
|
||||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||||
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
llama_build_and_test_executable(test-grad0.cpp)
|
||||||
# llama_build_and_test_executable(test-opt.cpp) # SLOW
|
# llama_build_and_test_executable(test-opt.cpp) # SLOW
|
||||||
|
llama_build_and_test_executable(test-backend-ops.cpp)
|
||||||
|
|
||||||
llama_build_and_test_executable(test-rope.cpp)
|
llama_build_and_test_executable(test-rope.cpp)
|
||||||
|
|
||||||
|
|
1357
tests/test-backend-ops.cpp
Normal file
1357
tests/test-backend-ops.cpp
Normal file
File diff suppressed because it is too large
Load diff
|
@ -14,34 +14,34 @@ dir_tokenizer = args.dir_tokenizer
|
||||||
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
|
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
|
||||||
|
|
||||||
tests = [
|
tests = [
|
||||||
"",
|
"",
|
||||||
" ",
|
" ",
|
||||||
" ",
|
" ",
|
||||||
" ",
|
" ",
|
||||||
"\t",
|
"\t",
|
||||||
"\n",
|
"\n",
|
||||||
"\t\n",
|
"\t\n",
|
||||||
"Hello world",
|
"Hello world",
|
||||||
" Hello world",
|
" Hello world",
|
||||||
"Hello World",
|
"Hello World",
|
||||||
" Hello World",
|
" Hello World",
|
||||||
" Hello World!",
|
" Hello World!",
|
||||||
"Hello, world!",
|
"Hello, world!",
|
||||||
" Hello, world!",
|
" Hello, world!",
|
||||||
" this is 🦙.cpp",
|
" this is 🦙.cpp",
|
||||||
"w048 7tuijk dsdfhu",
|
"w048 7tuijk dsdfhu",
|
||||||
"нещо на Български",
|
"нещо на Български",
|
||||||
"កាន់តែពិសេសអាចខលចេញ",
|
"កាន់តែពិសេសអាចខលចេញ",
|
||||||
"🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
|
"🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
|
||||||
"Hello",
|
"Hello",
|
||||||
" Hello",
|
" Hello",
|
||||||
" Hello",
|
" Hello",
|
||||||
" Hello",
|
" Hello",
|
||||||
" Hello",
|
" Hello",
|
||||||
" Hello\n Hello",
|
" Hello\n Hello",
|
||||||
"\n =",
|
"\n =",
|
||||||
"' era",
|
"' era",
|
||||||
]
|
]
|
||||||
|
|
||||||
for text in tests:
|
for text in tests:
|
||||||
print('text: ', text)
|
print('text: ', text)
|
||||||
|
|
|
@ -14,32 +14,32 @@ dir_tokenizer = args.dir_tokenizer
|
||||||
tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
|
tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
|
||||||
|
|
||||||
tests = [
|
tests = [
|
||||||
"",
|
"",
|
||||||
" ",
|
" ",
|
||||||
" ",
|
" ",
|
||||||
" ",
|
" ",
|
||||||
"\t",
|
"\t",
|
||||||
"\n",
|
"\n",
|
||||||
"\t\n",
|
"\t\n",
|
||||||
"Hello world",
|
"Hello world",
|
||||||
" Hello world",
|
" Hello world",
|
||||||
"Hello World",
|
"Hello World",
|
||||||
" Hello World",
|
" Hello World",
|
||||||
" Hello World!",
|
" Hello World!",
|
||||||
"Hello, world!",
|
"Hello, world!",
|
||||||
" Hello, world!",
|
" Hello, world!",
|
||||||
" this is 🦙.cpp",
|
" this is 🦙.cpp",
|
||||||
"w048 7tuijk dsdfhu",
|
"w048 7tuijk dsdfhu",
|
||||||
"нещо на Български",
|
"нещо на Български",
|
||||||
"កាន់តែពិសេសអាចខលចេញ",
|
"កាន់តែពិសេសអាចខលចេញ",
|
||||||
"🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
|
"🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
|
||||||
"Hello",
|
"Hello",
|
||||||
" Hello",
|
" Hello",
|
||||||
" Hello",
|
" Hello",
|
||||||
" Hello",
|
" Hello",
|
||||||
" Hello",
|
" Hello",
|
||||||
" Hello\n Hello",
|
" Hello\n Hello",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
for text in tests:
|
for text in tests:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue