Merge branch 'ggerganov:master' into master
commit 4389bdac81
70 changed files with 16617 additions and 3228 deletions
.github/workflows/build.yml (vendored, 5 changes)

@@ -276,6 +276,11 @@ jobs:
        run: |
          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
 
+      - name: Build Swift Example
+        id: make_build_swift_example
+        run: |
+          make swift
+
   windows-latest-cmake:
     runs-on: windows-latest
.gitignore (vendored, 2 changes)

@@ -44,6 +44,7 @@ models-mnt
 /infill
 /libllama.so
 /llama-bench
+/llava
 /main
 /metal
 /perplexity
@@ -55,6 +56,7 @@ models-mnt
 /server
 /simple
 /batched
+/batched-bench
 /export-lora
 /finetune
 /speculative
CMakeLists.txt

@@ -422,8 +422,7 @@ endif()
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
         set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
-                    -Werror=implicit-function-declaration)
+        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
         set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
         set(host_cxx_flags "")
@@ -455,7 +454,8 @@ if (LLAMA_ALL_WARNINGS)
     set(c_flags ${c_flags} ${warning_flags})
     set(cxx_flags ${cxx_flags} ${warning_flags})
     add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
-                        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags} ${host_cxx_flags}>")
+                        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
+                        "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
 
 endif()
Makefile (109 changes)

@@ -1,8 +1,14 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
+BUILD_TARGETS = \
+	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+	simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search \
+	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
 
 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
+TEST_TARGETS = \
+	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
+	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -172,6 +178,24 @@ else
 MK_CPPFLAGS += -DNDEBUG
 endif
 
+ifdef LLAMA_SANITIZE_THREAD
+	MK_CFLAGS   += -fsanitize=thread -g
+	MK_CXXFLAGS += -fsanitize=thread -g
+	MK_LDFLAGS  += -fsanitize=thread -g
+endif
+
+ifdef LLAMA_SANITIZE_ADDRESS
+	MK_CFLAGS   += -fsanitize=address -fno-omit-frame-pointer -g
+	MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
+	MK_LDFLAGS  += -fsanitize=address -fno-omit-frame-pointer -g
+endif
+
+ifdef LLAMA_SANITIZE_UNDEFINED
+	MK_CFLAGS   += -fsanitize=undefined -g
+	MK_CXXFLAGS += -fsanitize=undefined -g
+	MK_LDFLAGS  += -fsanitize=undefined -g
+endif
+
 ifdef LLAMA_SERVER_VERBOSE
 	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif
@@ -520,7 +544,13 @@ OBJS += ggml-alloc.o ggml-backend.o
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-common.o: common/common.cpp common/common.h build-info.h common/log.h
+COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
+COMMON_DEPS   = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o
+
+common.o: common/common.cpp $(COMMON_H_DEPS)
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 console.o: common/console.cpp common/console.h
@@ -542,19 +572,22 @@ clean:
 # Examples
 #
 
-main: examples/main/main.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-infill: examples/infill/infill.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
+infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-batched: examples/batched/batched.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
@@ -563,53 +596,56 @@ quantize: examples/quantize/quantize.cpp build-info.h ggml.
 quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
-$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
 
-embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS)
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o train.o $(OBJS)
+llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
+baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS)
+finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 ifdef LLAMA_METAL
@@ -617,6 +653,11 @@ metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 endif
 
+ifeq ($(UNAME_S),Darwin)
+swift: examples/batched.swift
+	(cd examples/batched.swift; make build)
+endif
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh $(CC) > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
@@ -637,7 +678,7 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 run-benchmark-matmult: benchmark-matmult
 	./$@
 
-.PHONY: run-benchmark-matmult
+.PHONY: run-benchmark-matmult swift
 
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@@ -645,40 +686,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 tests/test-c.o: tests/test-c.c llama.h
Package.swift

@@ -1,10 +1,10 @@
-// swift-tools-version:5.3
+// swift-tools-version:5.5
 
 import PackageDescription
 
 #if arch(arm) || arch(arm64)
 let platforms: [SupportedPlatform]? = [
-    .macOS(.v11),
+    .macOS(.v12),
     .iOS(.v14),
     .watchOS(.v4),
     .tvOS(.v14)
@@ -41,12 +41,13 @@ let package = Package(
             "ggml.c",
             "llama.cpp",
             "ggml-alloc.c",
+            "ggml-backend.c",
             "k_quants.c",
         ] + additionalSources,
         resources: resources,
         publicHeadersPath: "spm-headers",
         cSettings: [
-            .unsafeFlags(["-Wno-shorten-64-to-32"]),
+            .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
             .define("GGML_USE_K_QUANTS"),
             .define("GGML_USE_ACCELERATE")
             // NOTE: NEW_LAPACK will required iOS version 16.4+
|
|
34
README.md
34
README.md
|
@ -11,12 +11,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||||
|
|
||||||
### Hot topics
|
### Hot topics
|
||||||
|
|
||||||
- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
|
- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436
|
||||||
- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
|
- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
|
||||||
**Devs should become familiar with the new API**
|
|
||||||
- Local Falcon 180B inference on Mac Studio
|
|
||||||
|
|
||||||
https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e
|
|
||||||
|
|
||||||
----
|
----
|
||||||
|
|
||||||
|
@ -89,13 +85,17 @@ as the main playground for developing new features for the [ggml](https://github
|
||||||
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
|
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
|
||||||
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
|
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
|
||||||
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
|
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
|
||||||
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
|
- [X] [Pygmalion/Metharme](#using-pygmalion-7b--metharme-7b)
|
||||||
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
|
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
|
||||||
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
|
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
|
||||||
- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
|
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
|
||||||
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
|
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
|
||||||
- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
|
- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
|
||||||
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
|
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
|
||||||
|
- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
|
||||||
|
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
|
||||||
|
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
|
||||||
|
|
||||||
|
|
||||||
**Bindings:**
|
**Bindings:**
|
||||||
|
|
||||||
|
@ -204,7 +204,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
Here are the steps for the LLaMA-7B model.
|
Here are the end-to-end binary build and model conversion steps for the LLaMA-7B model.
|
||||||
|
|
||||||
### Get the Code
|
### Get the Code
|
||||||
|
|
||||||
|
@ -277,7 +277,7 @@ In order to build llama.cpp you have three different options.
|
||||||
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
|
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
|
||||||
To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
|
To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
|
||||||
|
|
||||||
When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line
|
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
|
||||||
argument.
|
argument.
|
||||||
|
|
||||||
### MPI Build
|
### MPI Build
|
||||||
|
@ -571,6 +571,18 @@ python3 convert.py models/7B/
|
||||||
|
|
||||||
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
||||||
|
|
||||||
|
### Running on Windows with prebuilt binaries
|
||||||
|
|
||||||
|
You will find prebuilt Windows binaries on the release page.
|
||||||
|
|
||||||
|
Simply download and extract the latest zip package of choice: (e.g. `llama-b1380-bin-win-avx2-x64.zip`)
|
||||||
|
|
||||||
|
From the unzipped folder, open a terminal/cmd window here and place a pre-converted `.gguf` model file. Test out the main example like so:
|
||||||
|
|
||||||
|
```
|
||||||
|
.\main -m llama-2-7b.Q4_0.gguf -n 128
|
||||||
|
```
|
||||||
|
|
||||||
### Memory/Disk Requirements
|
### Memory/Disk Requirements
|
||||||
|
|
||||||
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
|
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
|
||||||
|
build.zig

@@ -128,17 +128,18 @@ pub fn build(b: *std.build.Builder) !void {
     const llama = make.obj("llama", "llama.cpp");
     const common = make.obj("common", "common/common.cpp");
     const console = make.obj("console", "common/console.cpp");
+    const sampling = make.obj("sampling", "common/sampling.cpp");
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
     const train = make.obj("train", "common/train.cpp");
 
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, console, grammar_parser });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
     _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
     _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
     _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
     _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, grammar_parser });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
ci/run.sh (16 changes)

@@ -208,6 +208,8 @@ function gg_run_open_llama_3b_v2 {
     (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
     (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
+    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+
     function check_ppl {
         qnt="$1"
         ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
@@ -296,6 +298,7 @@ function gg_sum_open_llama_3b_v2 {
     gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
     gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
     gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
     gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
     gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
     gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
@@ -382,6 +385,8 @@ function gg_run_open_llama_7b_v2 {
     (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
     (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
+    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+
     function check_ppl {
         qnt="$1"
         ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
@@ -470,6 +475,7 @@ function gg_sum_open_llama_7b_v2 {
     gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
     gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
     gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
     gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
     gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
     #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
@@ -496,10 +502,12 @@ test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
-    if [ -z ${GG_BUILD_CUDA} ]; then
-        test $ret -eq 0 && gg_run open_llama_3b_v2
-    else
-        test $ret -eq 0 && gg_run open_llama_7b_v2
+    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
+        if [ -z ${GG_BUILD_CUDA} ]; then
+            test $ret -eq 0 && gg_run open_llama_3b_v2
+        else
+            test $ret -eq 0 && gg_run open_llama_7b_v2
+        fi
     fi
 fi
common/CMakeLists.txt

@@ -5,6 +5,8 @@ set(TARGET common)
 add_library(${TARGET} OBJECT
     common.h
     common.cpp
+    sampling.h
+    sampling.cpp
     console.h
     console.cpp
     grammar-parser.h
common/common.cpp

@@ -107,6 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     std::string arg;
     gpt_params default_params;
     const std::string arg_prefix = "--";
+    llama_sampling_params & sparams = params.sampling_params;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -184,7 +185,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.top_k = std::stoi(argv[i]);
+            sparams.top_k = std::stoi(argv[i]);
         } else if (arg == "-c" || arg == "--ctx-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -216,73 +217,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.top_p = std::stof(argv[i]);
+            sparams.top_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.temp = std::stof(argv[i]);
+            sparams.temp = std::stof(argv[i]);
         } else if (arg == "--tfs") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.tfs_z = std::stof(argv[i]);
+            sparams.tfs_z = std::stof(argv[i]);
         } else if (arg == "--typical") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.typical_p = std::stof(argv[i]);
+            sparams.typical_p = std::stof(argv[i]);
         } else if (arg == "--repeat-last-n") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_last_n = std::stoi(argv[i]);
+            sparams.repeat_last_n = std::stoi(argv[i]);
         } else if (arg == "--repeat-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_penalty = std::stof(argv[i]);
+            sparams.repeat_penalty = std::stof(argv[i]);
        } else if (arg == "--frequency-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.frequency_penalty = std::stof(argv[i]);
+            sparams.frequency_penalty = std::stof(argv[i]);
         } else if (arg == "--presence-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.presence_penalty = std::stof(argv[i]);
+            sparams.presence_penalty = std::stof(argv[i]);
         } else if (arg == "--mirostat") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat = std::stoi(argv[i]);
+            sparams.mirostat = std::stoi(argv[i]);
         } else if (arg == "--mirostat-lr") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat_eta = std::stof(argv[i]);
+            sparams.mirostat_eta = std::stof(argv[i]);
         } else if (arg == "--mirostat-ent") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat_tau = std::stof(argv[i]);
+            sparams.mirostat_tau = std::stof(argv[i]);
         } else if (arg == "--cfg-negative-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.cfg_negative_prompt = argv[i];
+            sparams.cfg_negative_prompt = argv[i];
         } else if (arg == "--cfg-negative-prompt-file") {
             if (++i >= argc) {
                 invalid_param = true;
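As a side note (not part of the commit), here is a sketch of what the migration in the hunk above means at call sites: the sampling knobs now live on the nested `llama_sampling_params` struct instead of directly on `gpt_params`. The field values below are arbitrary examples.

```cpp
// Hypothetical caller-side view of the gpt_params -> sampling_params move.
#include "common.h"

void configure_sampling(gpt_params & params) {
    llama_sampling_params & sparams = params.sampling_params;

    sparams.top_k          = 40;     // was: params.top_k
    sparams.top_p          = 0.95f;  // was: params.top_p
    sparams.temp           = 0.70f;  // was: params.temp
    sparams.repeat_penalty = 1.10f;  // was: params.repeat_penalty
}
```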
@@ -294,16 +295,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
-            if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
-                params.cfg_negative_prompt.pop_back();
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
+            if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
+                sparams.cfg_negative_prompt.pop_back();
             }
         } else if (arg == "--cfg-scale") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.cfg_scale = std::stof(argv[i]);
+            sparams.cfg_scale = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -383,6 +384,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
+        } else if (arg == "--mmproj") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.mmproj = argv[i];
+        } else if (arg == "--image") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.image = argv[i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--embedding") {
@@ -512,7 +525,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
-            params.penalize_nl = false;
+            sparams.penalize_nl = false;
         } else if (arg == "-l" || arg == "--logit-bias") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -524,7 +537,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             std::string value_str;
             try {
                 if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                    params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                    sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
                 } else {
                     throw std::exception();
                 }
@@ -627,6 +640,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    const llama_sampling_params & sparams = params.sampling_params;
+
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
     printf("options:\n");
@@ -659,19 +674,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
+    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
+    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
+    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
+    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
+    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
+    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
     printf("  --mirostat N          use Mirostat sampling.\n");
     printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
+    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
+    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
     printf("  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
     printf("                        modifies the likelihood of token appearing in the completion,\n");
     printf("                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
@@ -682,7 +697,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        negative prompt to use for guidance. (default: empty)\n");
     printf("  --cfg-negative-prompt-file FNAME\n");
     printf("                        negative prompt file to use for guidance. (default: empty)\n");
-    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
     printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
     printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
     printf("  --rope-freq-scale N   RoPE frequency linear scaling factor (default: loaded from model)\n");
@@ -690,7 +705,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --no-penalize-nl      do not penalize newline token\n");
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
-    printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
     printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
     printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
@@ -700,6 +715,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
     printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
     printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
+    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
     if (llama_mlock_supported()) {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
@@ -803,6 +820,27 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     return cparams;
 }
 
+void llama_batch_clear(struct llama_batch & batch) {
+    batch.n_tokens = 0;
+}
+
+void llama_batch_add(
+                 struct llama_batch & batch,
+                        llama_token   id,
+                          llama_pos   pos,
+    const std::vector<llama_seq_id> & seq_ids,
+                               bool   logits) {
+    batch.token   [batch.n_tokens] = id;
+    batch.pos     [batch.n_tokens] = pos,
+    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
+    for (size_t i = 0; i < seq_ids.size(); ++i) {
+        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
+    }
+    batch.logits  [batch.n_tokens] = logits;
+
+    batch.n_tokens++;
+}
+
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
     auto mparams = llama_model_params_from_gpt_params(params);
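A minimal usage sketch (not part of the commit) of the two helpers added above. The `llama_batch_init`/`llama_batch_free`/`llama_decode` calls and the single-sequence setup are assumptions made for illustration only.

```cpp
// Hypothetical example: fill a llama_batch with a prompt using the new common
// helpers llama_batch_clear() and llama_batch_add(), then decode it.
#include <vector>
#include "common.h"
#include "llama.h"

static bool decode_prompt(llama_context * ctx, const std::vector<llama_token> & prompt) {
    // assumption: batch sized for the whole prompt, one sequence (seq id 0)
    llama_batch batch = llama_batch_init((int) prompt.size(), 0, 1);

    llama_batch_clear(batch);
    for (size_t i = 0; i < prompt.size(); ++i) {
        // position i in sequence 0; request logits only for the last token
        llama_batch_add(batch, prompt[i], (llama_pos) i, { 0 }, i + 1 == prompt.size());
    }

    const bool ok = llama_decode(ctx, batch) == 0;
    llama_batch_free(batch);
    return ok;
}
```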
@@ -840,7 +878,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     }
 
     if (params.ignore_eos) {
-        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+        params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
     }
 
     {
@@ -862,21 +900,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 std::vector<llama_token> llama_tokenize(
     const struct llama_context * ctx,
     const std::string & text,
-    bool add_bos) {
-    return llama_tokenize(llama_get_model(ctx), text, add_bos);
+    bool add_bos,
+    bool special) {
+    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
 }
 
 std::vector<llama_token> llama_tokenize(
     const struct llama_model * model,
     const std::string & text,
-    bool add_bos) {
+    bool add_bos,
+    bool special) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + add_bos;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
||||||
|
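A short, hedged illustration of the new special flag: when it is set, spellings of special/control tokens inside the text can resolve to their token ids instead of being tokenized as plain characters. The prompt string below is illustrative only; "<s>" stands for whatever special-token spelling the loaded model's vocabulary actually defines.

    // Sketch: assumes a valid llama_context * ctx.
    const std::string prompt = "<s>Describe the image.";

    // default (special == false): "<s>" is treated as ordinary text
    std::vector<llama_token> plain    = llama_tokenize(ctx, prompt, /*add_bos=*/true);

    // special == true: special-token spellings in the text may map to their token ids
    std::vector<llama_token> expanded = llama_tokenize(ctx, prompt, /*add_bos=*/true, /*special=*/true);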
@ -932,127 +972,6 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// Sampling utils
|
|
||||||
//
|
|
||||||
|
|
||||||
llama_token llama_sample_token(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
struct llama_context * ctx_guidance,
|
|
||||||
struct llama_grammar * grammar,
|
|
||||||
const struct gpt_params & params,
|
|
||||||
const std::vector<llama_token> & last_tokens,
|
|
||||||
std::vector<llama_token_data> & candidates,
|
|
||||||
int idx) {
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
|
||||||
|
|
||||||
const float temp = params.temp;
|
|
||||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
|
||||||
const float top_p = params.top_p;
|
|
||||||
const float tfs_z = params.tfs_z;
|
|
||||||
const float typical_p = params.typical_p;
|
|
||||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
|
||||||
const float repeat_penalty = params.repeat_penalty;
|
|
||||||
const float alpha_presence = params.presence_penalty;
|
|
||||||
const float alpha_frequency = params.frequency_penalty;
|
|
||||||
const int mirostat = params.mirostat;
|
|
||||||
const float mirostat_tau = params.mirostat_tau;
|
|
||||||
const float mirostat_eta = params.mirostat_eta;
|
|
||||||
const bool penalize_nl = params.penalize_nl;
|
|
||||||
|
|
||||||
llama_token id = 0;
|
|
||||||
|
|
||||||
float * logits = llama_get_logits_ith(ctx, idx);
|
|
||||||
|
|
||||||
// Apply params.logit_bias map
|
|
||||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
|
||||||
logits[it->first] += it->second;
|
|
||||||
}
|
|
||||||
|
|
||||||
candidates.clear();
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
|
|
||||||
|
|
||||||
if (ctx_guidance) {
|
|
||||||
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
|
|
||||||
}
|
|
||||||
|
|
||||||
// apply penalties
|
|
||||||
if (!last_tokens.empty()) {
|
|
||||||
const float nl_logit = logits[llama_token_nl(ctx)];
|
|
||||||
const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
|
|
||||||
|
|
||||||
llama_sample_repetition_penalty(ctx, &cur_p,
|
|
||||||
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
|
||||||
last_n_repeat, repeat_penalty);
|
|
||||||
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
|
|
||||||
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
|
||||||
last_n_repeat, alpha_frequency, alpha_presence);
|
|
||||||
|
|
||||||
if (!penalize_nl) {
|
|
||||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
|
||||||
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
|
|
||||||
cur_p.data[idx].logit = nl_logit;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (grammar != NULL) {
|
|
||||||
llama_sample_grammar(ctx, &cur_p, grammar);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (temp <= 0) {
|
|
||||||
// Greedy sampling
|
|
||||||
id = llama_sample_token_greedy(ctx, &cur_p);
|
|
||||||
} else {
|
|
||||||
if (mirostat == 1) {
|
|
||||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
|
||||||
const int mirostat_m = 100;
|
|
||||||
llama_sample_temp(ctx, &cur_p, temp);
|
|
||||||
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
|
||||||
} else if (mirostat == 2) {
|
|
||||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
|
||||||
llama_sample_temp(ctx, &cur_p, temp);
|
|
||||||
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
|
||||||
} else {
|
|
||||||
// Temperature sampling
|
|
||||||
size_t min_keep = std::max(1, params.n_probs);
|
|
||||||
llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
|
|
||||||
llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
|
|
||||||
llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
|
|
||||||
llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
|
|
||||||
llama_sample_temp(ctx, &cur_p, temp);
|
|
||||||
|
|
||||||
{
|
|
||||||
const int n_top = 10;
|
|
||||||
LOG("top %d candidates:\n", n_top);
|
|
||||||
|
|
||||||
for (int i = 0; i < n_top; i++) {
|
|
||||||
const llama_token id = cur_p.data[i].id;
|
|
||||||
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
id = llama_sample_token(ctx, &cur_p);
|
|
||||||
|
|
||||||
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// printf("`%d`", candidates_p.size);
|
|
||||||
|
|
||||||
if (grammar != NULL) {
|
|
||||||
llama_grammar_accept_token(ctx, grammar, id);
|
|
||||||
}
|
|
||||||
|
|
||||||
return id;
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// YAML utils
|
// YAML utils
|
||||||
//
|
//
|
||||||
|
@ -1204,6 +1123,8 @@ std::string get_sortable_timestamp() {
|
||||||
|
|
||||||
void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||||
|
const llama_sampling_params & sparams = params.sampling_params;
|
||||||
|
|
||||||
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
|
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
|
||||||
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
|
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
|
||||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||||
|
@ -1250,21 +1171,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
|
|
||||||
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
||||||
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
||||||
dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
|
dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
|
||||||
fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
|
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
|
||||||
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
||||||
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
||||||
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
||||||
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
||||||
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
||||||
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
|
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
|
||||||
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
|
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
|
||||||
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
||||||
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
||||||
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
||||||
|
|
||||||
const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
|
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
|
||||||
const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
||||||
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
||||||
|
|
||||||
dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
||||||
|
@ -1277,7 +1198,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
||||||
|
|
||||||
fprintf(stream, "logit_bias:\n");
|
fprintf(stream, "logit_bias:\n");
|
||||||
for (std::pair<llama_token, float> lb : params.logit_bias) {
|
for (std::pair<llama_token, float> lb : sparams.logit_bias) {
|
||||||
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -1301,30 +1222,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
||||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
||||||
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
||||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
|
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
||||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
|
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
||||||
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
|
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
||||||
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
||||||
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
|
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
|
||||||
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
||||||
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
||||||
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
||||||
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
||||||
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
|
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
|
||||||
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
||||||
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
|
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
|
||||||
fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
|
fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
|
||||||
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
|
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
|
||||||
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
||||||
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
||||||
fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
|
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
|
||||||
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
||||||
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
||||||
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
||||||
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
||||||
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
||||||
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
||||||
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
|
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
|
||||||
|
|
||||||
fprintf(stream, "reverse_prompt:\n");
|
fprintf(stream, "reverse_prompt:\n");
|
||||||
for (std::string ap : params.antiprompt) {
|
for (std::string ap : params.antiprompt) {
|
||||||
|
@ -1342,15 +1263,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
|
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
|
||||||
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
||||||
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
||||||
fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
|
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
||||||
|
|
||||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
||||||
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
||||||
|
|
||||||
fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
|
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
||||||
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
|
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
|
||||||
fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
|
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||||
fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
|
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||||
fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
|
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
|
||||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,6 +4,8 @@
|
||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include "sampling.h"
|
||||||
|
|
||||||
#define LOG_NO_FILE_LINE_FUNCTION
|
#define LOG_NO_FILE_LINE_FUNCTION
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
|
|
||||||
|
@ -49,31 +51,12 @@ struct gpt_params {
|
||||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
|
||||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||||
float rope_freq_base = 0.0f; // RoPE base frequency
|
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||||
|
|
||||||
// sampling parameters
|
// // sampling parameters
|
||||||
int32_t top_k = 40; // <= 0 to use vocab size
|
struct llama_sampling_params sampling_params;
|
||||||
float top_p = 0.95f; // 1.0 = disabled
|
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
|
||||||
float typical_p = 1.00f; // 1.0 = disabled
|
|
||||||
float temp = 0.80f; // 1.0 = disabled
|
|
||||||
float repeat_penalty = 1.10f; // 1.0 = disabled
|
|
||||||
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
|
||||||
float frequency_penalty = 0.00f; // 0.0 = disabled
|
|
||||||
float presence_penalty = 0.00f; // 0.0 = disabled
|
|
||||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
|
||||||
float mirostat_tau = 5.00f; // target entropy
|
|
||||||
float mirostat_eta = 0.10f; // learning rate
|
|
||||||
|
|
||||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
|
||||||
|
|
||||||
// Classifier-Free Guidance
|
|
||||||
// https://arxiv.org/abs/2306.17806
|
|
||||||
std::string cfg_negative_prompt; // string to help guidance
|
|
||||||
float cfg_scale = 1.f; // How strong is guidance
|
|
||||||
|
|
||||||
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
|
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
|
||||||
std::string model_draft = ""; // draft model for speculative decoding
|
std::string model_draft = ""; // draft model for speculative decoding
|
||||||
|
@ -87,6 +70,7 @@ struct gpt_params {
|
||||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||||
std::string logdir = ""; // directory in which to save YAML log files
|
std::string logdir = ""; // directory in which to save YAML log files
|
||||||
|
|
||||||
|
// TODO: avoid tuple, use struct
|
||||||
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
||||||
std::string lora_base = ""; // base model path for the lora adapter
|
std::string lora_base = ""; // base model path for the lora adapter
|
||||||
|
|
||||||
|
@ -115,13 +99,16 @@ struct gpt_params {
|
||||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||||
bool ignore_eos = false; // ignore generated EOS tokens
|
bool ignore_eos = false; // ignore generated EOS tokens
|
||||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
|
||||||
bool logits_all = false; // return logits for all tokens in the batch
|
bool logits_all = false; // return logits for all tokens in the batch
|
||||||
bool use_mmap = true; // use mmap for faster loads
|
bool use_mmap = true; // use mmap for faster loads
|
||||||
bool use_mlock = false; // use mlock to keep model in memory
|
bool use_mlock = false; // use mlock to keep model in memory
|
||||||
bool numa = false; // attempt optimizations that help on some NUMA systems
|
bool numa = false; // attempt optimizations that help on some NUMA systems
|
||||||
bool verbose_prompt = false; // print prompt tokens before generation
|
bool verbose_prompt = false; // print prompt tokens before generation
|
||||||
bool infill = false; // use infill mode
|
bool infill = false; // use infill mode
|
||||||
|
|
||||||
|
// multimodal models (see examples/llava)
|
||||||
|
std::string mmproj = ""; // path to multimodal projector
|
||||||
|
std::string image = ""; // path to an image file
|
||||||
};
|
};
|
||||||
|
|
||||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
||||||
|
@ -138,10 +125,23 @@ void process_escapes(std::string& input);
|
||||||
// Model utils
|
// Model utils
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// TODO: avoid tuple, use struct
|
||||||
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
|
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
|
||||||
struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params);
|
|
||||||
|
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
|
||||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
|
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
|
||||||
|
|
||||||
|
// Batch utils
|
||||||
|
|
||||||
|
void llama_batch_clear(struct llama_batch & batch);
|
||||||
|
|
||||||
|
void llama_batch_add(
|
||||||
|
struct llama_batch & batch,
|
||||||
|
llama_token id,
|
||||||
|
llama_pos pos,
|
||||||
|
const std::vector<llama_seq_id> & seq_ids,
|
||||||
|
bool logits);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Vocab utils
|
// Vocab utils
|
||||||
//
|
//
|
||||||
|
@ -151,12 +151,14 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||||
std::vector<llama_token> llama_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
const struct llama_context * ctx,
|
const struct llama_context * ctx,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_bos);
|
bool add_bos,
|
||||||
|
bool special = false);
|
||||||
|
|
||||||
std::vector<llama_token> llama_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
const struct llama_model * model,
|
const struct llama_model * model,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_bos);
|
bool add_bos,
|
||||||
|
bool special = false);
|
||||||
|
|
||||||
// tokenizes a token into a piece
|
// tokenizes a token into a piece
|
||||||
// should work similar to Python's `tokenizer.id_to_piece`
|
// should work similar to Python's `tokenizer.id_to_piece`
|
||||||
|
@ -180,36 +182,6 @@ std::string llama_detokenize_bpe(
|
||||||
llama_context * ctx,
|
llama_context * ctx,
|
||||||
const std::vector<llama_token> & tokens);
|
const std::vector<llama_token> & tokens);
|
||||||
|
|
||||||
//
|
|
||||||
// Sampling utils
|
|
||||||
//
|
|
||||||
|
|
||||||
// this is a common sampling function used across the examples for convenience
|
|
||||||
// it can serve as a starting point for implementing your own sampling function
|
|
||||||
//
|
|
||||||
// required:
|
|
||||||
// - ctx: context to use for sampling
|
|
||||||
// - params: sampling parameters
|
|
||||||
//
|
|
||||||
// optional:
|
|
||||||
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
|
|
||||||
// - grammar: grammar to use for sampling, ignore if NULL
|
|
||||||
// - last_tokens: needed for repetition penalty, ignore if empty
|
|
||||||
// - idx: sample from llama_get_logits_ith(ctx, idx)
|
|
||||||
//
|
|
||||||
// returns:
|
|
||||||
// - token: sampled token
|
|
||||||
// - candidates: vector of candidate tokens
|
|
||||||
//
|
|
||||||
llama_token llama_sample_token(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
struct llama_context * ctx_guidance,
|
|
||||||
struct llama_grammar * grammar,
|
|
||||||
const struct gpt_params & params,
|
|
||||||
const std::vector<llama_token> & last_tokens,
|
|
||||||
std::vector<llama_token_data> & candidates,
|
|
||||||
int idx = 0);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// YAML utils
|
// YAML utils
|
||||||
//
|
//
|
||||||
|
|
|
@ -399,7 +399,7 @@ namespace grammar_parser {
|
||||||
void print_grammar(FILE * file, const parse_state & state) {
|
void print_grammar(FILE * file, const parse_state & state) {
|
||||||
try {
|
try {
|
||||||
std::map<uint32_t, std::string> symbol_id_names;
|
std::map<uint32_t, std::string> symbol_id_names;
|
||||||
for (auto kv : state.symbol_ids) {
|
for (const auto & kv : state.symbol_ids) {
|
||||||
symbol_id_names[kv.second] = kv.first;
|
symbol_id_names[kv.second] = kv.first;
|
||||||
}
|
}
|
||||||
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
|
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
|
||||||
|
|
101  common/log.h
|
@ -579,38 +579,75 @@ inline std::string log_var_to_string_impl(const std::vector<int> & var)
|
||||||
return buf.str();
|
return buf.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \
|
template <typename C, typename T>
|
||||||
[&tokens, &ctx]() \
|
inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
|
||||||
{ \
|
{
|
||||||
std::stringstream buf; \
|
std::stringstream buf;
|
||||||
buf << "[ "; \
|
buf << "[ ";
|
||||||
\
|
|
||||||
bool first = true; \
|
bool first = true;
|
||||||
for (const auto &token : tokens) \
|
for (const auto &token : tokens)
|
||||||
{ \
|
{
|
||||||
if (!first) \
|
if (!first) {
|
||||||
buf << ", "; \
|
buf << ", ";
|
||||||
else \
|
} else {
|
||||||
first = false; \
|
first = false;
|
||||||
\
|
}
|
||||||
auto detokenized = llama_token_to_piece(ctx, token); \
|
|
||||||
\
|
auto detokenized = llama_token_to_piece(ctx, token);
|
||||||
detokenized.erase( \
|
|
||||||
std::remove_if( \
|
detokenized.erase(
|
||||||
detokenized.begin(), \
|
std::remove_if(
|
||||||
detokenized.end(), \
|
detokenized.begin(),
|
||||||
[](const unsigned char c) { return !std::isprint(c); }), \
|
detokenized.end(),
|
||||||
detokenized.end()); \
|
[](const unsigned char c) { return !std::isprint(c); }),
|
||||||
\
|
detokenized.end());
|
||||||
buf \
|
|
||||||
<< "'" << detokenized << "'" \
|
buf
|
||||||
<< ":" << std::to_string(token); \
|
<< "'" << detokenized << "'"
|
||||||
} \
|
<< ":" << std::to_string(token);
|
||||||
buf << " ]"; \
|
}
|
||||||
\
|
buf << " ]";
|
||||||
return buf.str(); \
|
|
||||||
}() \
|
return buf.str();
|
||||||
.c_str()
|
}
|
||||||
|
|
||||||
|
template <typename C, typename B>
|
||||||
|
inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
|
||||||
|
{
|
||||||
|
std::stringstream buf;
|
||||||
|
buf << "[ ";
|
||||||
|
|
||||||
|
bool first = true;
|
||||||
|
for (int i = 0; i < batch.n_tokens; ++i)
|
||||||
|
{
|
||||||
|
if (!first) {
|
||||||
|
buf << ", ";
|
||||||
|
} else {
|
||||||
|
first = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
|
||||||
|
|
||||||
|
detokenized.erase(
|
||||||
|
std::remove_if(
|
||||||
|
detokenized.begin(),
|
||||||
|
detokenized.end(),
|
||||||
|
[](const unsigned char c) { return !std::isprint(c); }),
|
||||||
|
detokenized.end());
|
||||||
|
|
||||||
|
buf
|
||||||
|
<< "\n" << std::to_string(i)
|
||||||
|
<< ":token '" << detokenized << "'"
|
||||||
|
<< ":pos " << std::to_string(batch.pos[i])
|
||||||
|
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
|
||||||
|
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
|
||||||
|
<< ":logits " << std::to_string(batch.logits[i]);
|
||||||
|
}
|
||||||
|
buf << " ]";
|
||||||
|
|
||||||
|
return buf.str();
|
||||||
|
}
|
||||||
|
|
||||||
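Since these pretty-printers are now ordinary function templates returning std::string rather than macros ending in .c_str(), call sites pass the result through .c_str() themselves. A brief usage sketch; the LOG call and the ctx/tokens/batch variables are assumed to be in scope:

    // Sketch: ctx is a llama_context *, tokens a std::vector<llama_token>, batch a llama_batch.
    LOG("prompt tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, tokens).c_str());
    LOG("batch: %s\n",         LOG_BATCH_TOSTR_PRETTY(ctx, batch).c_str());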
#ifdef LOG_DISABLE_LOGS
|
#ifdef LOG_DISABLE_LOGS
|
||||||
|
|
||||||
|
|
193  common/sampling.cpp  (new file)
|
@ -0,0 +1,193 @@
|
||||||
|
#include "sampling.h"
|
||||||
|
|
||||||
|
struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params) {
|
||||||
|
struct llama_sampling_context * result = new llama_sampling_context();
|
||||||
|
|
||||||
|
result->params = params.sampling_params;
|
||||||
|
result->grammar = nullptr;
|
||||||
|
|
||||||
|
// if there is a grammar, parse it
|
||||||
|
if (!params.grammar.empty()) {
|
||||||
|
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||||
|
|
||||||
|
// will be empty (default) if there are parse errors
|
||||||
|
if (result->parsed_grammar.rules.empty()) {
|
||||||
|
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
|
||||||
|
|
||||||
|
result->grammar = llama_grammar_init(
|
||||||
|
grammar_rules.data(),
|
||||||
|
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
||||||
|
}
|
||||||
|
|
||||||
|
result->prev.resize(params.n_ctx);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_sampling_free(struct llama_sampling_context * ctx) {
|
||||||
|
if (ctx->grammar != NULL) {
|
||||||
|
llama_grammar_free(ctx->grammar);
|
||||||
|
}
|
||||||
|
|
||||||
|
delete ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_sampling_reset(llama_sampling_context * ctx) {
|
||||||
|
if (ctx->grammar != NULL) {
|
||||||
|
llama_grammar_free(ctx->grammar);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ctx->parsed_grammar.rules.empty()) {
|
||||||
|
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
|
||||||
|
|
||||||
|
ctx->grammar = llama_grammar_init(
|
||||||
|
grammar_rules.data(),
|
||||||
|
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
|
||||||
|
ctx->cur.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
|
||||||
|
if (dst->grammar) {
|
||||||
|
llama_grammar_free(dst->grammar);
|
||||||
|
dst->grammar = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (src->grammar) {
|
||||||
|
dst->grammar = llama_grammar_copy(src->grammar);
|
||||||
|
}
|
||||||
|
|
||||||
|
dst->prev = src->prev;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token llama_sampling_sample(
|
||||||
|
struct llama_sampling_context * ctx_sampling,
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
struct llama_context * ctx_cfg,
|
||||||
|
const int idx) {
|
||||||
|
const int n_ctx = llama_n_ctx(ctx_main);
|
||||||
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||||
|
|
||||||
|
const llama_sampling_params & params = ctx_sampling->params;
|
||||||
|
|
||||||
|
const float temp = params.temp;
|
||||||
|
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||||
|
const float top_p = params.top_p;
|
||||||
|
const float tfs_z = params.tfs_z;
|
||||||
|
const float typical_p = params.typical_p;
|
||||||
|
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||||
|
const float repeat_penalty = params.repeat_penalty;
|
||||||
|
const float alpha_presence = params.presence_penalty;
|
||||||
|
const float alpha_frequency = params.frequency_penalty;
|
||||||
|
const int mirostat = params.mirostat;
|
||||||
|
const float mirostat_tau = params.mirostat_tau;
|
||||||
|
const float mirostat_eta = params.mirostat_eta;
|
||||||
|
const bool penalize_nl = params.penalize_nl;
|
||||||
|
|
||||||
|
auto & prev = ctx_sampling->prev;
|
||||||
|
auto & cur = ctx_sampling->cur;
|
||||||
|
|
||||||
|
llama_token id = 0;
|
||||||
|
|
||||||
|
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||||
|
|
||||||
|
// Apply params.logit_bias map
|
||||||
|
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||||
|
logits[it->first] += it->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
cur.clear();
|
||||||
|
|
||||||
|
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||||
|
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
|
||||||
|
|
||||||
|
if (ctx_cfg) {
|
||||||
|
llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
// apply penalties
|
||||||
|
if (!prev.empty()) {
|
||||||
|
const float nl_logit = logits[llama_token_nl(ctx_main)];
|
||||||
|
const int last_n_repeat = std::min(std::min((int)prev.size(), repeat_last_n), n_ctx);
|
||||||
|
|
||||||
|
llama_sample_repetition_penalty(ctx_main, &cur_p,
|
||||||
|
prev.data() + prev.size() - last_n_repeat,
|
||||||
|
last_n_repeat, repeat_penalty);
|
||||||
|
llama_sample_frequency_and_presence_penalties(ctx_main, &cur_p,
|
||||||
|
prev.data() + prev.size() - last_n_repeat,
|
||||||
|
last_n_repeat, alpha_frequency, alpha_presence);
|
||||||
|
|
||||||
|
if (!penalize_nl) {
|
||||||
|
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||||
|
if (cur_p.data[idx].id == llama_token_nl(ctx_main)) {
|
||||||
|
cur_p.data[idx].logit = nl_logit;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctx_sampling->grammar != NULL) {
|
||||||
|
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (temp <= 0) {
|
||||||
|
// Greedy sampling
|
||||||
|
id = llama_sample_token_greedy(ctx_main, &cur_p);
|
||||||
|
} else {
|
||||||
|
if (mirostat == 1) {
|
||||||
|
const int mirostat_m = 100;
|
||||||
|
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||||
|
id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
|
||||||
|
} else if (mirostat == 2) {
|
||||||
|
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||||
|
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
|
||||||
|
} else {
|
||||||
|
// Temperature sampling
|
||||||
|
size_t min_keep = std::max(1, params.n_probs);
|
||||||
|
llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep);
|
||||||
|
llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
|
||||||
|
llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
|
||||||
|
llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep);
|
||||||
|
llama_sample_temp (ctx_main, &cur_p, temp);
|
||||||
|
|
||||||
|
id = llama_sample_token(ctx_main, &cur_p);
|
||||||
|
|
||||||
|
//{
|
||||||
|
// const int n_top = 10;
|
||||||
|
// LOG("top %d candidates:\n", n_top);
|
||||||
|
|
||||||
|
// for (int i = 0; i < n_top; i++) {
|
||||||
|
// const llama_token id = cur_p.data[i].id;
|
||||||
|
// (void)id; // To avoid a warning that id is unused when logging is disabled.
|
||||||
|
// LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
|
||||||
|
// }
|
||||||
|
//}
|
||||||
|
|
||||||
|
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_sampling_accept(
|
||||||
|
struct llama_sampling_context * ctx_sampling,
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
llama_token id) {
|
||||||
|
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||||
|
ctx_sampling->prev.push_back(id);
|
||||||
|
|
||||||
|
if (ctx_sampling->grammar != NULL) {
|
||||||
|
llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
|
||||||
|
}
|
||||||
|
}
|
99  common/sampling.h  (new file)
|
@ -0,0 +1,99 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include "grammar-parser.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
// sampling parameters
|
||||||
|
typedef struct llama_sampling_params {
|
||||||
|
int32_t top_k = 40; // <= 0 to use vocab size
|
||||||
|
float top_p = 0.95f; // 1.0 = disabled
|
||||||
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
|
float typical_p = 1.00f; // 1.0 = disabled
|
||||||
|
float temp = 0.80f; // 1.0 = disabled
|
||||||
|
float repeat_penalty = 1.10f; // 1.0 = disabled
|
||||||
|
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||||
|
float frequency_penalty = 0.00f; // 0.0 = disabled
|
||||||
|
float presence_penalty = 0.00f; // 0.0 = disabled
|
||||||
|
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||||
|
float mirostat_tau = 5.00f; // target entropy
|
||||||
|
float mirostat_eta = 0.10f; // learning rate
|
||||||
|
|
||||||
|
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||||
|
|
||||||
|
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||||
|
|
||||||
|
// Classifier-Free Guidance
|
||||||
|
// https://arxiv.org/abs/2306.17806
|
||||||
|
std::string cfg_negative_prompt; // string to help guidance
|
||||||
|
float cfg_scale = 1.f; // How strong is guidance
|
||||||
|
|
||||||
|
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||||
|
|
||||||
|
} llama_sampling_params;
|
||||||
|
|
||||||
|
// general sampler context
|
||||||
|
// TODO: move to llama.h
|
||||||
|
struct llama_sampling_context {
|
||||||
|
// parameters that will be used for sampling
|
||||||
|
llama_sampling_params params;
|
||||||
|
|
||||||
|
// mirostat sampler state
|
||||||
|
float mirostat_mu;
|
||||||
|
|
||||||
|
llama_grammar * grammar;
|
||||||
|
|
||||||
|
// internal
|
||||||
|
grammar_parser::parse_state parsed_grammar;
|
||||||
|
|
||||||
|
// TODO: replace with ring-buffer
|
||||||
|
std::vector<llama_token> prev;
|
||||||
|
std::vector<llama_token_data> cur;
|
||||||
|
};
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
// Create a new sampling context instance.
|
||||||
|
struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params);
|
||||||
|
|
||||||
|
void llama_sampling_free(struct llama_sampling_context * ctx);
|
||||||
|
|
||||||
|
// Reset the sampler context
|
||||||
|
// - clear prev tokens
|
||||||
|
// - reset grammar
|
||||||
|
void llama_sampling_reset(llama_sampling_context * ctx);
|
||||||
|
|
||||||
|
// Copy the sampler context
|
||||||
|
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
|
||||||
|
|
||||||
|
// this is a common sampling function used across the examples for convenience
|
||||||
|
// it can serve as a starting point for implementing your own sampling function
|
||||||
|
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||||
|
// llama_sampling_reset when a sequence ends
|
||||||
|
//
|
||||||
|
// required:
|
||||||
|
// - ctx_main: context to use for sampling
|
||||||
|
// - ctx_sampling: sampling-specific context
|
||||||
|
//
|
||||||
|
// optional:
|
||||||
|
// - ctx_cfg: context to use for classifier-free guidance
|
||||||
|
// - idx: sample from llama_get_logits_ith(ctx, idx)
|
||||||
|
//
|
||||||
|
// returns:
|
||||||
|
// - token: sampled token
|
||||||
|
// - candidates: vector of candidate tokens
|
||||||
|
//
|
||||||
|
llama_token llama_sampling_sample(
|
||||||
|
struct llama_sampling_context * ctx_sampling,
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
struct llama_context * ctx_cfg,
|
||||||
|
int idx = 0);
|
||||||
|
|
||||||
|
void llama_sampling_accept(
|
||||||
|
struct llama_sampling_context * ctx_sampling,
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
llama_token id);
|
8396  common/stb_image.h  (new file)
File diff suppressed because it is too large
|
@ -863,7 +863,7 @@ size_t tokenize_file(
|
||||||
(int) buf.size(),
|
(int) buf.size(),
|
||||||
out_tokens.data(),
|
out_tokens.data(),
|
||||||
(int) out_tokens.size(),
|
(int) out_tokens.size(),
|
||||||
false);
|
false, false);
|
||||||
if (n_tokens < 0) {
|
if (n_tokens < 0) {
|
||||||
out_tokens.resize(-n_tokens);
|
out_tokens.resize(-n_tokens);
|
||||||
n_tokens = llama_tokenize(
|
n_tokens = llama_tokenize(
|
||||||
|
@ -872,7 +872,7 @@ size_t tokenize_file(
|
||||||
(int) buf.size(),
|
(int) buf.size(),
|
||||||
out_tokens.data(),
|
out_tokens.data(),
|
||||||
(int) out_tokens.size(),
|
(int) out_tokens.size(),
|
||||||
false);
|
false, false);
|
||||||
}
|
}
|
||||||
if (n_tokens >= 0) {
|
if (n_tokens >= 0) {
|
||||||
out_tokens.resize(n_tokens);
|
out_tokens.resize(n_tokens);
|
||||||
|
@ -966,7 +966,7 @@ size_t tokenize_file(
|
||||||
(int) buf_sample.size(),
|
(int) buf_sample.size(),
|
||||||
tok_sample.data(),
|
tok_sample.data(),
|
||||||
(int) tok_sample.size(),
|
(int) tok_sample.size(),
|
||||||
false);
|
false, false);
|
||||||
if (n_tokens < 0) {
|
if (n_tokens < 0) {
|
||||||
tok_sample.resize(-n_tokens);
|
tok_sample.resize(-n_tokens);
|
||||||
n_tokens = llama_tokenize(llama_get_model(lctx),
|
n_tokens = llama_tokenize(llama_get_model(lctx),
|
||||||
|
@ -974,7 +974,7 @@ size_t tokenize_file(
|
||||||
(int) buf_sample.size(),
|
(int) buf_sample.size(),
|
||||||
tok_sample.data(),
|
tok_sample.data(),
|
||||||
(int) tok_sample.size(),
|
(int) tok_sample.size(),
|
||||||
false);
|
false, false);
|
||||||
GGML_ASSERT(n_tokens >= 0);
|
GGML_ASSERT(n_tokens >= 0);
|
||||||
}
|
}
|
||||||
GGML_ASSERT(n_tokens <= (int) tok_sample.size());
|
GGML_ASSERT(n_tokens <= (int) tok_sample.size());
|
||||||
|
@ -1425,7 +1425,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * canc
|
||||||
|
|
||||||
int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
|
int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
|
||||||
if (impr_plot > 0) impr_plot = 0;
|
if (impr_plot > 0) impr_plot = 0;
|
||||||
if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
|
if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
|
||||||
printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
|
printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
|
||||||
__func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
|
__func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
|
||||||
*sched, opt->loss_after);
|
*sched, opt->loss_after);
|
||||||
|
|
238  convert-bloom-hf-to-gguf.py  (new executable file)
|
@ -0,0 +1,238 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# HF bloom --> gguf conversion
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import struct
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer # type: ignore[import]
|
||||||
|
|
||||||
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||||
|
import gguf
|
||||||
|
|
||||||
|
|
||||||
|
def count_model_parts(dir_model: Path) -> int:
|
||||||
|
num_parts = 0
|
||||||
|
for filename in os.listdir(dir_model):
|
||||||
|
if filename.startswith("pytorch_model-"):
|
||||||
|
num_parts += 1
|
||||||
|
|
||||||
|
if num_parts > 0:
|
||||||
|
print("gguf: found " + str(num_parts) + " model parts")
|
||||||
|
return num_parts
|
||||||
|
|
||||||
|
|
||||||
|
# Supported Models:
|
||||||
|
# https://huggingface.co/bigscience/bloom-1b7
|
||||||
|
# https://huggingface.co/bigscience/bloom-3b
|
||||||
|
# https://huggingface.co/bigscience/bloom-7b1
|
||||||
|
# https://huggingface.co/Langboat/bloom-1b4-zh
|
||||||
|
def parse_args() -> argparse.Namespace:
|
||||||
|
parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
|
||||||
|
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||||
|
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||||
|
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||||
|
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
dir_model = args.model
|
||||||
|
ftype = args.ftype
|
||||||
|
if not dir_model.is_dir():
|
||||||
|
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# possible tensor data types
|
||||||
|
# ftype == 0 -> float32
|
||||||
|
# ftype == 1 -> float16
|
||||||
|
|
||||||
|
# map from ftype to string
|
||||||
|
ftype_str = ["f32", "f16"]
|
||||||
|
|
||||||
|
if args.outfile is not None:
|
||||||
|
fname_out = args.outfile
|
||||||
|
else:
|
||||||
|
# output in the same directory as the model by default
|
||||||
|
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||||
|
|
||||||
|
print("gguf: loading model "+dir_model.name)
|
||||||
|
|
||||||
|
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||||
|
hparams = json.load(f)
|
||||||
|
|
||||||
|
if hparams["architectures"][0] != "BloomForCausalLM":
|
||||||
|
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# get number of model parts
|
||||||
|
num_parts = count_model_parts(dir_model)
|
||||||
|
|
||||||
|
ARCH=gguf.MODEL_ARCH.BLOOM
|
||||||
|
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||||
|
|
||||||
|
print("gguf: get model metadata")
|
||||||
|
|
||||||
|
block_count = hparams["n_layer"]
|
||||||
|
|
||||||
|
gguf_writer.add_name("Bloom")
|
||||||
|
n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
|
||||||
|
n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
|
||||||
|
gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
|
||||||
|
gguf_writer.add_embedding_length(n_embed)
|
||||||
|
gguf_writer.add_feed_forward_length(4 * n_embed)
|
||||||
|
gguf_writer.add_block_count(block_count)
|
||||||
|
gguf_writer.add_head_count(n_head)
|
||||||
|
gguf_writer.add_head_count_kv(n_head)
|
||||||
|
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||||
|
gguf_writer.add_file_type(ftype)
|
||||||
|
|
||||||
|
# TOKENIZATION
|
||||||
|
|
||||||
|
print("gguf: get tokenizer metadata")
|
||||||
|
|
||||||
|
tokens: list[bytearray] = []
|
||||||
|
scores: list[float] = []
|
||||||
|
toktypes: list[int] = []
|
||||||
|
|
||||||
|
# gpt2 tokenizer
|
||||||
|
gguf_writer.add_tokenizer_model("gpt2")
|
||||||
|
|
||||||
|
print("gguf: get gpt2 tokenizer vocab")
|
||||||
|
|
||||||
|
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||||
|
|
||||||
|
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||||
|
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||||
|
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||||
|
assert max(tokenizer.vocab.values()) < vocab_size
|
||||||
|
|
||||||
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||||
|
|
||||||
|
for i in range(vocab_size):
|
||||||
|
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||||
|
scores.append(0.0) # dummy
|
||||||
|
toktypes.append(gguf.TokenType.NORMAL)
|
||||||
|
|
||||||
|
gguf_writer.add_token_list(tokens)
|
||||||
|
gguf_writer.add_token_scores(scores)
|
||||||
|
gguf_writer.add_token_types(toktypes)
|
||||||
|
|
||||||
|
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
|
||||||
|
special_vocab.add_to_gguf(gguf_writer)
|
||||||
|
|
||||||
|
# TENSORS
|
||||||
|
|
||||||
|
tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
|
||||||
|
|
||||||
|
# params for qkv transform
|
||||||
|
n_head_kv = hparams.get("n_head_kv", n_head)
|
||||||
|
head_dim = n_embed // n_head
|
||||||
|
|
||||||
|
# tensor info
|
||||||
|
print("gguf: get tensor metadata")
|
||||||
|
|
||||||
|
if num_parts == 0:
|
||||||
|
part_names = iter(("pytorch_model.bin",))
|
||||||
|
else:
|
||||||
|
part_names = (
|
||||||
|
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||||
|
)
|
||||||
|
|
||||||
|
for part_name in part_names:
|
||||||
|
if args.vocab_only:
|
||||||
|
break
|
||||||
|
print("gguf: loading model part '" + part_name + "'")
|
||||||
|
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||||
|
|
||||||
|
has_lm_head = True
|
||||||
|
if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
|
||||||
|
has_lm_head = False
|
||||||
|
|
||||||
|
for original_name in model_part.keys():
|
||||||
|
data = model_part[original_name]
|
||||||
|
name = re.sub(r'transformer\.', '', original_name)
|
||||||
|
|
||||||
|
old_dtype = data.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||||
|
data = data.to(torch.float32)
|
||||||
|
|
||||||
|
data = data.squeeze().numpy()
|
||||||
|
|
||||||
|
if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
|
||||||
|
# Map bloom-style qkv_linear to gpt-style qkv_linear
|
||||||
|
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
|
||||||
|
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
|
||||||
|
qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
|
||||||
|
data = np.concatenate(
|
||||||
|
(qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
|
||||||
|
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
|
||||||
|
qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
|
||||||
|
axis=0
|
||||||
|
)
|
||||||
|
print("re-format attention.linear_qkv.weight")
|
||||||
|
elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
|
||||||
|
qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
|
||||||
|
data = np.concatenate(
|
||||||
|
(qkv_bias[:, 0, :].reshape((n_embed,)),
|
||||||
|
qkv_bias[:, 1, :].reshape((n_embed,)),
|
||||||
|
qkv_bias[:, 2, :].reshape((n_embed,))),
|
||||||
|
axis=0
|
||||||
|
)
|
||||||
|
print("re-format attention.linear_qkv.bias")
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print("Can not map tensor '" + name + "'")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||||
|
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||||
|
|
||||||
|
gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
if not has_lm_head and name == "word_embeddings.weight":
|
||||||
|
gguf_writer.add_tensor("output.weight", data)
|
||||||
|
print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa
|
||||||
|
|
||||||
|
|
||||||
|
print("gguf: write header")
|
||||||
|
gguf_writer.write_header_to_file()
|
||||||
|
print("gguf: write metadata")
|
||||||
|
gguf_writer.write_kv_data_to_file()
|
||||||
|
if not args.vocab_only:
|
||||||
|
print("gguf: write tensors")
|
||||||
|
gguf_writer.write_tensors_to_file()
|
||||||
|
|
||||||
|
gguf_writer.close()
|
||||||
|
|
||||||
|
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||||
|
print("")
|
|
@ -78,7 +78,7 @@ print("gguf: loading model "+dir_model.name)
|
||||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||||
hparams = json.load(f)
|
hparams = json.load(f)
|
||||||
|
|
||||||
if hparams["architectures"][0] != "FalconForCausalLM":
|
if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"):
|
||||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||||
|
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
@@ -97,7 +97,17 @@ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

 print("gguf: get model metadata")

-block_count = hparams["num_hidden_layers"]
+block_count = hparams.get("num_hidden_layers")
+if block_count is None:
+    block_count = hparams["n_layer"]  # old name
+
+n_head = hparams.get("num_attention_heads")
+if n_head is None:
+    n_head = hparams["n_head"]  # old name
+
+n_head_kv = hparams.get("num_kv_heads")
+if n_head_kv is None:
+    n_head_kv = hparams.get("n_head_kv", 1)  # old name

 gguf_writer.add_name("Falcon")
 gguf_writer.add_context_length(2048)  # not in config.json
@@ -105,11 +115,8 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
 gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(hparams["num_attention_heads"])
-if "num_kv_heads" in hparams:
-    gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
-else:
-    gguf_writer.add_head_count_kv(1)
+gguf_writer.add_head_count(n_head)
+gguf_writer.add_head_count_kv(n_head_kv)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
 gguf_writer.add_file_type(ftype)

@@ -152,10 +159,6 @@ special_vocab.add_to_gguf(gguf_writer)

 tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

-# params for qkv transform
-n_head = hparams["num_attention_heads"]
-n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1
-
 head_dim = hparams["hidden_size"] // n_head

 # tensor info
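The metadata hunks above replace hard-coded config keys with lookups that fall back to the older Falcon/RW key names. A small illustrative sketch of the same pattern; the `config` dict here is a made-up example, not a real checkpoint:

```python
config = {"n_layer": 32, "n_head": 71, "n_head_kv": 1}  # old-style RW config

block_count = config.get("num_hidden_layers")
if block_count is None:
    block_count = config["n_layer"]         # old name

n_head = config.get("num_attention_heads")
if n_head is None:
    n_head = config["n_head"]               # old name

n_head_kv = config.get("num_kv_heads")
if n_head_kv is None:
    n_head_kv = config.get("n_head_kv", 1)  # old name

print(block_count, n_head, n_head_kv)  # 32 71 1
```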
218  convert-mpt-hf-to-gguf.py  (new executable file)
@@ -0,0 +1,218 @@
#!/usr/bin/env python3
# HF mpt --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "MPTForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit()

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH=gguf.MODEL_ARCH.MPT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layers"]

gguf_writer.add_name(dir_model.name)
gguf_writer.add_context_length(hparams["max_seq_len"])
gguf_writer.add_embedding_length(hparams["d_model"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
gguf_writer.add_head_count(hparams["n_heads"])
if kv_n_heads := hparams["attn_config"].get("kv_n_heads"):
    gguf_writer.add_head_count_kv(kv_n_heads)
gguf_writer.add_layer_norm_eps(1e-05)
if hparams["attn_config"]["clip_qkv"] is not None:
    gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
# accommodate some "reserved" tokens; this is causing problems down the line in
# llama.cpp, so we pad the vocab with dummy tokens:

vocab_size = hparams["vocab_size"]

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Cannot map tensor '" + name + "'")
            continue  # for the sake of compatibility with some old published models, don't quit
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)

        # note: MPT output is tied to (same as) wte in original model;
        # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
        if new_name == "token_embd.weight":
            gguf_writer.add_tensor("output.weight", data)

print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
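The padding loop in the converter above works around MPT's embedding matrix being larger than the tokenizer vocabulary: every id that the tokenizer does not define gets a dummy `[PAD{i}]` entry so the token list matches the tensor dimension. A self-contained sketch of that padding; the tiny `vocab` dict is invented for illustration, while the real script builds `reverse_vocab` from `AutoTokenizer`:

```python
vocab = {"hello": 0, "world": 1, "!": 3}   # note: ids 2, 4, 5 are missing
vocab_size = 6                             # embedding rows in the checkpoint

reverse_vocab = {tok_id: tok for tok, tok_id in vocab.items()}

tokens = [reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]" for i in range(vocab_size)]
print(tokens)  # ['hello', 'world', '[PAD2]', '!', '[PAD4]', '[PAD5]']
```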
@@ -17,33 +17,6 @@ if "NO_LOCAL_GGUF" not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
 import gguf


-def bytes_to_unicode():
-    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1))
-        + list(range(ord("¡"), ord("¬") + 1))
-        + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
-
-
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):
@@ -153,53 +126,25 @@ tokens: list[bytearray] = []
 scores: list[float] = []
 toktypes: list[int] = []

-tokenizer_json_file = dir_model / "tokenizer.json"
-if not tokenizer_json_file.is_file():
-    print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr)
-    sys.exit(1)
-
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")

-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")

+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = (
-    hparams["vocab_size"]
-    if "vocab_size" in hparams
-    else len(tokenizer_json["model"]["vocab"])
-)
-
-tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size

 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}

 for i in range(vocab_size):
-    if i in reverse_vocab:
-        text = reverse_vocab[i]
-        try:
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-        except KeyError:
-            text = bytearray()
-            for c in reverse_vocab[i]:
-                if ord(c) < 256:  # single byte character
-                    text.append(byte_decoder[ord(c)])
-                else:  # multibyte special token character
-                    text.extend(c.encode("utf-8"))
-    else:
-        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-        pad_token = f"[PAD{i}]".encode("utf8")
-        text = bytearray(pad_token)
-
-    tokens.append(text)
-    scores.append(0.0)  # dymmy
-    toktypes.append(gguf.TokenType.NORMAL)  # dummy
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
@@ -49,7 +49,7 @@ According to the BLIS documentation, we could set the following
 environment variables to modify the behavior of openmp:

 ```bash
-export GOMP_GPU_AFFINITY="0-19"
+export GOMP_CPU_AFFINITY="0-19"
 export BLIS_NUM_THREADS=14
 ```
@@ -25,9 +25,11 @@ else()
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(simple)
     add_subdirectory(batched)
+    add_subdirectory(batched-bench)
     add_subdirectory(speculative)
     add_subdirectory(parallel)
     add_subdirectory(embd-input)
+    add_subdirectory(llava)
     add_subdirectory(llama-bench)
     add_subdirectory(beam-search)
     if (LLAMA_METAL)
5  examples/batched-bench/CMakeLists.txt  (new file)
@@ -0,0 +1,5 @@
set(TARGET batched-bench)
add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
51  examples/batched-bench/README.md  (new file)
@@ -0,0 +1,51 @@
# llama.cpp/example/batched-bench

Benchmark the batched decoding performance of `llama.cpp`

## Usage

There are 2 modes of operation:

- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

```bash
./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>

# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99

# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99

# custom set of batches
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
```

## Sample results

- `PP` - prompt tokens per batch
- `TG` - generated tokens per batch
- `B` - number of batches
- `N_KV` - required KV cache size
- `T_PP` - prompt processing time (i.e. time to first token)
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
- `T_TG` - time to generate all batches
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
- `T` - total time
- `S` - total speed (i.e. all tokens / total time)

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 |
| 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 |
| 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 |
| 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 |
| 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 |
| 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 |
| 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 |
| 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 |
| 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 |
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
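The two `N_KV` formulas from the usage section above can be checked against the sample table. A small sketch of the arithmetic, with no llama.cpp calls involved:

```python
def n_kv(pp, tg, b, shared):
    # shared prompt: one copy of the prompt plus B generated tails
    # separate prompts: B full sequences
    return pp + b * tg if shared else b * (pp + tg)

print(n_kv(128, 128, 8, shared=False))   # 2048  -> matches the PP=128, TG=128, B=8 row
print(n_kv(128, 256, 32, shared=False))  # 12288 -> matches the last row
print(n_kv(512, 128, 4, shared=True))    # 1024  (shared-prompt mode)
```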
243  examples/batched-bench/batched-bench.cpp  (new file)
@@ -0,0 +1,243 @@
#include "common.h"
#include "llama.h"

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>

// mutates the input string
static std::vector<int> parse_list(char * p) {
    std::vector<int> ret;

    char * q = p;

    while (*p) {
        if (*p == ',') {
            *p = '\0';
            ret.push_back(std::atoi(q));
            q = p + 1;
        }

        ++p;
    }

    ret.push_back(std::atoi(q));

    return ret;
}

int main(int argc, char ** argv) {
    gpt_params params;

    if (argc == 1 || argv[1][0] == '-') {
        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n", argv[0]);
        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
        printf("  example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
        return 1;
    }

    int n_kv_max     = 2048;
    int is_pp_shared = 0;
    int n_gpu_layers = 0;
    int mmq          = 0;

    std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
    std::vector<int> n_tg = { 128, 256, };
    std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
    //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };

    if (argc >= 2) {
        params.model = argv[1];
    }

    if (argc >= 3) {
        n_kv_max = std::atoi(argv[2]);
    }

    if (argc >= 4) {
        is_pp_shared = std::atoi(argv[3]);
    }

    if (argc >= 5) {
        n_gpu_layers = std::atoi(argv[4]);
    }

    if (argc >= 6) {
        mmq = std::atoi(argv[5]);
    }

    if (argc >= 7) {
        n_pp = parse_list(argv[6]);
    }

    if (argc >= 8) {
        n_tg = parse_list(argv[7]);
    }

    if (argc >= 9) {
        n_pl = parse_list(argv[8]);
    }

    // init LLM

    llama_backend_init(params.numa);

    // initialize the model

    llama_model_params model_params = llama_model_default_params();

    model_params.n_gpu_layers = n_gpu_layers;

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.seed      = 1234;
    ctx_params.n_ctx     = n_kv_max;
    ctx_params.n_batch   = 512;
    ctx_params.mul_mat_q = mmq;

    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

    if (ctx == NULL) {
        fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
        return 1;
    }

    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);

    // decode in batches of ctx_params.n_batch tokens
    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

            llama_batch batch_view = {
                n_tokens,
                batch.token    + i,
                nullptr,
                batch.pos      + i,
                batch.n_seq_id + i,
                batch.seq_id   + i,
                batch.logits   + i,
                0, 0, 0, // unused
            };

            const int ret = llama_decode(ctx, batch_view);
            if (ret != 0) {
                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                return false;
            }
        }

        return true;
    };

    // warm up
    {
        for (int i = 0; i < 16; ++i) {
            llama_batch_add(batch, 0, i, { 0 }, false);
        }

        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }
    }

    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");

    for (    int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
            for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
                const int pp = n_pp[i_pp];
                const int tg = n_tg[i_tg];
                const int pl = n_pl[i_pl];

                const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);

                if (n_ctx_req > n_kv_max) {
                    continue;
                }

                llama_batch_clear(batch);

                const int n_tokens = is_pp_shared ? pp : pl*pp;

                for (int i = 0; i < n_tokens; ++i) {
                    llama_batch_add(batch, 0, i, { 0 }, false);
                }
                batch.logits[batch.n_tokens - 1] = true;

                const auto t_pp_start = ggml_time_us();

                llama_kv_cache_tokens_rm(ctx, -1, -1);

                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                    LOG_TEE("%s: llama_decode() failed\n", __func__);
                    return 1;
                }

                if (is_pp_shared) {
                    for (int32_t i = 1; i < pl; ++i) {
                        llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
                    }
                }

                const auto t_pp_end = ggml_time_us();

                const auto t_tg_start = ggml_time_us();

                for (int i = 0; i < tg; ++i) {
                    llama_batch_clear(batch);

                    for (int j = 0; j < pl; ++j) {
                        llama_batch_add(batch, 0, pp + i, { j }, true);
                    }

                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                        LOG_TEE("%s: llama_decode() failed\n", __func__);
                        return 1;
                    }
                }

                const auto t_tg_end = ggml_time_us();

                const int32_t n_kv = n_ctx_req;

                const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
                const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
                const float t    = t_pp + t_tg;

                const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
                const float speed_tg = pl*tg / t_tg;
                const float speed    = n_kv / t;

                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
            }
        }
    }

    llama_print_timings(ctx);

    llama_batch_free(batch);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    fprintf(stderr, "\n\n");

    return 0;
}
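`decode_helper` above never submits more than `n_batch` tokens per `llama_decode` call; it walks the accumulated batch in fixed-size windows. A Python sketch of just that chunking logic, illustrative only and without any llama.cpp bindings:

```python
def decode_in_chunks(n_tokens_total, n_batch):
    """Return the (offset, length) windows the way decode_helper slices the batch."""
    views = []
    i = 0
    while i < n_tokens_total:
        n_tokens = min(n_batch, n_tokens_total - i)
        views.append((i, n_tokens))
        i += n_batch
    return views

print(decode_in_chunks(1300, 512))  # [(0, 512), (512, 512), (1024, 276)]
```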
9  examples/batched.swift/.gitignore  (new file, vendored)
@@ -0,0 +1,9 @@
.DS_Store
/.build
/Packages
xcuserdata/
DerivedData/
.swiftpm/configuration/registries.json
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
.netrc
batched_swift
6  examples/batched.swift/Makefile  (new executable file)
@@ -0,0 +1,6 @@
.PHONY: build

build:
	xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
	rm -f ./batched_swift
	ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
22  examples/batched.swift/Package.swift  (new file)
@@ -0,0 +1,22 @@
// swift-tools-version: 5.5
// The swift-tools-version declares the minimum version of Swift required to build this package.

import PackageDescription

let package = Package(
    name: "batched_swift",
    platforms: [.macOS(.v12)],
    dependencies: [
        .package(name: "llama", path: "../../"),
    ],
    targets: [
        // Targets are the basic building blocks of a package, defining a module or a test suite.
        // Targets can depend on other targets in this package and products from dependencies.
        .executableTarget(
            name: "batched_swift",
            dependencies: ["llama"],
            path: "Sources",
            linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
        ),
    ]
)
4  examples/batched.swift/README.md  (new file)
@@ -0,0 +1,4 @@
This is a swift clone of `examples/batched`.

$ `make`
$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
263  examples/batched.swift/Sources/main.swift  (new file)
@@ -0,0 +1,263 @@
import Foundation
import llama

let arguments = CommandLine.arguments

// Check that we have at least one argument (the model path)
guard arguments.count > 1 else {
    print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
    exit(1)
}

let modelPath: String = arguments[1]
let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1

// total length of the sequences including the prompt
let n_len: Int = 32

// init LLM
llama_backend_init(false)
defer {
    llama_backend_free()
}

let model_params = llama_model_default_params()
guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
    print("Failed to load model")
    exit(1)
}

defer {
    llama_free_model(model)
}

var tokens = tokenize(text: prompt, add_bos: true)

let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)

var context_params = llama_context_default_params()
context_params.seed = 1234
context_params.n_ctx = n_kv_req
context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8
context_params.n_threads_batch = 8

let context = llama_new_context_with_model(model, context_params)
guard context != nil else {
    print("Failed to initialize context")
    exit(1)
}

defer {
    llama_free(context)
}

let n_ctx = llama_n_ctx(context)

print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")

if n_kv_req > n_ctx {
    print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
    exit(1)
}

var buffer: [CChar] = []
for id: llama_token in tokens {
    print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
}

print("\n")

var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1)
defer {
    llama_batch_free(batch)
}

// evaluate the initial prompt
batch.n_tokens = Int32(tokens.count)

for (i, token) in tokens.enumerated() {
    batch.token[i] = token
    batch.pos[i] = Int32(i)
    batch.n_seq_id[i] = 1
    // batch.seq_id[i][0] = 0
    // TODO: is this the proper way to do this?
    if let seq_id = batch.seq_id[i] {
        seq_id[0] = 0
    }
    batch.logits[i] = 0
}

// llama_decode will output logits only for the last token of the prompt
batch.logits[Int(batch.n_tokens) - 1] = 1

if llama_decode(context, batch) != 0 {
    print("llama_decode() failed")
    exit(1)
}

for i in 1 ..< n_parallel {
    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
}

if n_parallel > 1 {
    print("generating \(n_parallel) sequences ...\n")
}

var streams: [String] = .init(repeating: "", count: n_parallel)
var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)

var n_cur = batch.n_tokens
var n_decode = 0

let t_main_start = ggml_time_us()

while n_cur <= n_len {
    // prepare the next batch
    batch.n_tokens = 0

    // sample the next token for each parallel sequence / stream
    for i in 0 ..< n_parallel {
        if i_batch[i] < 0 {
            // the stream has already finished
            continue
        }

        var n_vocab = llama_n_vocab(model)
        var logits = llama_get_logits_ith(context, i_batch[i])

        var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))

        for token_id in 0 ..< n_vocab {
            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
        }

        var candidates_p: llama_token_data_array = .init(
            data: &candidates,
            size: candidates.count,
            sorted: false
        )

        let top_k: Int32 = 40
        let top_p: Float = 0.9
        let temp: Float = 0.4

        llama_sample_top_k(context, &candidates_p, top_k, 1)
        llama_sample_top_p(context, &candidates_p, top_p, 1)
        llama_sample_temp(context, &candidates_p, temp)

        let new_token_id = llama_sample_token(context, &candidates_p)

        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

        // is it an end of stream? -> mark the stream as finished
        if new_token_id == llama_token_eos(context) || n_cur == n_len {
            i_batch[i] = -1
            // print("")
            if n_parallel > 1 {
                print("stream \(i) finished at n_cur = \(n_cur)")
            }

            continue
        }

        let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""

        // if there is only one stream, we print immediately to stdout
        if n_parallel == 1 {
            print(nextStringPiece, terminator: "")
        }
        streams[i] += nextStringPiece

        // push this new token for next evaluation
        batch.token[Int(batch.n_tokens)] = new_token_id
        batch.pos[Int(batch.n_tokens)] = n_cur
        batch.n_seq_id[Int(batch.n_tokens)] = 1
        if let seq_id = batch.seq_id[Int(batch.n_tokens)] {
            seq_id[0] = Int32(i)
        }
        batch.logits[Int(batch.n_tokens)] = 1

        i_batch[i] = batch.n_tokens

        batch.n_tokens += 1

        n_decode += 1
    }

    // all streams are finished
    if batch.n_tokens == 0 {
        break
    }

    n_cur += 1

    // evaluate the current batch with the transformer model
    if llama_decode(context, batch) != 0 {
        print("llama_decode() failed")
        exit(1)
    }
}

if n_parallel > 1 {
    print("\n")
    for (i, stream) in streams.enumerated() {
        print("sequence \(i):\n\n\(prompt)\(stream)\n")
    }
}

let t_main_end = ggml_time_us()

print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")

llama_print_timings(context)

private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let n_tokens = text.count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
    }
    tokens.deallocate()
    return swiftTokens
}

private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
    var result = [CChar](repeating: 0, count: 8)
    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
    if nTokens < 0 {
        if result.count >= -Int(nTokens) {
            result.removeLast(-Int(nTokens))
        } else {
            result.removeAll()
        }
        let check = llama_token_to_piece(
            model,
            token,
            &result,
            Int32(result.count)
        )
        assert(check == nTokens)
    } else {
        result.removeLast(result.count - Int(nTokens))
    }
    if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
        return utfString
    } else {
        buffer.append(contentsOf: result)
        let data = Data(buffer.map { UInt8(bitPattern: $0) })
        if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
            buffer = []
        }
        guard let bufferString = String(data: data, encoding: .utf8) else {
            return nil
        }
        buffer = []
        return bufferString
    }
    return nil
}
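`token_to_piece` above has to cope with tokens whose bytes are only a fragment of a multi-byte UTF-8 character: fragments are accumulated in `buffer` until they decode cleanly. A rough Python model of that buffering idea, illustrative only; the real Swift code additionally handles the negative-length retry of `llama_token_to_piece`:

```python
def emit_piece(piece_bytes: bytes, buffer: bytearray):
    """Return decoded text once the buffered bytes form valid UTF-8, else keep buffering."""
    buffer.extend(piece_bytes)
    try:
        text = buffer.decode("utf-8")
    except UnicodeDecodeError:
        return None          # wait for the rest of the character
    buffer.clear()
    return text

buf = bytearray()
print(emit_piece(b"\xf0\x9f\xa6", buf))  # None - first part of a 4-byte character
print(emit_piece(b"\x99", buf))          # '🦙' - completes U+1F999
```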
@@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
     ctx_params.seed  = 1234;
     ctx_params.n_ctx = n_kv_req;
     ctx_params.n_batch = std::max(n_len, n_parallel);
     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
@@ -97,20 +97,15 @@ int main(int argc, char ** argv) {

     fflush(stderr);

-    // create a llama_batch with size 512
+    // create a llama_batch
     // we use this object to submit token data for decoding
-    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0);
+    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);

     // evaluate the initial prompt
-    batch.n_tokens = tokens_list.size();
-
-    for (int32_t i = 0; i < batch.n_tokens; i++) {
-        batch.token[i]  = tokens_list[i];
-        batch.pos[i]    = i;
-        batch.seq_id[i] = 0;
-        batch.logits[i] = false;
+    for (size_t i = 0; i < tokens_list.size(); ++i) {
+        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
     }
+    GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());

     // llama_decode will output logits only for the last token of the prompt
     batch.logits[batch.n_tokens - 1] = true;
@@ -146,7 +141,7 @@ int main(int argc, char ** argv) {

     while (n_cur <= n_len) {
         // prepare the next batch
-        batch.n_tokens = 0;
+        llama_batch_clear(batch);

         // sample the next token for each parallel sequence / stream
         for (int32_t i = 0; i < n_parallel; ++i) {
@@ -198,15 +193,10 @@ int main(int argc, char ** argv) {

             streams[i] += llama_token_to_piece(ctx, new_token_id);

-            // push this new token for next evaluation
-            batch.token [batch.n_tokens] = new_token_id;
-            batch.pos   [batch.n_tokens] = n_cur;
-            batch.seq_id[batch.n_tokens] = i;
-            batch.logits[batch.n_tokens] = true;
-
             i_batch[i] = batch.n_tokens;

-            batch.n_tokens += 1;
+            // push this new token for next evaluation
+            llama_batch_add(batch, new_token_id, n_cur, { i }, true);

            n_decode += 1;
        }
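The hunks above swap the hand-rolled `batch.token[...] / batch.pos[...] / ...` bookkeeping for the `llama_batch_clear` / `llama_batch_add` helpers from `common`. Conceptually the helper just appends one (token, position, sequence ids, logits flag) record and bumps `n_tokens`; a rough Python model of that behavior, illustrative only and not the real C API:

```python
class Batch:
    def __init__(self):
        self.token, self.pos, self.seq_id, self.logits = [], [], [], []

    @property
    def n_tokens(self):
        return len(self.token)

def batch_clear(batch):
    batch.token.clear(); batch.pos.clear(); batch.seq_id.clear(); batch.logits.clear()

def batch_add(batch, token, pos, seq_ids, want_logits):
    batch.token.append(token)
    batch.pos.append(pos)
    batch.seq_id.append(list(seq_ids))
    batch.logits.append(want_logits)

b = Batch()
for i, tok in enumerate([1, 15043, 3186]):  # pretend prompt tokens
    batch_add(b, tok, i, [0], False)
b.logits[-1] = True                         # logits only for the last prompt token
print(b.n_tokens, b.logits)                 # 3 [False, False, True]
```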
@@ -79,7 +79,7 @@ bool eval_float(void * model, float * input, int N){
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
+        llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
         if (llama_decode(ctx, batch)) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return false;
@@ -128,21 +128,22 @@ bool eval_string(struct MyModel * mymodel,const char* str){
 llama_token sampling_id(struct MyModel* mymodel) {
     llama_context* ctx = mymodel->ctx;
     gpt_params params = mymodel->params;
+    llama_sampling_params & sparams = params.sampling_params;
     // int n_ctx = llama_n_ctx(ctx);

     // out of user input, sample next token
-    const float   temp      = params.temp;
-    const int32_t top_k     = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
-    const float   top_p     = params.top_p;
-    const float   tfs_z     = params.tfs_z;
-    const float   typical_p = params.typical_p;
+    const float   temp      = sparams.temp;
+    const int32_t top_k     = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
+    const float   top_p     = sparams.top_p;
+    const float   tfs_z     = sparams.tfs_z;
+    const float   typical_p = sparams.typical_p;
     // const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
     // const float   repeat_penalty  = params.repeat_penalty;
     // const float   alpha_presence  = params.presence_penalty;
     // const float   alpha_frequency = params.frequency_penalty;
-    const int     mirostat     = params.mirostat;
-    const float   mirostat_tau = params.mirostat_tau;
-    const float   mirostat_eta = params.mirostat_eta;
+    const int     mirostat     = sparams.mirostat;
+    const float   mirostat_tau = sparams.mirostat_tau;
+    const float   mirostat_eta = sparams.mirostat_eta;
     // const bool    penalize_nl = params.penalize_nl;

     llama_token id = 0;
@@ -151,7 +152,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
     auto n_vocab = llama_n_vocab(llama_get_model(ctx));

     // Apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+    for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
         logits[it->first] += it->second;
     }
@@ -529,13 +529,14 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
     set_param_lora(lora);

     // measure data size
-    struct ggml_allocr * alloc = NULL;
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    alloc_lora(alloc, lora);
+    size_t size = 0;
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
+    }

     // allocate data
-    lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
-    ggml_allocr_free(alloc);
+    struct ggml_allocr * alloc = NULL;
+    lora->data.resize(size + tensor_alignment);
     alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
     alloc_lora(alloc, lora);
     ggml_allocr_free(alloc);
@@ -1714,11 +1715,9 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);

     // measure required memory for input tensors
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    ggml_allocr_alloc(alloc, tokens_input);
-    ggml_allocr_alloc(alloc, target_probs);
-    size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
-    ggml_allocr_free(alloc);
+    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
+                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
+                            tensor_alignment;
     printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));

     // allocate input tensors
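Both finetune hunks replace a measuring allocator with a direct size computation: each tensor's byte size is rounded up to `tensor_alignment` with `GGML_PAD` and the padded sizes are summed. The rounding itself is plain align-up arithmetic; a small sketch with tensor sizes invented for illustration:

```python
def pad(n, align):
    # same idea as GGML_PAD: round n up to the next multiple of align
    return ((n + align - 1) // align) * align

tensor_alignment = 32
tensor_nbytes = [1000, 4096, 13]          # made-up ggml_nbytes() results

size = sum(pad(n, tensor_alignment) for n in tensor_nbytes)
print(pad(1000, 32), pad(13, 32), size)   # 1024 32 5152
```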
@@ -104,6 +104,7 @@ static void sigint_handler(int signo) {

 int main(int argc, char ** argv) {
     gpt_params params;
+    llama_sampling_params & sparams = params.sampling_params;
     g_params = &params;

     if (!gpt_params_parse(argc, argv, params)) {
@@ -206,7 +207,7 @@ int main(int argc, char ** argv) {
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (params.cfg_scale > 1.f) {
+    if (sparams.cfg_scale > 1.f) {
         struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
         ctx_guidance = llama_new_context_with_model(model, lparams);
     }
@@ -233,10 +234,22 @@ int main(int argc, char ** argv) {
     const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
     LOG("add_bos: %d\n", add_bos);

+    bool suff_rm_leading_spc = params.escape;
+    if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
+        params.input_suffix.erase(0, 1);
+        suff_rm_leading_spc = false;
+    }
     std::vector<llama_token> embd_inp;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
+    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
+    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+    const int space_token = 29871;
+    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
+        inp_sfx.erase(inp_sfx.begin());
+    }
     inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
+    if (add_bos) {
+        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
+    }
     inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
     embd_inp = inp_pfx;
     embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
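The infill hunk above changes how the fill-in-the-middle prompt is assembled: the prefix/suffix are now tokenized without BOS, a leading space that SentencePiece re-adds to the suffix (token id 29871 for LLaMA) is stripped when `--escape` already removed it, and BOS is prepended once, in front of the prefix sentinel. A rough Python sketch of the resulting token order; the token ids and helper names here are illustrative placeholders, not the real llama.cpp API:

```python
BOS, FIM_PRE, FIM_SUF, SPACE = 1, 9001, 9002, 29871  # placeholder ids (29871 is the LLaMA space token)

def build_infill_prompt(pfx_tokens, sfx_tokens, add_bos, suff_rm_leading_spc):
    pfx = list(pfx_tokens)
    sfx = list(sfx_tokens)
    if suff_rm_leading_spc and sfx and sfx[0] == SPACE:
        sfx = sfx[1:]                      # drop the space the tokenizer re-inserted
    pfx = [FIM_PRE] + pfx                  # fill-in-the-middle prefix sentinel
    if add_bos:
        pfx = [BOS] + pfx
    sfx = [FIM_SUF] + sfx                  # fill-in-the-middle suffix sentinel
    return pfx + sfx

print(build_infill_prompt([100, 101], [SPACE, 200], add_bos=True, suff_rm_leading_spc=True))
# [1, 9001, 100, 101, 9002, 200]
```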
@@ -244,12 +257,12 @@ int main(int argc, char ** argv) {

     LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
     LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());

     // Should not run without any tokens
     if (embd_inp.empty()) {
         embd_inp.push_back(llama_token_bos(ctx));
-        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
     }

     // Tokenize negative prompt
@@ -257,13 +270,13 @@ int main(int argc, char ** argv) {
     int guidance_offset = 0;
     int original_prompt_len = 0;
     if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
+        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
+        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());

         std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
+        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());

         original_prompt_len = original_inp.size();
         guidance_offset = (int)guidance_inp.size() - original_prompt_len;
@@ -281,8 +294,8 @@ int main(int argc, char ** argv) {
         params.n_keep = (int)embd_inp.size();
     }

-    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
-    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
+    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
+    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());


     // enable interactive mode if interactive start is specified
@@ -300,7 +313,7 @@ int main(int argc, char ** argv) {

     if (ctx_guidance) {
         LOG_TEE("\n");
-        LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+        LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
         LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
         for (int i = 0; i < (int) guidance_inp.size(); i++) {
             LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@@ -346,7 +359,7 @@ int main(int argc, char ** argv) {
         }
     }
     LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
-            params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
+            sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     LOG_TEE("\n\n");

@@ -364,8 +377,8 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");

         {
-            auto it = params.logit_bias.find(llama_token_eos(ctx));
-            if (it != params.logit_bias.end() && it->second == -INFINITY) {
+            auto it = sparams.logit_bias.find(llama_token_eos(ctx));
+            if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
                 LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
             }
         }
@@ -375,9 +388,6 @@ int main(int argc, char ** argv) {
             grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
     }

-    // TODO: replace with ring-buffer
-    std::vector<llama_token> last_tokens(n_ctx);
-    std::fill(last_tokens.begin(), last_tokens.end(), 0);
     LOG_TEE("\n##### Infill mode #####\n\n");
     if (params.infill) {
         printf("\n************\n");
@@ -420,10 +430,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd;
     std::vector<llama_token> embd_guidance;

-    const int n_vocab = llama_n_vocab(model);
-
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);

     while (n_remain != 0 || params.interactive) {
         // predict
@@ -470,7 +477,7 @@ int main(int argc, char ** argv) {

             LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);

-            LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
+            LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

         }

@@ -498,7 +505,7 @@ int main(int argc, char ** argv) {
                 input_buf  = embd_guidance.data();
                 input_size = embd_guidance.size();

-                LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
+                LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
             } else {
                 input_buf  = embd.data();
                 input_size = embd.size();
@@ -521,7 +528,7 @@ int main(int argc, char ** argv) {
                 n_eval = params.n_batch;
|
n_eval = params.n_batch;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
|
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
LOG_TEE("%s : failed to eval\n", __func__);
|
||||||
|
@ -540,12 +547,11 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||||
|
|
||||||
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
|
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||||
|
|
||||||
last_tokens.erase(last_tokens.begin());
|
llama_sampling_accept(ctx_sampling, ctx, id);
|
||||||
last_tokens.push_back(id);
|
|
||||||
|
|
||||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
|
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||||
|
|
||||||
embd.push_back(id);
|
embd.push_back(id);
|
||||||
|
|
||||||
|
@ -561,8 +567,8 @@ int main(int argc, char ** argv) {
|
||||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||||
while ((int) embd_inp.size() > n_consumed) {
|
while ((int) embd_inp.size() > n_consumed) {
|
||||||
embd.push_back(embd_inp[n_consumed]);
|
embd.push_back(embd_inp[n_consumed]);
|
||||||
last_tokens.erase(last_tokens.begin());
|
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||||
last_tokens.push_back(embd_inp[n_consumed]);
|
ctx_sampling->prev.push_back(embd_inp[n_consumed]);
|
||||||
++n_consumed;
|
++n_consumed;
|
||||||
if ((int) embd.size() >= params.n_batch) {
|
if ((int) embd.size() >= params.n_batch) {
|
||||||
break;
|
break;
|
||||||
|
@ -594,7 +600,7 @@ int main(int argc, char ** argv) {
|
||||||
if ((int) embd_inp.size() <= n_consumed) {
|
if ((int) embd_inp.size() <= n_consumed) {
|
||||||
|
|
||||||
// deal with eot token in infill mode
|
// deal with eot token in infill mode
|
||||||
if ((last_tokens.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
|
if ((ctx_sampling->prev.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
|
||||||
if(is_interacting && !params.interactive_first) {
|
if(is_interacting && !params.interactive_first) {
|
||||||
// print an eot token
|
// print an eot token
|
||||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
|
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
|
||||||
|
@ -627,10 +633,27 @@ int main(int argc, char ** argv) {
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
// done taking input, reset color
|
// done taking input, reset color
|
||||||
console::set_display(console::reset);
|
console::set_display(console::reset);
|
||||||
|
|
||||||
|
if (params.escape) {
|
||||||
|
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
||||||
|
process_escapes(params.input_prefix);
|
||||||
|
process_escapes(params.input_suffix);
|
||||||
|
}
|
||||||
|
suff_rm_leading_spc = params.escape;
|
||||||
|
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
|
||||||
|
params.input_suffix.erase(0, 1);
|
||||||
|
suff_rm_leading_spc = false;
|
||||||
|
}
|
||||||
// tokenize new prefix and suffix
|
// tokenize new prefix and suffix
|
||||||
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
|
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
|
||||||
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
|
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
|
||||||
|
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
|
||||||
|
inp_sfx.erase(inp_sfx.begin());
|
||||||
|
}
|
||||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
|
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
|
||||||
|
if (add_bos) {
|
||||||
|
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
|
||||||
|
}
|
||||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
|
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
|
||||||
embd_inp = inp_pfx;
|
embd_inp = inp_pfx;
|
||||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||||
|
@ -644,7 +667,7 @@ int main(int argc, char ** argv) {
|
||||||
is_interacting = false;
|
is_interacting = false;
|
||||||
}
|
}
|
||||||
// deal with end of text token in interactive mode
|
// deal with end of text token in interactive mode
|
||||||
else if (last_tokens.back() == llama_token_eos(ctx)) {
|
else if (ctx_sampling->prev.back() == llama_token_eos(ctx)) {
|
||||||
LOG("found EOS token\n");
|
LOG("found EOS token\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
|
@ -696,7 +719,7 @@ int main(int argc, char ** argv) {
|
||||||
const size_t original_size = embd_inp.size();
|
const size_t original_size = embd_inp.size();
|
||||||
|
|
||||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
||||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
|
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
||||||
|
|
||||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||||
|
|
||||||
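The infill diff above (and the matching `main` diff later in this commit) replaces the old `last_tokens`/`candidates` bookkeeping with the new `llama_sampling_*` helpers. Below is a minimal sketch of that pattern, built only from the calls that actually appear in the diff (`llama_sampling_init`, `llama_sampling_sample`, `llama_sampling_accept`, `llama_batch_get_one`, `llama_decode`); the function name `generate_sketch` is hypothetical, prompt ingestion and error handling are omitted, and it assumes `common.h` pulls in the new sampling helpers as the examples do.

```cpp
// Sketch only: the generation loop shape used by the updated examples.
#include "common.h"
#include "llama.h"

#include <cstdio>

static void generate_sketch(llama_context * ctx, llama_context * ctx_guidance, gpt_params & params, int n_past, int n_remain) {
    // replaces the manual last_tokens vector + candidates buffer
    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);

    while (n_remain-- > 0) {
        // sample the next token (ctx_guidance may be nullptr when CFG is not used)
        llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);

        // push the token into the sampler's history so repetition penalties see it
        llama_sampling_accept(ctx_sampling, ctx, id);

        if (id == llama_token_eos(ctx)) {
            break;
        }
        printf("%s", llama_token_to_piece(ctx, id).c_str());

        // feed the sampled token back for the next decode step
        if (llama_decode(ctx, llama_batch_get_one(&id, 1, n_past, 0))) {
            break;
        }
        n_past += 1;
    }
}
```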
20 examples/llava/CMakeLists.txt Normal file
@@ -0,0 +1,20 @@
set(TARGET clip)
add_library(${TARGET} clip.cpp clip.h)
install(TARGETS ${TARGET} LIBRARY)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if (NOT MSVC)
    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
endif()
if(TARGET BUILD_INFO)
    add_dependencies(${TARGET} BUILD_INFO)
endif()

set(TARGET llava)
add_executable(${TARGET} llava.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
    add_dependencies(${TARGET} BUILD_INFO)
endif()
57 examples/llava/README.md Normal file
@@ -0,0 +1,57 @@
# LLaVA

Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.

The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
models are available.

After the API is confirmed, more models will be supported / uploaded.

## Usage
Build with cmake or run `make llava` to build it.

After building, run `./llava` to see the usage. For example:

```sh
./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
```

**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.

## Model conversion

1. Clone `llava-v1.5-7b` and `clip-vit-large-patch14-336` locally:

```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b

git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```

2. Use `llava-surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:

```sh
python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
```

3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:

```sh
python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```

4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:

```sh
python ./convert.py ../llava-v1.5-7b
```

Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.

## TODO

- [ ] Support server mode.
- [ ] Support non-CPU backend for the image encoding part.
- [ ] Support different sampling methods.
- [ ] Support more model variants.
1064 examples/llava/clip.cpp Normal file
(File diff suppressed because it is too large)
73 examples/llava/clip.h Normal file
@@ -0,0 +1,73 @@
#ifndef CLIP_H
#define CLIP_H

#include "ggml.h"

struct clip_ctx;

#ifdef __cplusplus
extern "C" {
#endif

struct clip_vision_hparams {
    int32_t image_size;
    int32_t patch_size;
    int32_t hidden_size;
    int32_t n_intermediate;
    int32_t projection_dim;
    int32_t n_head;
    int32_t n_layer;
    float eps;
};

struct clip_ctx * clip_model_load(const char * fname, const int verbosity);

void clip_free(struct clip_ctx * ctx);

size_t clip_embd_nbytes(struct clip_ctx * ctx);
int clip_n_patches(struct clip_ctx * ctx);
int clip_n_mmproj_embd(struct clip_ctx * ctx);

// RGB uint8 image
struct clip_image_u8 {
    int nx;
    int ny;
    uint8_t * data;
    size_t size;
};

// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
    int nx;
    int ny;
    float * data;
    size_t size;
};

struct clip_image_u8_batch {
    struct clip_image_u8 * data;
    size_t size;
};

struct clip_image_f32_batch {
    struct clip_image_f32 * data;
    size_t size;
};

struct clip_image_u8 * make_clip_image_u8();
struct clip_image_f32 * make_clip_image_f32();
bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);

bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
                             float * vec);

bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);

#ifdef __cplusplus
}
#endif

#endif // CLIP_H
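To make the new header easier to follow, here is a minimal sketch of how these entry points are meant to be chained; it mirrors what `llava.cpp` further down in this commit does in full. The model/image paths and the thread count are placeholders, and error handling for the encode step is reduced to a message.

```cpp
// Sketch only: drive the clip API declared above (see llava.cpp later in this commit for the real thing).
#include "clip.h"

#include <cstdio>
#include <cstdlib>

int main() {
    struct clip_ctx * ctx_clip = clip_model_load("mmproj-model-f16.gguf", /*verbosity=*/ 1); // placeholder path

    clip_image_u8  img;     // raw RGB image
    clip_image_f32 img_res; // preprocessed (resized/normalized) image

    if (!clip_image_load_from_file("image.jpg", &img) ||                      // placeholder path
        !clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square=*/ true)) {
        clip_free(ctx_clip);
        return 1;
    }

    // one embedding of size clip_n_mmproj_embd() per image patch
    float * image_embd = (float *) malloc(clip_embd_nbytes(ctx_clip));
    if (!clip_image_encode(ctx_clip, /*n_threads=*/ 4, &img_res, image_embd)) { // placeholder thread count
        fprintf(stderr, "failed to encode image\n");
    }

    printf("encoded %d patches of dim %d\n", clip_n_patches(ctx_clip), clip_n_mmproj_embd(ctx_clip));

    free(image_embd);
    clip_free(ctx_clip);
    return 0;
}
```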
250 examples/llava/convert-image-encoder-to-gguf.py Normal file
@@ -0,0 +1,250 @@
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from gguf import *
|
||||||
|
from transformers import CLIPModel, CLIPProcessor
|
||||||
|
|
||||||
|
TEXT = "clip.text"
|
||||||
|
VISION = "clip.vision"
|
||||||
|
|
||||||
|
|
||||||
|
def k(raw_key: str, arch: str) -> str:
|
||||||
|
return raw_key.format(arch=arch)
|
||||||
|
|
||||||
|
|
||||||
|
def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
|
||||||
|
if name in (
|
||||||
|
"logit_scale",
|
||||||
|
"text_model.embeddings.position_ids",
|
||||||
|
"vision_model.embeddings.position_ids",
|
||||||
|
):
|
||||||
|
return True
|
||||||
|
|
||||||
|
if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
|
||||||
|
return True
|
||||||
|
|
||||||
|
if name.startswith("v") and not has_vision:
|
||||||
|
return True
|
||||||
|
|
||||||
|
if name.startswith("t") and not has_text:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def get_tensor_name(name: str) -> str:
|
||||||
|
if "projection" in name:
|
||||||
|
return name
|
||||||
|
|
||||||
|
if "mm_projector" in name:
|
||||||
|
return name.replace("model.mm_projector", "mm")
|
||||||
|
|
||||||
|
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
|
||||||
|
|
||||||
|
|
||||||
|
def bytes_to_unicode():
|
||||||
|
"""
|
||||||
|
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||||
|
The reversible bpe codes work on unicode strings.
|
||||||
|
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||||
|
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||||
|
This is a signficant percentage of your normal, say, 32K bpe vocab.
|
||||||
|
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||||
|
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||||
|
"""
|
||||||
|
bs = (
|
||||||
|
list(range(ord("!"), ord("~") + 1))
|
||||||
|
+ list(range(ord("¡"), ord("¬") + 1))
|
||||||
|
+ list(range(ord("®"), ord("ÿ") + 1))
|
||||||
|
)
|
||||||
|
cs = bs[:]
|
||||||
|
n = 0
|
||||||
|
for b in range(2**8):
|
||||||
|
if b not in bs:
|
||||||
|
bs.append(b)
|
||||||
|
cs.append(2**8 + n)
|
||||||
|
n += 1
|
||||||
|
cs = [chr(n) for n in cs]
|
||||||
|
return dict(zip(bs, cs))
|
||||||
|
|
||||||
|
|
||||||
|
ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py")
|
||||||
|
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
|
||||||
|
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
|
||||||
|
ap.add_argument("--text-only", action="store_true", required=False,
|
||||||
|
help="Save a text-only model. It can't be used to encode images")
|
||||||
|
ap.add_argument("--vision-only", action="store_true", required=False,
|
||||||
|
help="Save a vision-only model. It can't be used to encode texts")
|
||||||
|
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
|
||||||
|
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
|
||||||
|
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
|
||||||
|
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
||||||
|
|
||||||
|
args = ap.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if args.text_only and args.vision_only:
|
||||||
|
print("--text-only and --image-only arguments cannot be specified at the same time.")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
if args.use_f32:
|
||||||
|
print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
|
||||||
|
|
||||||
|
# output in the same directory as the model if output_dir is None
|
||||||
|
dir_model = args.model_dir
|
||||||
|
|
||||||
|
|
||||||
|
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
|
||||||
|
vocab = json.load(f)
|
||||||
|
tokens = [key for key in vocab]
|
||||||
|
|
||||||
|
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||||
|
config = json.load(f)
|
||||||
|
v_hparams = config["vision_config"]
|
||||||
|
t_hparams = config["text_config"]
|
||||||
|
|
||||||
|
# possible data types
|
||||||
|
# ftype == 0 -> float32
|
||||||
|
# ftype == 1 -> float16
|
||||||
|
#
|
||||||
|
# map from ftype to string
|
||||||
|
ftype_str = ["f32", "f16"]
|
||||||
|
|
||||||
|
ftype = 1
|
||||||
|
if args.use_f32:
|
||||||
|
ftype = 0
|
||||||
|
|
||||||
|
|
||||||
|
model = CLIPModel.from_pretrained(dir_model)
|
||||||
|
processor = CLIPProcessor.from_pretrained(dir_model)
|
||||||
|
|
||||||
|
fname_middle = None
|
||||||
|
has_text_encoder = True
|
||||||
|
has_vision_encoder = True
|
||||||
|
has_llava_projector = False
|
||||||
|
if args.text_only:
|
||||||
|
fname_middle = "text-"
|
||||||
|
has_vision_encoder = False
|
||||||
|
elif args.vision_only:
|
||||||
|
fname_middle = "vision-"
|
||||||
|
has_text_encoder = False
|
||||||
|
elif args.llava_projector is not None:
|
||||||
|
fname_middle = "mmproj-"
|
||||||
|
has_text_encoder = False
|
||||||
|
has_llava_projector = True
|
||||||
|
else:
|
||||||
|
fname_middle = ""
|
||||||
|
|
||||||
|
output_dir = args.output_dir if args.output_dir is not None else dir_model
|
||||||
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
|
||||||
|
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
|
||||||
|
fout = GGUFWriter(path=fname_out, arch="clip")
|
||||||
|
|
||||||
|
fout.add_bool("clip.has_text_encoder", has_text_encoder)
|
||||||
|
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
|
||||||
|
fout.add_bool("clip.has_llava_projector", has_llava_projector)
|
||||||
|
fout.add_file_type(ftype)
|
||||||
|
model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
|
||||||
|
fout.add_name(model_name)
|
||||||
|
if args.text_only:
|
||||||
|
fout.add_description("text-only CLIP model")
|
||||||
|
elif args.vision_only and not has_llava_projector:
|
||||||
|
fout.add_description("vision-only CLIP model")
|
||||||
|
elif has_llava_projector:
|
||||||
|
fout.add_description("image encoder for LLaVA")
|
||||||
|
else:
|
||||||
|
fout.add_description("two-tower CLIP model")
|
||||||
|
|
||||||
|
if has_text_encoder:
|
||||||
|
# text_model hparams
|
||||||
|
fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
|
||||||
|
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
|
||||||
|
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
|
||||||
|
fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
|
||||||
|
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
|
||||||
|
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
|
||||||
|
fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
|
||||||
|
fout.add_token_list(tokens)
|
||||||
|
|
||||||
|
if has_vision_encoder:
|
||||||
|
# vision_model hparams
|
||||||
|
fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
|
||||||
|
fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
|
||||||
|
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
|
||||||
|
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
|
||||||
|
fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
|
||||||
|
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
|
||||||
|
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
|
||||||
|
block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
|
||||||
|
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
|
||||||
|
|
||||||
|
image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
|
||||||
|
image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
|
||||||
|
fout.add_array("clip.vision.image_mean", image_mean)
|
||||||
|
fout.add_array("clip.vision.image_std", image_std)
|
||||||
|
|
||||||
|
use_gelu = v_hparams["hidden_act"] == "gelu"
|
||||||
|
fout.add_bool("clip.use_gelu", use_gelu)
|
||||||
|
|
||||||
|
|
||||||
|
if has_llava_projector:
|
||||||
|
model.vision_model.encoder.layers.pop(-1)
|
||||||
|
projector = torch.load(args.llava_projector)
|
||||||
|
for name, data in projector.items():
|
||||||
|
name = get_tensor_name(name)
|
||||||
|
if data.ndim == 2:
|
||||||
|
data = data.squeeze().numpy().astype(np.float16)
|
||||||
|
else:
|
||||||
|
data = data.squeeze().numpy().astype(np.float32)
|
||||||
|
|
||||||
|
fout.add_tensor(name, data)
|
||||||
|
|
||||||
|
print("Projector tensors added\n")
|
||||||
|
|
||||||
|
state_dict = model.state_dict()
|
||||||
|
for name, data in state_dict.items():
|
||||||
|
if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
|
||||||
|
# we don't need this
|
||||||
|
print(f"skipping parameter: {name}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
name = get_tensor_name(name)
|
||||||
|
data = data.squeeze().numpy()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
|
||||||
|
# ftype == 0 -> float32, ftype == 1 -> float16
|
||||||
|
ftype_cur = 0
|
||||||
|
if n_dims == 4:
|
||||||
|
print(f"tensor {name} is always saved in f16")
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
ftype_cur = 1
|
||||||
|
elif ftype == 1:
|
||||||
|
if name[-7:] == ".weight" and n_dims == 2:
|
||||||
|
print(" Converting to float16")
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
ftype_cur = 1
|
||||||
|
else:
|
||||||
|
print(" Converting to float32")
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
ftype_cur = 0
|
||||||
|
else:
|
||||||
|
if data.dtype != np.float32:
|
||||||
|
print(" Converting to float32")
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
ftype_cur = 0
|
||||||
|
|
||||||
|
print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
|
||||||
|
fout.add_tensor(name, data)
|
||||||
|
|
||||||
|
|
||||||
|
fout.write_header_to_file()
|
||||||
|
fout.write_kv_data_to_file()
|
||||||
|
fout.write_tensors_to_file()
|
||||||
|
fout.close()
|
||||||
|
|
||||||
|
print("Done. Output file: " + fname_out)
|
46 examples/llava/llava-surgery.py Normal file
@@ -0,0 +1,46 @@
import argparse
import glob
import os
import torch


ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model")
args = ap.parse_args()

# find the model part that includes the multimodal projector weights
path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1]
checkpoint = torch.load(path)

# get a list of mm tensor names
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]

# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
torch.save(projector, f"{args.model}/llava.projector")

# remove these tensors from the checkpoint and save it again
for name in mm_tensors:
    del checkpoint[name]

# BakLLaVA models contain CLIP tensors in the checkpoint as well
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
if len(clip_tensors) > 0:
    clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/llava.clip")

    # remove these tensors
    for name in clip_tensors:
        del checkpoint[name]

    # added tokens should be removed to be able to convert Mistral models
    if os.path.exists(f"{args.model}/added_tokens.json"):
        with open(f"{args.model}/added_tokens.json", "w") as f:
            f.write("{}\n")


torch.save(checkpoint, path)

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
145 examples/llava/llava-utils.h Normal file
@@ -0,0 +1,145 @@
#pragma once

// this one and clip lib will be eventually merged to a single lib, let's keep it this way for now

#include "common.h"
#include "llama.h"

#include <cstdio>
#include <cstdlib>
#include <vector>

inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
    int n_embd = llama_n_embd(llama_get_model(ctx_llama));

    for (int i = 0; i < N; i += n_batch) {
        int n_eval = N - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
}

inline bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
    int N = (int) tokens.size();
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
}

inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    return eval_tokens(ctx_llama, tokens, 1, n_past);
}

inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
    std::string str2 = str;
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
    return true;
}

// TODO: use common/sampling.h
inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
    // out of user input, sample next token
    const float temp = params.sampling_params.temp;
    const int32_t top_k = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
    const float top_p = params.sampling_params.top_p;
    const float tfs_z = params.sampling_params.tfs_z;
    const float typical_p = params.sampling_params.typical_p;
    // const int32_t repeat_last_n = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n;
    // const float repeat_penalty = params.sampling_params.repeat_penalty;
    // const float alpha_presence = params.sampling_params.presence_penalty;
    // const float alpha_frequency = params.sampling_params.frequency_penalty;
    const int mirostat = params.sampling_params.mirostat;
    const float mirostat_tau = params.sampling_params.mirostat_tau;
    const float mirostat_eta = params.sampling_params.mirostat_eta;
    // const bool penalize_nl = params.sampling_params.penalize_nl;

    llama_token id = 0;
    {
        auto logits = llama_get_logits(ctx_llama);
        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));

        // Apply params.logit_bias map
        for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) {
            logits[it->first] += it->second;
        }

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // TODO: Apply penalties
        // float nl_logit = logits[llama_token_nl(ctx)];
        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
        // llama_sample_repetition_penalty(ctx, &candidates_p,
        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
        //     last_n_repeat, repeat_penalty);
        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
        //     last_n_repeat, alpha_frequency, alpha_presence);
        // if (!penalize_nl) {
        //     logits[llama_token_nl(ctx)] = nl_logit;
        // }

        if (temp <= 0) {
            // Greedy sampling
            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
        } else {
            if (mirostat == 1) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                const int mirostat_m = 100;
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
            } else if (mirostat == 2) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
            } else {
                // Temperature sampling
                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
                llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
                llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token(ctx_llama, &candidates_p);
            }
        }
    }

    return id;
}

inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
    int id = sample_id(ctx_llama, params);
    static std::string ret;
    if (id == llama_token_eos(ctx_llama)) {
        ret = "</s>";
    } else {
        ret = llama_token_to_piece(ctx_llama, id);
    }
    eval_id(ctx_llama, id, n_past);
    return ret.c_str();
}
164 examples/llava/llava.cpp Normal file
@@ -0,0 +1,164 @@
#include "clip.h"
#include "llava-utils.h"
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <cstdlib>
#include <vector>

static void show_additional_info(int /*argc*/, char ** argv) {
    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}

int main(int argc, char ** argv) {
    ggml_time_init();

    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        show_additional_info(argc, argv);
        return 1;
    }

    if (params.mmproj.empty() || params.image.empty()) {
        gpt_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
    }

    const char * clip_path = params.mmproj.c_str();
    const char * img_path = params.image.c_str();

    if (params.prompt.empty()) {
        params.prompt = "describe the image in detail.";
    }

    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);

    // load and preprocess the image
    clip_image_u8 img;
    clip_image_f32 img_res;

    if (!clip_image_load_from_file(img_path, &img)) {
        fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);

        clip_free(ctx_clip);
        return 1;
    }

    if (!clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square =*/ true)) {
        fprintf(stderr, "%s: unable to preprocess %s\n", __func__, img_path);

        clip_free(ctx_clip);
        return 1;
    }

    int n_img_pos = clip_n_patches(ctx_clip);
    int n_img_embd = clip_n_mmproj_embd(ctx_clip);

    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));

    if (!image_embd) {
        fprintf(stderr, "Unable to allocate memory for image embeddings\n");

        return 1;
    }

    const int64_t t_img_enc_start_us = ggml_time_us();
    if (!clip_image_encode(ctx_clip, params.n_threads, &img_res, image_embd)) {
        fprintf(stderr, "Unable to encode image\n");

        return 1;
    }
    const int64_t t_img_enc_end_us = ggml_time_us();

    // we get the embeddings, free up the memory required for CLIP
    clip_free(ctx_clip);

    llama_backend_init(params.numa);

    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = params.n_gpu_layers;
    model_params.main_gpu = params.main_gpu;
    model_params.tensor_split = params.tensor_split;
    model_params.use_mmap = params.use_mmap;
    model_params.use_mlock = params.use_mlock;

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.n_ctx = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
    ctx_params.n_threads = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    ctx_params.seed = params.seed;

    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

    // make sure that the correct mmproj was used, i.e., compare apples to apples
    const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));

    if (n_img_embd != n_llama_embd) {
        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);

        llama_free(ctx_llama);
        llama_free_model(model);
        llama_backend_free();
        free(image_embd);

        return 1;
    }

    // process the prompt
    // llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"

    int n_past = 0;

    const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;

    eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true);
    eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
    eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false);

    // generate the response

    printf("\n");
    printf("prompt: '%s'\n", params.prompt.c_str());
    printf("\n");

    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_llama, params, &n_past);
        if (strcmp(tmp, "</s>") == 0) break;

        printf("%s", tmp);
        fflush(stdout);
    }

    printf("\n");

    {
        const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

        printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
    }

    llama_print_timings(ctx_llama);

    llama_free(ctx_llama);
    llama_free_model(model);
    llama_backend_free();
    free(image_embd);

    return 0;
}
|
@ -3,7 +3,6 @@
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "build-info.h"
|
#include "build-info.h"
|
||||||
#include "grammar-parser.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
|
@ -109,6 +108,7 @@ int main(int argc, char ** argv) {
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
llama_sampling_params & sparams = params.sampling_params;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
log_set_target(log_filename_generator("main", "log"));
|
log_set_target(log_filename_generator("main", "log"));
|
||||||
|
@ -179,7 +179,7 @@ int main(int argc, char ** argv) {
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
if (params.cfg_scale > 1.f) {
|
if (sparams.cfg_scale > 1.f) {
|
||||||
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
||||||
ctx_guidance = llama_new_context_with_model(model, lparams);
|
ctx_guidance = llama_new_context_with_model(model, lparams);
|
||||||
}
|
}
|
||||||
|
@ -237,19 +237,19 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
||||||
LOG("tokenize the prompt\n");
|
LOG("tokenize the prompt\n");
|
||||||
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||||
} else {
|
} else {
|
||||||
LOG("use session tokens\n");
|
LOG("use session tokens\n");
|
||||||
embd_inp = session_tokens;
|
embd_inp = session_tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
|
LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
|
||||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
|
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||||
|
|
||||||
// Should not run without any tokens
|
// Should not run without any tokens
|
||||||
if (embd_inp.empty()) {
|
if (embd_inp.empty()) {
|
||||||
embd_inp.push_back(llama_token_bos(ctx));
|
embd_inp.push_back(llama_token_bos(ctx));
|
||||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
|
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tokenize negative prompt
|
// Tokenize negative prompt
|
||||||
|
@ -257,13 +257,13 @@ int main(int argc, char ** argv) {
|
||||||
int guidance_offset = 0;
|
int guidance_offset = 0;
|
||||||
int original_prompt_len = 0;
|
int original_prompt_len = 0;
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
|
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||||
|
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
|
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
|
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||||
|
|
||||||
original_prompt_len = original_inp.size();
|
original_prompt_len = original_inp.size();
|
||||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
||||||
|
@ -296,6 +296,9 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
|
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
|
||||||
__func__, n_matching_session_tokens, embd_inp.size());
|
__func__, n_matching_session_tokens, embd_inp.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// remove any "future" tokens that we might have inherited from the previous session
|
||||||
|
llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGLN(
|
LOGLN(
|
||||||
|
@ -316,11 +319,11 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// prefix & suffix for instruct mode
|
// prefix & suffix for instruct mode
|
||||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
|
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
|
||||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
|
||||||
|
|
||||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
|
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
||||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
|
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
||||||
|
|
||||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||||
if (params.instruct) {
|
if (params.instruct) {
|
||||||
|
@ -343,7 +346,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
||||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||||
|
@ -379,6 +382,12 @@ int main(int argc, char ** argv) {
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
for (const auto & antiprompt : params.antiprompt) {
|
for (const auto & antiprompt : params.antiprompt) {
|
||||||
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
|
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||||
|
if (params.verbose_prompt) {
|
||||||
|
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
|
||||||
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
|
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -388,46 +397,29 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if (!params.input_prefix.empty()) {
|
if (!params.input_prefix.empty()) {
|
||||||
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||||
|
if (params.verbose_prompt) {
|
||||||
|
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
|
||||||
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
|
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.input_suffix.empty()) {
|
if (!params.input_suffix.empty()) {
|
||||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||||
|
if (params.verbose_prompt) {
|
||||||
|
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||||
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
|
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||||
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
|
||||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||||
LOG_TEE("\n\n");
|
LOG_TEE("\n\n");
|
||||||
|
|
||||||
struct llama_grammar * grammar = NULL;
|
|
||||||
grammar_parser::parse_state parsed_grammar;
|
|
||||||
|
|
||||||
if (!params.grammar.empty()) {
|
|
||||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
|
||||||
// will be empty (default) if there are parse errors
|
|
||||||
if (parsed_grammar.rules.empty()) {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
LOG_TEE("%s: grammar:\n", __func__);
|
|
||||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
|
||||||
LOG_TEE("\n");
|
|
||||||
|
|
||||||
{
|
|
||||||
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
|
||||||
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
|
||||||
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
|
||||||
grammar = llama_grammar_init(
|
|
||||||
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: replace with ring-buffer
|
|
||||||
std::vector<llama_token> last_tokens(n_ctx);
|
|
||||||
std::fill(last_tokens.begin(), last_tokens.end(), 0);
|
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
const char *control_message;
|
const char *control_message;
|
||||||
if (params.multiline_input) {
|
if (params.multiline_input) {
|
||||||
|
@ -467,10 +459,7 @@ int main(int argc, char ** argv) {
|
||||||
std::vector<llama_token> embd;
|
std::vector<llama_token> embd;
|
||||||
std::vector<llama_token> embd_guidance;
|
std::vector<llama_token> embd_guidance;
|
||||||
|
|
||||||
const int n_vocab = llama_n_vocab(model);
|
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||||
|
|
||||||
std::vector<llama_token_data> candidates;
|
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
|
|
||||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||||
// predict
|
// predict
|
||||||
|
@ -517,7 +506,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
|
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
|
||||||
|
|
||||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
|
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||||
|
|
||||||
LOG("clear session path\n");
|
LOG("clear session path\n");
|
||||||
path_session.clear();
|
path_session.clear();
|
||||||
|
@ -543,14 +532,10 @@ int main(int argc, char ** argv) {
|
||||||
if (i > 0) {
|
if (i > 0) {
|
||||||
embd.erase(embd.begin(), embd.begin() + i);
|
embd.erase(embd.begin(), embd.begin() + i);
|
||||||
}
|
}
|
||||||
|
|
||||||
// remove any "future" tokens that we might have inherited from the session from the KV cache
|
|
||||||
llama_kv_cache_tokens_rm(ctx, n_past, -1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// evaluate tokens in batches
|
// evaluate tokens in batches
|
||||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
// embd is typically prepared beforehand to fit within a batch, but not always
|
||||||
|
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
int input_size = 0;
|
int input_size = 0;
|
||||||
llama_token * input_buf = NULL;
|
llama_token * input_buf = NULL;
|
||||||
|
@ -572,7 +557,7 @@ int main(int argc, char ** argv) {
|
||||||
input_buf = embd_guidance.data();
|
input_buf = embd_guidance.data();
|
||||||
input_size = embd_guidance.size();
|
input_size = embd_guidance.size();
|
||||||
|
|
||||||
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
|
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
|
||||||
} else {
|
} else {
|
||||||
input_buf = embd.data();
|
input_buf = embd.data();
|
||||||
input_size = embd.size();
|
input_size = embd.size();
|
||||||
|
@ -595,7 +580,7 @@ int main(int argc, char ** argv) {
|
||||||
n_eval = params.n_batch;
|
n_eval = params.n_batch;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
|
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
LOG_TEE("%s : failed to eval\n", __func__);
|
||||||
|
@ -625,12 +610,11 @@ int main(int argc, char ** argv) {
|
||||||
LOG("saved session to %s\n", path_session.c_str());
|
LOG("saved session to %s\n", path_session.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
|
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||||
|
|
||||||
last_tokens.erase(last_tokens.begin());
|
llama_sampling_accept(ctx_sampling, ctx, id);
|
||||||
last_tokens.push_back(id);
|
|
||||||
|
|
||||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
|
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||||
|
|
||||||
embd.push_back(id);
|
embd.push_back(id);
|
||||||
|
|
||||||
|
@ -646,8 +630,14 @@ int main(int argc, char ** argv) {
|
||||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||||
while ((int) embd_inp.size() > n_consumed) {
|
while ((int) embd_inp.size() > n_consumed) {
|
||||||
embd.push_back(embd_inp[n_consumed]);
|
embd.push_back(embd_inp[n_consumed]);
|
||||||
last_tokens.erase(last_tokens.begin());
|
|
||||||
last_tokens.push_back(embd_inp[n_consumed]);
|
// GG: I'm not sure it's a good idea to push the prompt tokens into the sampling context
|
||||||
|
// Most likely will remove this in the future to avoid exposing "prev"
|
||||||
|
// Same thing is done in "server". If we stop pushing the prompt tokens, then the repetition
|
||||||
|
// penalty will be applied only based on the tokens generated by the model.
|
||||||
|
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||||
|
ctx_sampling->prev.push_back(embd_inp[n_consumed]);
|
||||||
|
|
||||||
++n_consumed;
|
++n_consumed;
|
||||||
if ((int) embd.size() >= params.n_batch) {
|
if ((int) embd.size() >= params.n_batch) {
|
||||||
break;
|
break;
|
||||||
|
@ -680,7 +670,7 @@ int main(int argc, char ** argv) {
|
||||||
// check for reverse prompt
|
// check for reverse prompt
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
std::string last_output;
|
std::string last_output;
|
||||||
for (auto id : last_tokens) {
|
for (auto id : ctx_sampling->prev) {
|
||||||
last_output += llama_token_to_piece(ctx, id);
|
last_output += llama_token_to_piece(ctx, id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -709,13 +699,13 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// deal with end of text token in interactive mode
|
// deal with end of text token in interactive mode
|
||||||
if (last_tokens.back() == llama_token_eos(ctx)) {
|
if (ctx_sampling->prev.back() == llama_token_eos(ctx)) {
|
||||||
LOG("found EOS token\n");
|
LOG("found EOS token\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
// tokenize and inject first reverse prompt
|
// tokenize and inject first reverse prompt
|
||||||
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
|
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
|
||||||
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
|
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
|
||||||
is_antiprompt = true;
|
is_antiprompt = true;
|
||||||
}
|
}
|
||||||
|
@ -742,8 +732,7 @@ int main(int argc, char ** argv) {
|
||||||
std::string buffer;
|
std::string buffer;
|
||||||
if (!params.input_prefix.empty()) {
|
if (!params.input_prefix.empty()) {
|
||||||
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||||
buffer += params.input_prefix;
|
printf("%s", params.input_prefix.c_str());
|
||||||
printf("%s", buffer.c_str());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// color user input only
|
// color user input only
|
||||||
|
@ -765,7 +754,6 @@ int main(int argc, char ** argv) {
|
||||||
// append input suffix if any
|
// append input suffix if any
|
||||||
if (!params.input_suffix.empty()) {
|
if (!params.input_suffix.empty()) {
|
||||||
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||||
buffer += params.input_suffix;
|
|
||||||
printf("%s", params.input_suffix.c_str());
|
printf("%s", params.input_suffix.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -780,10 +768,14 @@ int main(int argc, char ** argv) {
|
||||||
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
||||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
|
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
|
||||||
|
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||||
|
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
||||||
|
|
||||||
|
embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
|
||||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||||
|
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
|
||||||
|
|
||||||
// instruct mode: insert response suffix
|
// instruct mode: insert response suffix
|
||||||
if (params.instruct) {
|
if (params.instruct) {
|
||||||
|
@ -808,15 +800,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if (n_past > 0) {
|
if (n_past > 0) {
|
||||||
if (is_interacting) {
|
if (is_interacting) {
|
||||||
// reset grammar state if we're restarting generation
|
llama_sampling_reset(ctx_sampling);
|
||||||
if (grammar != NULL) {
|
|
||||||
llama_grammar_free(grammar);
|
|
||||||
|
|
||||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
|
||||||
grammar = llama_grammar_init(
|
|
||||||
grammar_rules.data(), grammar_rules.size(),
|
|
||||||
parsed_grammar.symbol_ids.at("root"));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
is_interacting = false;
|
is_interacting = false;
|
||||||
}
|
}
|
||||||
|
@ -848,9 +832,7 @@ int main(int argc, char ** argv) {
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
if (grammar != NULL) {
|
llama_sampling_free(ctx_sampling);
|
||||||
llama_grammar_free(grammar);
|
|
||||||
}
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
|
|
|
@ -51,6 +51,12 @@ static std::vector<std::string> k_prompts = {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct client {
|
struct client {
|
||||||
|
~client() {
|
||||||
|
if (ctx_sampling) {
|
||||||
|
llama_sampling_free(ctx_sampling);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int32_t id = 0;
|
int32_t id = 0;
|
||||||
|
|
||||||
llama_seq_id seq_id = -1;
|
llama_seq_id seq_id = -1;
|
||||||
|
@ -68,7 +74,7 @@ struct client {
|
||||||
std::string prompt;
|
std::string prompt;
|
||||||
std::string response;
|
std::string response;
|
||||||
|
|
||||||
std::vector<llama_token> tokens_prev;
|
struct llama_sampling_context * ctx_sampling = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void print_date_time() {
|
static void print_date_time() {
|
||||||
|
@ -145,20 +151,15 @@ int main(int argc, char ** argv) {
|
||||||
fprintf(stderr, "\n\n");
|
fprintf(stderr, "\n\n");
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
const int n_vocab = llama_n_vocab(model);
|
|
||||||
|
|
||||||
std::vector<client> clients(n_clients);
|
std::vector<client> clients(n_clients);
|
||||||
for (size_t i = 0; i < clients.size(); ++i) {
|
for (size_t i = 0; i < clients.size(); ++i) {
|
||||||
auto & client = clients[i];
|
auto & client = clients[i];
|
||||||
client.id = i;
|
client.id = i;
|
||||||
client.tokens_prev.resize(std::max(256, params.n_predict));
|
client.ctx_sampling = llama_sampling_init(params);
|
||||||
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token_data> candidates;
|
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
|
|
||||||
std::vector<llama_token> tokens_system;
|
std::vector<llama_token> tokens_system;
|
||||||
tokens_system = ::llama_tokenize(ctx, k_system, true);
|
tokens_system = ::llama_tokenize(ctx, k_system, true);
|
||||||
const int32_t n_tokens_system = tokens_system.size();
|
const int32_t n_tokens_system = tokens_system.size();
|
||||||
|
@ -167,7 +168,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
|
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
|
||||||
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
|
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
|
||||||
llama_batch batch = llama_batch_init(params.n_ctx, 0);
|
llama_batch batch = llama_batch_init(n_ctx, 0, 1);
|
||||||
|
|
||||||
int32_t n_total_prompt = 0;
|
int32_t n_total_prompt = 0;
|
||||||
int32_t n_total_gen = 0;
|
int32_t n_total_gen = 0;
|
||||||
|
@ -182,13 +183,8 @@ int main(int argc, char ** argv) {
|
||||||
{
|
{
|
||||||
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
|
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
|
||||||
|
|
||||||
batch.n_tokens = n_tokens_system;
|
for (int32_t i = 0; i < n_tokens_system; ++i) {
|
||||||
|
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
|
||||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
|
||||||
batch.token[i] = tokens_system[i];
|
|
||||||
batch.pos[i] = i;
|
|
||||||
batch.seq_id[i] = 0;
|
|
||||||
batch.logits[i] = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (llama_decode(ctx, batch) != 0) {
|
if (llama_decode(ctx, batch) != 0) {
|
||||||
|
@ -207,7 +203,7 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("Processing requests ...\n\n");
|
LOG_TEE("Processing requests ...\n\n");
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
batch.n_tokens = 0;
|
llama_batch_clear(batch);
|
||||||
|
|
||||||
// decode any currently ongoing sequences
|
// decode any currently ongoing sequences
|
||||||
for (auto & client : clients) {
|
for (auto & client : clients) {
|
||||||
|
@ -215,15 +211,11 @@ int main(int argc, char ** argv) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
batch.token [batch.n_tokens] = client.sampled;
|
|
||||||
batch.pos [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded;
|
|
||||||
batch.seq_id[batch.n_tokens] = client.id;
|
|
||||||
batch.logits[batch.n_tokens] = true;
|
|
||||||
|
|
||||||
client.n_decoded += 1;
|
|
||||||
client.i_batch = batch.n_tokens;
|
client.i_batch = batch.n_tokens;
|
||||||
|
|
||||||
batch.n_tokens += 1;
|
llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true);
|
||||||
|
|
||||||
|
client.n_decoded += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (batch.n_tokens == 0) {
|
if (batch.n_tokens == 0) {
|
||||||
|
@ -248,18 +240,14 @@ int main(int argc, char ** argv) {
|
||||||
client.prompt = client.input + "\nAssistant:";
|
client.prompt = client.input + "\nAssistant:";
|
||||||
client.response = "";
|
client.response = "";
|
||||||
|
|
||||||
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
|
llama_sampling_reset(client.ctx_sampling);
|
||||||
|
|
||||||
// do not prepend BOS because we have a system prompt!
|
// do not prepend BOS because we have a system prompt!
|
||||||
std::vector<llama_token> tokens_prompt;
|
std::vector<llama_token> tokens_prompt;
|
||||||
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
|
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
|
||||||
|
|
||||||
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
|
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
|
||||||
batch.token [batch.n_tokens] = tokens_prompt[i];
|
llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
|
||||||
batch.pos [batch.n_tokens] = i + n_tokens_system;
|
|
||||||
batch.seq_id[batch.n_tokens] = client.id;
|
|
||||||
batch.logits[batch.n_tokens] = false;
|
|
||||||
batch.n_tokens += 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// extract the logits only for the last token
|
// extract the logits only for the last token
|
||||||
|
@ -302,11 +290,12 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
llama_batch batch_view = {
|
llama_batch batch_view = {
|
||||||
n_tokens,
|
n_tokens,
|
||||||
batch.token + i,
|
batch.token + i,
|
||||||
nullptr,
|
nullptr,
|
||||||
batch.pos + i,
|
batch.pos + i,
|
||||||
batch.seq_id + i,
|
batch.n_seq_id + i,
|
||||||
batch.logits + i,
|
batch.seq_id + i,
|
||||||
|
batch.logits + i,
|
||||||
0, 0, 0, // unused
|
0, 0, 0, // unused
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -339,7 +328,9 @@ int main(int argc, char ** argv) {
|
||||||
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
|
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
|
||||||
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
|
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
|
||||||
|
|
||||||
const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
|
const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
|
||||||
|
|
||||||
|
llama_sampling_accept(client.ctx_sampling, ctx, id);
|
||||||
|
|
||||||
if (client.n_decoded == 1) {
|
if (client.n_decoded == 1) {
|
||||||
// start measuring generation time after the first token to make sure all concurrent clients
|
// start measuring generation time after the first token to make sure all concurrent clients
|
||||||
|
@ -347,11 +338,8 @@ int main(int argc, char ** argv) {
|
||||||
client.t_start_gen = ggml_time_us();
|
client.t_start_gen = ggml_time_us();
|
||||||
}
|
}
|
||||||
|
|
||||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
|
||||||
client.tokens_prev.erase(client.tokens_prev.begin());
|
|
||||||
client.tokens_prev.push_back(id);
|
|
||||||
|
|
||||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||||
|
|
||||||
client.response += token_str;
|
client.response += token_str;
|
||||||
client.sampled = id;
|
client.sampled = id;
|
||||||
|
|
||||||
|
|
|
@ -8,9 +8,7 @@
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
params.seed = 42;
|
|
||||||
params.n_threads = 4;
|
|
||||||
params.repeat_last_n = 64;
|
|
||||||
params.prompt = "The quick brown fox";
|
params.prompt = "The quick brown fox";
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
@ -24,56 +22,49 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
auto n_past = 0;
|
auto n_past = 0;
|
||||||
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
|
|
||||||
|
std::string result0;
|
||||||
|
std::string result1;
|
||||||
|
|
||||||
// init
|
// init
|
||||||
llama_model * model;
|
llama_model * model;
|
||||||
llama_context * ctx;
|
llama_context * ctx;
|
||||||
|
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params( params );
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
if (model == nullptr) {
|
if (model == nullptr || ctx == nullptr) {
|
||||||
return 1;
|
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||||
}
|
|
||||||
if (ctx == nullptr) {
|
|
||||||
llama_free_model(model);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// tokenize prompt
|
||||||
auto tokens = llama_tokenize(ctx, params.prompt, true);
|
auto tokens = llama_tokenize(ctx, params.prompt, true);
|
||||||
auto n_prompt_tokens = tokens.size();
|
|
||||||
if (n_prompt_tokens < 1) {
|
|
||||||
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
|
|
||||||
llama_free(ctx);
|
|
||||||
llama_free_model(model);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// evaluate prompt
|
// evaluate prompt
|
||||||
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0));
|
llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
|
||||||
|
n_past += tokens.size();
|
||||||
|
|
||||||
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
|
// save state (rng, logits, embedding and kv_cache) to file
|
||||||
n_past += n_prompt_tokens;
|
|
||||||
|
|
||||||
const size_t state_size = llama_get_state_size(ctx);
|
|
||||||
uint8_t * state_mem = new uint8_t[state_size];
|
|
||||||
|
|
||||||
// Save state (rng, logits, embedding and kv_cache) to file
|
|
||||||
{
|
{
|
||||||
FILE *fp_write = fopen("dump_state.bin", "wb");
|
std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
|
||||||
llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
|
|
||||||
fwrite(state_mem, 1, state_size, fp_write);
|
{
|
||||||
fclose(fp_write);
|
FILE *fp_write = fopen("dump_state.bin", "wb");
|
||||||
|
llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
|
||||||
|
fwrite(state_mem.data(), 1, state_mem.size(), fp_write);
|
||||||
|
fclose(fp_write);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// save state (last tokens)
|
// save state (last tokens)
|
||||||
const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
|
|
||||||
const auto n_past_saved = n_past;
|
const auto n_past_saved = n_past;
|
||||||
|
|
||||||
// first run
|
// first run
|
||||||
printf("\n%s", params.prompt.c_str());
|
printf("\nfirst run: %s", params.prompt.c_str());
|
||||||
|
|
||||||
for (auto i = 0; i < params.n_predict; i++) {
|
for (auto i = 0; i < params.n_predict; i++) {
|
||||||
auto * logits = llama_get_logits(ctx);
|
auto * logits = llama_get_logits(ctx);
|
||||||
auto n_vocab = llama_n_vocab(model);
|
auto n_vocab = llama_n_vocab(model);
|
||||||
|
|
||||||
std::vector<llama_token_data> candidates;
|
std::vector<llama_token_data> candidates;
|
||||||
candidates.reserve(n_vocab);
|
candidates.reserve(n_vocab);
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||||
|
@ -82,9 +73,10 @@ int main(int argc, char ** argv) {
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||||
auto next_token = llama_sample_token(ctx, &candidates_p);
|
auto next_token = llama_sample_token(ctx, &candidates_p);
|
||||||
auto next_token_str = llama_token_to_piece(ctx, next_token);
|
auto next_token_str = llama_token_to_piece(ctx, next_token);
|
||||||
last_n_tokens_data.push_back(next_token);
|
|
||||||
|
|
||||||
printf("%s", next_token_str.c_str());
|
printf("%s", next_token_str.c_str());
|
||||||
|
result0 += next_token_str;
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
|
||||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
@ -102,32 +94,28 @@ int main(int argc, char ** argv) {
|
||||||
// make new context
|
// make new context
|
||||||
auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
||||||
|
|
||||||
// Load state (rng, logits, embedding and kv_cache) from file
|
printf("\nsecond run: %s", params.prompt.c_str());
|
||||||
{
|
|
||||||
FILE *fp_read = fopen("dump_state.bin", "rb");
|
|
||||||
if (state_size != llama_get_state_size(ctx2)) {
|
|
||||||
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
|
|
||||||
llama_free(ctx2);
|
|
||||||
llama_free_model(model);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
const size_t ret = fread(state_mem, 1, state_size, fp_read);
|
// load state (rng, logits, embedding and kv_cache) from file
|
||||||
if (ret != state_size) {
|
{
|
||||||
|
std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
|
||||||
|
|
||||||
|
FILE * fp_read = fopen("dump_state.bin", "rb");
|
||||||
|
|
||||||
|
const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read);
|
||||||
|
if (ret != state_mem.size()) {
|
||||||
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
||||||
llama_free(ctx2);
|
llama_free(ctx2);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
|
llama_set_state_data(ctx2, state_mem.data());
|
||||||
|
|
||||||
fclose(fp_read);
|
fclose(fp_read);
|
||||||
}
|
}
|
||||||
|
|
||||||
delete[] state_mem;
|
|
||||||
|
|
||||||
// restore state (last tokens)
|
// restore state (last tokens)
|
||||||
last_n_tokens_data = last_n_tokens_data_saved;
|
|
||||||
n_past = n_past_saved;
|
n_past = n_past_saved;
|
||||||
|
|
||||||
// second run
|
// second run
|
||||||
|
@ -142,10 +130,11 @@ int main(int argc, char ** argv) {
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||||
auto next_token = llama_sample_token(ctx2, &candidates_p);
|
auto next_token = llama_sample_token(ctx2, &candidates_p);
|
||||||
auto next_token_str = llama_token_to_piece(ctx2, next_token);
|
auto next_token_str = llama_token_to_piece(ctx2, next_token);
|
||||||
last_n_tokens_data.push_back(next_token);
|
|
||||||
|
|
||||||
printf("%s", next_token_str.c_str());
|
printf("%s", next_token_str.c_str());
|
||||||
if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
|
result1 += next_token_str;
|
||||||
|
|
||||||
|
if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
|
||||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||||
llama_free(ctx2);
|
llama_free(ctx2);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
@ -154,10 +143,17 @@ int main(int argc, char ** argv) {
|
||||||
n_past += 1;
|
n_past += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("\n\n");
|
printf("\n");
|
||||||
|
|
||||||
llama_free(ctx2);
|
llama_free(ctx2);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
|
if (result0 != result1) {
|
||||||
|
fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "\n%s : success\n", __func__);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -106,25 +106,25 @@ node index.js
|
||||||
|
|
||||||
## API Endpoints
|
## API Endpoints
|
||||||
|
|
||||||
- **POST** `/completion`: Given a prompt, it returns the predicted completion.
|
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
|
||||||
|
|
||||||
*Options:*
|
*Options:*
|
||||||
|
|
||||||
|
`prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. If the prompt is a string or an array with the first element given as a string, a `bos` token is inserted in the front like `main` does.
|
||||||
|
|
||||||
`temperature`: Adjust the randomness of the generated text (default: 0.8).
|
`temperature`: Adjust the randomness of the generated text (default: 0.8).
|
||||||
|
|
||||||
`top_k`: Limit the next token selection to the K most probable tokens (default: 40).
|
`top_k`: Limit the next token selection to the K most probable tokens (default: 40).
|
||||||
|
|
||||||
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
|
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
|
||||||
|
|
||||||
`n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
|
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
|
||||||
|
|
||||||
`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
|
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
|
||||||
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
|
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt.
|
||||||
|
|
||||||
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
|
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
|
||||||
|
|
||||||
`prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.
|
|
||||||
|
|
||||||
`stop`: Specify a JSON array of stopping strings.
|
`stop`: Specify a JSON array of stopping strings.
|
||||||
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
|
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
|
||||||
|
|
||||||
|
@ -158,6 +158,36 @@ node index.js
|
||||||
|
|
||||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
||||||
|
|
||||||
|
*Result JSON:*
|
||||||
|
|
||||||
|
Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
|
||||||
|
|
||||||
|
`content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
|
||||||
|
|
||||||
|
`stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
|
||||||
|
|
||||||
|
`generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
|
||||||
|
|
||||||
|
`model`: The path to the model loaded with `-m`
|
||||||
|
|
||||||
|
`prompt`: The provided `prompt`
|
||||||
|
|
||||||
|
`stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
|
||||||
|
|
||||||
|
`stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
|
||||||
|
|
||||||
|
`stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
|
||||||
|
|
||||||
|
`stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
|
||||||
|
|
||||||
|
`timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
|
||||||
|
|
||||||
|
`tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
|
||||||
|
|
||||||
|
`tokens_evaluated`: Number of tokens evaluated in total from the prompt
|
||||||
|
|
||||||
|
`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
|
||||||
|
|
||||||
- **POST** `/tokenize`: Tokenize a given text.
|
- **POST** `/tokenize`: Tokenize a given text.
|
||||||
|
|
||||||
*Options:*
|
*Options:*
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -136,6 +136,11 @@
|
||||||
display: block;
|
display: block;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fieldset label.slim {
|
||||||
|
margin: 0 0.5em;
|
||||||
|
display: inline;
|
||||||
|
}
|
||||||
|
|
||||||
header, footer {
|
header, footer {
|
||||||
text-align: center;
|
text-align: center;
|
||||||
}
|
}
|
||||||
|
@ -145,6 +150,14 @@
|
||||||
color: #888;
|
color: #888;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.mode-chat textarea[name=prompt] {
|
||||||
|
height: 4.5em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.mode-completion textarea[name=prompt] {
|
||||||
|
height: 10em;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@keyframes loading-bg-wipe {
|
@keyframes loading-bg-wipe {
|
||||||
0% {
|
0% {
|
||||||
|
@ -187,7 +200,7 @@
|
||||||
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
|
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
|
||||||
historyTemplate: "{{name}}: {{message}}",
|
historyTemplate: "{{name}}: {{message}}",
|
||||||
transcript: [],
|
transcript: [],
|
||||||
type: "chat",
|
type: "chat", // "chat" | "completion"
|
||||||
char: "Llama",
|
char: "Llama",
|
||||||
user: "User",
|
user: "User",
|
||||||
})
|
})
|
||||||
|
@ -365,13 +378,44 @@
|
||||||
return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
|
return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function runLlama(prompt, llamaParams, char) {
|
||||||
|
const currentMessages = [];
|
||||||
|
const history = session.value.transcript;
|
||||||
|
if (controller.value) {
|
||||||
|
throw new Error("already running");
|
||||||
|
}
|
||||||
|
controller.value = new AbortController();
|
||||||
|
for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
|
||||||
|
const data = chunk.data;
|
||||||
|
|
||||||
|
if (data.stop) {
|
||||||
|
while (
|
||||||
|
currentMessages.length > 0 &&
|
||||||
|
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
|
||||||
|
) {
|
||||||
|
currentMessages.pop();
|
||||||
|
}
|
||||||
|
transcriptUpdate([...history, [char, currentMessages]])
|
||||||
|
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
|
||||||
|
} else {
|
||||||
|
currentMessages.push(data);
|
||||||
|
transcriptUpdate([...history, [char, currentMessages]])
|
||||||
|
}
|
||||||
|
|
||||||
|
if (data.timings) {
|
||||||
|
llamaStats.value = data.timings;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
controller.value = null;
|
||||||
|
}
|
||||||
|
|
||||||
// send message to server
|
// send message to server
|
||||||
const chat = async (msg) => {
|
const chat = async (msg) => {
|
||||||
if (controller.value) {
|
if (controller.value) {
|
||||||
console.log('already running...');
|
console.log('already running...');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
controller.value = new AbortController();
|
|
||||||
|
|
||||||
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
|
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
|
||||||
|
|
||||||
|
@ -391,55 +435,41 @@
|
||||||
).join("\n"),
|
).join("\n"),
|
||||||
});
|
});
|
||||||
|
|
||||||
const currentMessages = [];
|
await runLlama(prompt, {
|
||||||
const history = session.value.transcript
|
|
||||||
|
|
||||||
const llamaParams = {
|
|
||||||
...params.value,
|
...params.value,
|
||||||
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
|
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
|
||||||
|
}, "{{char}}");
|
||||||
|
}
|
||||||
|
|
||||||
|
const runCompletion = async () => {
|
||||||
|
if (controller.value) {
|
||||||
|
console.log('already running...');
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
const {prompt} = session.value;
|
||||||
|
transcriptUpdate([...session.value.transcript, ["", prompt]]);
|
||||||
|
await runLlama(prompt, {
|
||||||
|
...params.value,
|
||||||
|
stop: [],
|
||||||
|
}, "");
|
||||||
|
}
|
||||||
|
|
||||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
|
const stop = (e) => {
|
||||||
const data = chunk.data;
|
e.preventDefault();
|
||||||
|
if (controller.value) {
|
||||||
if (data.stop) {
|
controller.value.abort();
|
||||||
while (
|
controller.value = null;
|
||||||
currentMessages.length > 0 &&
|
|
||||||
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
|
|
||||||
) {
|
|
||||||
currentMessages.pop();
|
|
||||||
}
|
|
||||||
transcriptUpdate([...history, ["{{char}}", currentMessages]])
|
|
||||||
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
|
|
||||||
} else {
|
|
||||||
currentMessages.push(data);
|
|
||||||
transcriptUpdate([...history, ["{{char}}", currentMessages]])
|
|
||||||
}
|
|
||||||
|
|
||||||
if (data.timings) {
|
|
||||||
llamaStats.value = data.timings;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
controller.value = null;
|
const reset = (e) => {
|
||||||
|
stop(e);
|
||||||
|
transcriptUpdate([]);
|
||||||
}
|
}
|
||||||
|
|
||||||
function MessageInput() {
|
function MessageInput() {
|
||||||
const message = useSignal("")
|
const message = useSignal("")
|
||||||
|
|
||||||
const stop = (e) => {
|
|
||||||
e.preventDefault();
|
|
||||||
if (controller.value) {
|
|
||||||
controller.value.abort();
|
|
||||||
controller.value = null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const reset = (e) => {
|
|
||||||
stop(e);
|
|
||||||
transcriptUpdate([]);
|
|
||||||
}
|
|
||||||
|
|
||||||
const submit = (e) => {
|
const submit = (e) => {
|
||||||
stop(e);
|
stop(e);
|
||||||
chat(message.value);
|
chat(message.value);
|
||||||
|
@ -474,6 +504,19 @@
|
||||||
`
|
`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function CompletionControls() {
|
||||||
|
const submit = (e) => {
|
||||||
|
stop(e);
|
||||||
|
runCompletion();
|
||||||
|
}
|
||||||
|
return html`
|
||||||
|
<div>
|
||||||
|
<button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
|
||||||
|
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
|
||||||
|
<button onclick=${reset}>Reset</button>
|
||||||
|
</div>`;
|
||||||
|
}
|
||||||
|
|
||||||
const ChatLog = (props) => {
|
const ChatLog = (props) => {
|
||||||
const messages = session.value.transcript;
|
const messages = session.value.transcript;
|
||||||
const container = useRef(null)
|
const container = useRef(null)
|
||||||
|
@ -497,7 +540,11 @@
|
||||||
data;
|
data;
|
||||||
message = html`<${Markdownish} text=${template(text)} />`
|
message = html`<${Markdownish} text=${template(text)} />`
|
||||||
}
|
}
|
||||||
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
if(user) {
|
||||||
|
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
||||||
|
} else {
|
||||||
|
return html`<p key=${index}>${message}</p>`
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
return html`
|
return html`
|
||||||
|
@ -574,18 +621,31 @@
|
||||||
userTemplateAutosave()
|
userTemplateAutosave()
|
||||||
}, [session.value, params.value])
|
}, [session.value, params.value])
|
||||||
|
|
||||||
return html`
|
const GrammarControl = () => (
|
||||||
<form>
|
html`
|
||||||
<fieldset>
|
<div>
|
||||||
<${UserTemplateResetButton}/>
|
<label for="template">Grammar</label>
|
||||||
</fieldset>
|
<textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
||||||
|
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
||||||
|
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
||||||
|
</div>
|
||||||
|
`
|
||||||
|
);
|
||||||
|
|
||||||
<fieldset>
|
const PromptControlFieldSet = () => (
|
||||||
<div>
|
html`
|
||||||
<label for="prompt">Prompt</label>
|
<fieldset>
|
||||||
<textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
|
<div>
|
||||||
</div>
|
<label htmlFor="prompt">Prompt</label>
|
||||||
</fieldset>
|
<textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
|
||||||
|
</div>
|
||||||
|
</fieldset>
|
||||||
|
`
|
||||||
|
);
|
||||||
|
|
||||||
|
const ChatConfigForm = () => (
|
||||||
|
html`
|
||||||
|
${PromptControlFieldSet()}
|
||||||
|
|
||||||
<fieldset class="two">
|
<fieldset class="two">
|
||||||
<div>
|
<div>
|
||||||
|
@ -609,15 +669,30 @@
|
||||||
<label for="template">Chat history template</label>
|
<label for="template">Chat history template</label>
|
||||||
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
|
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
|
||||||
</div>
|
</div>
|
||||||
|
${GrammarControl()}
|
||||||
|
</fieldset>
|
||||||
|
`
|
||||||
|
);
|
||||||
|
|
||||||
|
const CompletionConfigForm = () => (
|
||||||
|
html`
|
||||||
|
${PromptControlFieldSet()}
|
||||||
|
<fieldset>${GrammarControl()}</fieldset>
|
||||||
|
`
|
||||||
|
);
|
||||||
|
|
||||||
|
return html`
|
||||||
|
<form>
|
||||||
|
<fieldset class="two">
|
||||||
|
<${UserTemplateResetButton}/>
|
||||||
<div>
|
<div>
|
||||||
<label for="template">Grammar</label>
|
<label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
|
||||||
<textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
<label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
|
||||||
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
|
||||||
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
|
||||||
</div>
|
</div>
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
|
||||||
|
${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
|
||||||
|
|
||||||
<fieldset class="two">
|
<fieldset class="two">
|
||||||
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
|
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
|
||||||
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
|
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
|
||||||
|
@ -851,7 +926,7 @@
|
||||||
function App(props) {
|
function App(props) {
|
||||||
|
|
||||||
return html`
|
return html`
|
||||||
<div>
|
<div class="mode-${session.value.type}">
|
||||||
<header>
|
<header>
|
||||||
<h1>llama.cpp</h1>
|
<h1>llama.cpp</h1>
|
||||||
</header>
|
</header>
|
||||||
|
@ -861,7 +936,7 @@
|
||||||
</main>
|
</main>
|
||||||
|
|
||||||
<section id="write">
|
<section id="write">
|
||||||
<${MessageInput} />
|
<${session.value.type === 'chat' ? MessageInput : CompletionControls} />
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
<footer>
|
<footer>
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "build-info.h"
|
#include "build-info.h"
|
||||||
#include "grammar-parser.h"
|
|
||||||
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
// crash the server in debug mode, otherwise send an http 500 error
|
// crash the server in debug mode, otherwise send an http 500 error
|
||||||
|
@ -195,16 +194,13 @@ struct llama_server_context
|
||||||
|
|
||||||
json prompt;
|
json prompt;
|
||||||
std::vector<llama_token> embd;
|
std::vector<llama_token> embd;
|
||||||
std::vector<llama_token> last_n_tokens;
|
|
||||||
|
|
||||||
llama_model *model = nullptr;
|
llama_model *model = nullptr;
|
||||||
llama_context *ctx = nullptr;
|
llama_context *ctx = nullptr;
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
llama_sampling_context *ctx_sampling = nullptr;
|
||||||
int n_ctx;
|
int n_ctx;
|
||||||
|
|
||||||
grammar_parser::parse_state parsed_grammar;
|
|
||||||
llama_grammar *grammar = nullptr;
|
|
||||||
|
|
||||||
bool truncated = false;
|
bool truncated = false;
|
||||||
bool stopped_eos = false;
|
bool stopped_eos = false;
|
||||||
bool stopped_word = false;
|
bool stopped_word = false;
|
||||||
|
@ -251,10 +247,10 @@ struct llama_server_context
|
||||||
n_remain = 0;
|
n_remain = 0;
|
||||||
n_past = 0;
|
n_past = 0;
|
||||||
|
|
||||||
if (grammar != nullptr) {
|
if (ctx_sampling != nullptr) {
|
||||||
llama_grammar_free(grammar);
|
llama_sampling_free(ctx_sampling);
|
||||||
grammar = nullptr;
|
|
||||||
}
|
}
|
||||||
|
ctx_sampling = llama_sampling_init(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool loadModel(const gpt_params ¶ms_)
|
bool loadModel(const gpt_params ¶ms_)
|
||||||
|
@ -267,8 +263,6 @@ struct llama_server_context
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
n_ctx = llama_n_ctx(ctx);
|
n_ctx = llama_n_ctx(ctx);
|
||||||
last_n_tokens.resize(n_ctx);
|
|
||||||
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -319,34 +313,26 @@ struct llama_server_context
|
||||||
|
|
||||||
bool loadGrammar()
|
bool loadGrammar()
|
||||||
{
|
{
|
||||||
if (!params.grammar.empty()) {
|
ctx_sampling = llama_sampling_init(params);
|
||||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
|
||||||
// will be empty (default) if there are parse errors
|
|
||||||
if (parsed_grammar.rules.empty()) {
|
|
||||||
LOG_ERROR("grammar parse error", {{"grammar", params.grammar}});
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
|
||||||
|
|
||||||
{
|
|
||||||
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
|
||||||
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
|
||||||
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
|
||||||
grammar = llama_grammar_init(
|
|
||||||
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void loadInfill()
|
void loadInfill()
|
||||||
{
|
{
|
||||||
auto prefix_tokens = tokenize(params.input_prefix, true); // always add BOS
|
bool suff_rm_leading_spc = true;
|
||||||
auto suffix_tokens = tokenize(params.input_suffix, true); // always add BOS
|
if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
|
||||||
|
params.input_suffix.erase(0, 1);
|
||||||
|
suff_rm_leading_spc = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto prefix_tokens = tokenize(params.input_prefix, false);
|
||||||
|
auto suffix_tokens = tokenize(params.input_suffix, false);
|
||||||
|
const int space_token = 29871;
|
||||||
|
if (suff_rm_leading_spc && suffix_tokens[0] == space_token) {
|
||||||
|
suffix_tokens.erase(suffix_tokens.begin());
|
||||||
|
}
|
||||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
|
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
|
||||||
|
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
|
||||||
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
|
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
|
||||||
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
||||||
prefix_tokens.push_back(llama_token_middle(ctx));
|
prefix_tokens.push_back(llama_token_middle(ctx));
|
||||||
|
@ -369,7 +355,7 @@ struct llama_server_context
|
||||||
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
|
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
|
||||||
const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
|
const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
|
||||||
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
|
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
|
||||||
std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
|
std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), ctx_sampling->prev.begin());
|
||||||
|
|
||||||
LOG_VERBOSE("input truncated", {
|
LOG_VERBOSE("input truncated", {
|
||||||
{"n_ctx", params.n_ctx},
|
{"n_ctx", params.n_ctx},
|
||||||
|
@ -384,13 +370,14 @@ struct llama_server_context
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
const size_t ps = num_prompt_tokens;
|
const size_t ps = num_prompt_tokens;
|
||||||
std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
|
std::fill(ctx_sampling->prev.begin(), ctx_sampling->prev.end() - ps, 0);
|
||||||
std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
|
std::copy(prompt_tokens.begin(), prompt_tokens.end(), ctx_sampling->prev.end() - ps);
|
||||||
}
|
}
|
||||||
|
|
||||||
// compare the evaluated prompt with the new prompt
|
// compare the evaluated prompt with the new prompt
|
||||||
n_past = common_part(embd, prompt_tokens);
|
n_past = common_part(embd, prompt_tokens);
|
||||||
embd = prompt_tokens;
|
embd = prompt_tokens;
|
||||||
|
|
||||||
if (n_past == num_prompt_tokens)
|
if (n_past == num_prompt_tokens)
|
||||||
{
|
{
|
||||||
// we have to evaluate at least 1 token to generate logits.
|
// we have to evaluate at least 1 token to generate logits.
|
||||||
|
@ -398,6 +385,9 @@ struct llama_server_context
|
||||||
n_past--;
|
n_past--;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// since #3228 we now have to manually manage the KV cache
|
||||||
|
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||||
|
|
||||||
LOG_VERBOSE("prompt ingested", {
|
LOG_VERBOSE("prompt ingested", {
|
||||||
{"n_past", n_past},
|
{"n_past", n_past},
|
||||||
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
||||||
|
@ -425,7 +415,7 @@ struct llama_server_context
|
||||||
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
|
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
|
||||||
const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
|
const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
|
||||||
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
|
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
|
||||||
std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), last_n_tokens.begin());
|
std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), ctx_sampling->prev.begin());
|
||||||
|
|
||||||
LOG_VERBOSE("input truncated", {
|
LOG_VERBOSE("input truncated", {
|
||||||
{"n_ctx", n_ctx},
|
{"n_ctx", n_ctx},
|
||||||
|
@ -440,16 +430,13 @@ struct llama_server_context
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
const size_t ps = num_prompt_tokens;
|
const size_t ps = num_prompt_tokens;
|
||||||
std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
|
std::fill(ctx_sampling->prev.begin(), ctx_sampling->prev.end() - ps, 0);
|
||||||
std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
|
std::copy(prompt_tokens.begin(), prompt_tokens.end(), ctx_sampling->prev.end() - ps);
|
||||||
}
|
}
|
||||||
|
|
||||||
// compare the evaluated prompt with the new prompt
|
// compare the evaluated prompt with the new prompt
|
||||||
n_past = common_part(embd, prompt_tokens);
|
n_past = common_part(embd, prompt_tokens);
|
||||||
|
|
||||||
// since #3228 we now have to manually manage the KV cache
|
|
||||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
|
||||||
|
|
||||||
embd = prompt_tokens;
|
embd = prompt_tokens;
|
||||||
if (n_past == num_prompt_tokens)
|
if (n_past == num_prompt_tokens)
|
||||||
{
|
{
|
||||||
|
@ -457,6 +444,9 @@ struct llama_server_context
|
||||||
n_past--;
|
n_past--;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// since #3228 we now have to manually manage the KV cache
|
||||||
|
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||||
|
|
||||||
LOG_VERBOSE("prompt ingested", {
|
LOG_VERBOSE("prompt ingested", {
|
||||||
{"n_past", n_past},
|
{"n_past", n_past},
|
||||||
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
||||||
|
@ -536,27 +526,24 @@ struct llama_server_context
|
||||||
|
|
||||||
{
|
{
|
||||||
// out of user input, sample next token
|
// out of user input, sample next token
|
||||||
std::vector<llama_token_data> candidates;
|
result.tok = llama_sampling_sample(ctx_sampling, ctx, NULL);
|
||||||
candidates.reserve(llama_n_vocab(model));
|
|
||||||
|
|
||||||
result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
|
llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };
|
||||||
|
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
const int32_t n_probs = params.sampling_params.n_probs;
|
||||||
|
if (params.sampling_params.temp <= 0 && n_probs > 0)
|
||||||
const int32_t n_probs = params.n_probs;
|
|
||||||
if (params.temp <= 0 && n_probs > 0)
|
|
||||||
{
|
{
|
||||||
// For llama_sample_token_greedy we need to sort candidates
|
            // For llama_sample_token_greedy we need to sort candidates
-           llama_sample_softmax(ctx, &candidates_p);
+           llama_sample_softmax(ctx, &cur_p);
        }

-       for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
+       for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
        {
-           result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
+           result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
        }

-       last_n_tokens.erase(last_n_tokens.begin());
-       last_n_tokens.push_back(result.tok);
+       llama_sampling_accept(ctx_sampling, ctx, result.tok);

        if (tg) {
            num_tokens_predicted++;
        }

@@ -619,7 +606,7 @@ struct llama_server_context
        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
        generated_text += token_text;

-       if (params.n_probs > 0)
+       if (params.sampling_params.n_probs > 0)
        {
            generated_token_probs.push_back(token_with_probs);
        }

@@ -700,15 +687,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("usage: %s [options]\n", argv0);
    printf("\n");
    printf("options:\n");
    printf(" -h, --help show this help message and exit\n");
    printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
    printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
+   printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
    printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
    printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
    printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
    printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
    printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
    printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
    if (llama_mlock_supported())
    {
        printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");

@@ -853,6 +841,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_threads = std::stoi(argv[i]);
        }
+       else if (arg == "--threads-batch" || arg == "-tb")
+       {
+           if (++i >= argc)
+           {
+               invalid_param = true;
+               break;
+           }
+           params.n_threads_batch = std::stoi(argv[i]);
+       }
        else if (arg == "-b" || arg == "--batch-size")
        {
            if (++i >= argc)

@@ -1007,34 +1004,35 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,

static json format_generation_settings(llama_server_context &llama)
{
-   const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
-   const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
+   const auto & sparams = llama.params.sampling_params;
+   const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
+   const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
        eos_bias->second < 0.0f && std::isinf(eos_bias->second);

    return json{
        {"n_ctx", llama.n_ctx},
        {"model", llama.params.model_alias},
        {"seed", llama.params.seed},
-       {"temp", llama.params.temp},
+       {"temp", sparams.temp},
-       {"top_k", llama.params.top_k},
+       {"top_k", sparams.top_k},
-       {"top_p", llama.params.top_p},
+       {"top_p", sparams.top_p},
-       {"tfs_z", llama.params.tfs_z},
+       {"tfs_z", sparams.tfs_z},
-       {"typical_p", llama.params.typical_p},
+       {"typical_p", sparams.typical_p},
-       {"repeat_last_n", llama.params.repeat_last_n},
+       {"repeat_last_n", sparams.repeat_last_n},
-       {"repeat_penalty", llama.params.repeat_penalty},
+       {"repeat_penalty", sparams.repeat_penalty},
-       {"presence_penalty", llama.params.presence_penalty},
+       {"presence_penalty", sparams.presence_penalty},
-       {"frequency_penalty", llama.params.frequency_penalty},
+       {"frequency_penalty", sparams.frequency_penalty},
-       {"mirostat", llama.params.mirostat},
+       {"mirostat", sparams.mirostat},
-       {"mirostat_tau", llama.params.mirostat_tau},
+       {"mirostat_tau", sparams.mirostat_tau},
-       {"mirostat_eta", llama.params.mirostat_eta},
+       {"mirostat_eta", sparams.mirostat_eta},
-       {"penalize_nl", llama.params.penalize_nl},
+       {"penalize_nl", sparams.penalize_nl},
        {"stop", llama.params.antiprompt},
        {"n_predict", llama.params.n_predict},
        {"n_keep", llama.params.n_keep},
        {"ignore_eos", ignore_eos},
        {"stream", llama.stream},
-       {"logit_bias", llama.params.logit_bias},
+       {"logit_bias", sparams.logit_bias},
-       {"n_probs", llama.params.n_probs},
+       {"n_probs", sparams.n_probs},
        {"grammar", llama.params.grammar},
    };
}

@@ -1083,7 +1081,7 @@ static json format_final_response(llama_server_context &llama, const std::string
        {"timings", format_timings(llama)},
    };

-   if (llama.params.n_probs > 0)
+   if (llama.params.sampling_params.n_probs > 0)
    {
        res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
    }

@@ -1099,7 +1097,7 @@ static json format_partial_response(
        {"stop", false},
    };

-   if (llama.params.n_probs > 0)
+   if (llama.params.sampling_params.n_probs > 0)
    {
        res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
    }

@@ -1131,26 +1129,28 @@ static T json_value(const json &body, const std::string &key, const T &default_v
static void parse_options_completion(const json &body, llama_server_context &llama)
{
    gpt_params default_params;
+   const auto & default_sparams = default_params.sampling_params;
+   auto & sparams = llama.params.sampling_params;

    llama.stream = json_value(body, "stream", false);
    llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
-   llama.params.top_k = json_value(body, "top_k", default_params.top_k);
+   sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
-   llama.params.top_p = json_value(body, "top_p", default_params.top_p);
+   sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
-   llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
+   sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
-   llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
+   sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p);
-   llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
+   sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n);
-   llama.params.temp = json_value(body, "temperature", default_params.temp);
+   sparams.temp = json_value(body, "temperature", default_sparams.temp);
-   llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
+   sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty);
-   llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
+   sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty);
-   llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
+   sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty);
-   llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
+   sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat);
-   llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
+   sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
-   llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
+   sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
-   llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
+   sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
    llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
    llama.params.seed = json_value(body, "seed", default_params.seed);
    llama.params.grammar = json_value(body, "grammar", default_params.grammar);
-   llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
+   sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);

    if (body.count("prompt") != 0)
    {

@@ -1161,10 +1161,10 @@ static void parse_options_completion(const json &body, llama_server_context &lla
        llama.prompt = "";
    }

-   llama.params.logit_bias.clear();
+   sparams.logit_bias.clear();
    if (json_value(body, "ignore_eos", false))
    {
-       llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
+       sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
    }

    const auto &logit_bias = body.find("logit_bias");

@@ -1180,11 +1180,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla
    {
        if (el[1].is_number())
        {
-           llama.params.logit_bias[tok] = el[1].get<float>();
+           sparams.logit_bias[tok] = el[1].get<float>();
        }
        else if (el[1].is_boolean() && !el[1].get<bool>())
        {
-           llama.params.logit_bias[tok] = -INFINITY;
+           sparams.logit_bias[tok] = -INFINITY;
        }
    }
}

@@ -1204,6 +1204,8 @@ static void parse_options_completion(const json &body, llama_server_context &lla
        }
    }

+   llama.ctx_sampling = llama_sampling_init(llama.params);
+
    LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
}

@@ -1412,7 +1414,7 @@ int main(int argc, char **argv)
    }

    auto probs = llama.generated_token_probs;
-   if (llama.params.n_probs > 0 && llama.stopped_word) {
+   if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
        const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
        probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
    }

@@ -1464,7 +1466,7 @@ int main(int argc, char **argv)

    std::vector<completion_token_output> probs_output = {};

-   if (llama.params.n_probs > 0) {
+   if (llama.params.sampling_params.n_probs > 0) {
        const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
        size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
        size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());

@@ -1585,7 +1587,7 @@ int main(int argc, char **argv)

    std::vector<completion_token_output> probs_output = {};

-   if (llama.params.n_probs > 0) {
+   if (llama.params.sampling_params.n_probs > 0) {
        const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
        size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
        size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());

@@ -1760,9 +1762,7 @@ int main(int argc, char **argv)
        return 1;
    }

-   if (llama.grammar != nullptr) {
-       llama_grammar_free(llama.grammar);
-   }
+   llama_sampling_free(llama.ctx_sampling);
    llama_backend_free();

    return 0;
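The server hunks above all follow one pattern: every sampler knob that used to sit directly on gpt_params is now read into params.sampling_params, and a per-request llama_sampling_context is created from it. A minimal stand-alone sketch of that mapping follows; the struct names and field subset are hypothetical stand-ins, not the server's actual headers, and it only assumes the same single-header nlohmann json.hpp that server.cpp already uses.

    // Sketch only: how request fields land on the nested sampling params.
    #include <string>
    #include "json.hpp"          // nlohmann::json, same single-header library as server.cpp

    using json = nlohmann::json;

    struct sampling_params_sketch {    // hypothetical stand-in for the sampling params struct
        int   top_k = 40;
        float top_p = 0.95f;
        float temp  = 0.80f;
    };

    struct gpt_params_sketch {         // hypothetical stand-in for gpt_params
        int n_predict = -1;
        sampling_params_sketch sampling_params;   // sampler knobs now live one level down
    };

    // same idea as the server's json_value(): fall back to the compiled-in default
    template <typename T>
    T json_value(const json & body, const std::string & key, const T & def) {
        return body.contains(key) ? body.at(key).get<T>() : def;
    }

    void parse_request(const json & body, gpt_params_sketch & params) {
        const gpt_params_sketch defaults;
        auto & sparams = params.sampling_params;   // alias, exactly as in the diff
        params.n_predict = json_value(body, "n_predict",   defaults.n_predict);
        sparams.top_k    = json_value(body, "top_k",       defaults.sampling_params.top_k);
        sparams.top_p    = json_value(body, "top_p",       defaults.sampling_params.top_p);
        sparams.temp     = json_value(body, "temperature", defaults.sampling_params.temp);
    }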

@@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
    // create a llama_batch with size 512
    // we use this object to submit token data for decoding

-   llama_batch batch = llama_batch_init(512, 0);
+   llama_batch batch = llama_batch_init(512, 0, 1);

    // evaluate the initial prompt
    batch.n_tokens = tokens_list.size();

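llama_batch_init() gains a third argument in this hunk: the maximum number of sequence ids a single token in the batch may carry (1 for the plain single-sequence examples, n_seq_dft for the speculative example further down). A usage sketch built only from helpers that appear elsewhere in this diff (llama_batch_clear / llama_batch_add from common.h); treat it as an illustration, not the example's code.

    #include "common.h"
    #include "llama.h"
    #include <vector>

    // fill a fresh batch with a prompt on sequence 0, requesting logits only for
    // the last token; capacity 512 tokens, no embeddings, 1 seq id per token
    void prepare_prompt_batch(llama_context * ctx, const std::vector<llama_token> & tokens_list) {
        llama_batch batch = llama_batch_init(512, 0, 1);

        llama_batch_clear(batch);
        for (size_t i = 0; i < tokens_list.size(); ++i) {
            llama_batch_add(batch, tokens_list[i], (llama_pos) i, { 0 }, i == tokens_list.size() - 1);
        }

        llama_decode(ctx, batch);   // evaluate the prompt

        llama_batch_free(batch);
    }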
@@ -2,13 +2,25 @@

#include "common.h"
#include "llama.h"
-#include "grammar-parser.h"

#include <cmath>
#include <cstdio>
#include <string>
#include <vector>

+struct seq_draft {
+    bool active = false;
+    bool drafting = false;
+    bool skip = false;
+
+    int i_batch_dft = 0;
+    std::vector<int> i_batch_tgt;
+
+    std::vector<llama_token> tokens;
+
+    struct llama_sampling_context * ctx_sampling;
+};
+
int main(int argc, char ** argv) {
    gpt_params params;

@@ -21,6 +33,13 @@ int main(int argc, char ** argv) {
        return 1;
    }

+   // max number of parallel drafting sequences (i.e. tree branches)
+   const int n_seq_dft = params.n_parallel;
+
+   // TODO: make this configurable
+   const float p_accept = 0.80f;
+   const float p_split = 0.10f;
+
#ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("speculative", "log"));
    LOG_TEE("Log start\n");
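The two thresholds introduced above drive the tree search later in this file: a draft sequence keeps growing only while its best candidate is confident enough (p_accept), and a runner-up candidate gets its own branch while free sequence slots remain and its probability clears p_split. A condensed, self-contained sketch of that decision follows; the types are simplified stand-ins, not the example's actual seq_draft / llama_token_data code.

    #include <cstdio>
    #include <vector>

    struct draft_candidate { int id; float p; };   // sorted by p, highest first

    // indices into cur_p that should continue as draft branches (0 = current branch)
    std::vector<int> pick_branches(const std::vector<draft_candidate> & cur_p,
                                   int n_seq_cur, int n_seq_dft,
                                   float p_accept = 0.80f, float p_split = 0.10f) {
        std::vector<int> branches;
        if (cur_p.empty() || cur_p[0].p < p_accept) {
            return branches;               // not confident enough: stop drafting this sequence
        }
        branches.push_back(0);             // the best candidate always continues
        for (int f = 1; f < (int) cur_p.size(); ++f) {
            if (n_seq_cur >= n_seq_dft || cur_p[f].p <= p_split) {
                break;                     // no free sequence slots, or candidate too unlikely
            }
            branches.push_back(f);         // fork a new draft sequence for this candidate
            ++n_seq_cur;
        }
        return branches;
    }

    int main() {
        const std::vector<draft_candidate> cur_p = { {42, 0.86f}, {7, 0.12f}, {3, 0.02f} };
        for (int f : pick_branches(cur_p, /*n_seq_cur=*/1, /*n_seq_dft=*/4)) {
            printf("continue with candidate %d (token %d)\n", f, cur_p[f].id);
        }
        return 0;
    }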
|
@ -77,8 +96,6 @@ int main(int argc, char ** argv) {
|
||||||
const auto t_enc_end = ggml_time_us();
|
const auto t_enc_end = ggml_time_us();
|
||||||
|
|
||||||
// the 2 models should have the same vocab
|
// the 2 models should have the same vocab
|
||||||
const int n_ctx = llama_n_ctx(ctx_tgt);
|
|
||||||
const int n_vocab = llama_n_vocab(model_tgt);
|
|
||||||
//GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
|
//GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
|
||||||
|
|
||||||
// how many tokens to draft each time
|
// how many tokens to draft each time
|
||||||
|
@ -91,58 +108,58 @@ int main(int argc, char ** argv) {
|
||||||
int n_past_tgt = inp.size();
|
int n_past_tgt = inp.size();
|
||||||
int n_past_dft = inp.size();
|
int n_past_dft = inp.size();
|
||||||
|
|
||||||
std::vector<llama_token> drafted;
|
|
||||||
|
|
||||||
std::vector<llama_token> last_tokens(n_ctx);
|
|
||||||
std::fill(last_tokens.begin(), last_tokens.end(), 0);
|
|
||||||
|
|
||||||
for (auto & id : inp) {
|
|
||||||
last_tokens.erase(last_tokens.begin());
|
|
||||||
last_tokens.push_back(id);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<llama_token_data> candidates;
|
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
|
|
||||||
// used to determine end of generation
|
// used to determine end of generation
|
||||||
bool has_eos = false;
|
bool has_eos = false;
|
||||||
|
|
||||||
// grammar stuff
|
// target model sampling context
|
||||||
struct llama_grammar * grammar_dft = NULL;
|
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||||
struct llama_grammar * grammar_tgt = NULL;
|
|
||||||
|
|
||||||
grammar_parser::parse_state parsed_grammar;
|
// draft sequence data
|
||||||
|
std::vector<seq_draft> drafts(n_seq_dft);
|
||||||
|
|
||||||
// if requested - load the grammar, error checking is omitted for brevity
|
params.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||||
if (!params.grammar.empty()) {
|
params.sampling_params.temp = std::max(0.01f, params.sampling_params.temp);
|
||||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
|
||||||
// will be empty (default) if there are parse errors
|
|
||||||
if (parsed_grammar.rules.empty()) {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
drafts[s].ctx_sampling = llama_sampling_init(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
|
||||||
|
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
|
||||||
|
|
||||||
const auto t_dec_start = ggml_time_us();
|
const auto t_dec_start = ggml_time_us();
|
||||||
|
|
||||||
while (true) {
|
// sample from the last token of the prompt
|
||||||
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
|
drafts[0].i_batch_tgt.resize(1);
|
||||||
|
drafts[0].i_batch_tgt[0] = 0;
|
||||||
|
|
||||||
int i_dft = 0;
|
while (true) {
|
||||||
|
// print current draft sequences
|
||||||
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
|
if (!drafts[s].active) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto & tokens = drafts[s].tokens;
|
||||||
|
|
||||||
|
LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
int i_dft = 0;
|
||||||
|
int s_keep = 0;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
|
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||||
|
|
||||||
// sample from the target model
|
// sample from the target model
|
||||||
llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
|
llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||||
|
|
||||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
llama_sampling_accept(ctx_sampling, ctx_tgt, id);
|
||||||
last_tokens.erase(last_tokens.begin());
|
|
||||||
last_tokens.push_back(id);
|
|
||||||
|
|
||||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));
|
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
|
||||||
|
|
||||||
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
|
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
|
||||||
|
|
||||||
printf("%s", token_str.c_str());
|
printf("%s", token_str.c_str());
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
|
|
||||||
|
@ -152,53 +169,67 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
++n_predict;
|
++n_predict;
|
||||||
|
|
||||||
// check if the draft matches the target
|
// check if the target token matches any of the drafts
|
||||||
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
|
|
||||||
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
|
|
||||||
++n_accept;
|
|
||||||
++n_past_tgt;
|
|
||||||
++n_past_dft;
|
|
||||||
++i_dft;
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// the drafted token was rejected or we are out of drafted tokens
|
|
||||||
|
|
||||||
if (i_dft < (int) drafted.size()) {
|
|
||||||
LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
|
|
||||||
i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
|
|
||||||
} else {
|
|
||||||
LOG("out of drafted tokens\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
|
||||||
llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
|
|
||||||
++n_past_dft;
|
|
||||||
|
|
||||||
// heuristic for n_draft
|
|
||||||
{
|
{
|
||||||
const int n_draft_cur = (int) drafted.size();
|
bool matches = false;
|
||||||
const bool all_accepted = i_dft == n_draft_cur;
|
|
||||||
|
|
||||||
LOG("n_draft = %d\n", n_draft);
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
LOG("n_draft_cur = %d\n", n_draft_cur);
|
if (!drafts[s].active) {
|
||||||
LOG("i_dft = %d\n", i_dft);
|
continue;
|
||||||
LOG("all_accepted = %d\n", all_accepted);
|
}
|
||||||
|
|
||||||
if (all_accepted && n_draft == n_draft_cur) {
|
if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) {
|
||||||
LOG(" - max drafted tokens accepted - n_draft += 8\n");
|
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str());
|
||||||
n_draft = std::min(30, n_draft + 8);
|
|
||||||
} else if (all_accepted) {
|
s_keep = s;
|
||||||
LOG(" - partially drafted tokens accepted - no change\n");
|
matches = true;
|
||||||
} else {
|
} else {
|
||||||
LOG(" - drafted token rejected - n_draft -= 1\n");
|
drafts[s].active = false;
|
||||||
n_draft = std::max(2, n_draft - 1);
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (matches) {
|
||||||
|
++n_accept;
|
||||||
|
++n_past_tgt;
|
||||||
|
++n_past_dft;
|
||||||
|
++i_dft;
|
||||||
|
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
drafted.clear();
|
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
|
||||||
drafted.push_back(id);
|
|
||||||
|
// TODO: simplify
|
||||||
|
{
|
||||||
|
LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
|
||||||
|
|
||||||
|
llama_kv_cache_seq_keep(ctx_dft, s_keep);
|
||||||
|
llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
|
||||||
|
llama_kv_cache_seq_keep(ctx_dft, 0);
|
||||||
|
|
||||||
|
llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
|
||||||
|
llama_kv_cache_seq_keep(ctx_tgt, s_keep);
|
||||||
|
llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
|
||||||
|
llama_kv_cache_seq_keep(ctx_tgt, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
|
drafts[s].active = false;
|
||||||
|
drafts[s].tokens.clear();
|
||||||
|
drafts[s].i_batch_tgt.clear();
|
||||||
|
}
|
||||||
|
// note: will be erased after the speculation phase
|
||||||
|
drafts[0].tokens.push_back(id);
|
||||||
|
drafts[0].i_batch_tgt.push_back(0);
|
||||||
|
|
||||||
|
llama_batch_clear(batch_dft);
|
||||||
|
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
|
||||||
|
|
||||||
|
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
||||||
|
llama_decode (ctx_dft, batch_dft);
|
||||||
|
|
||||||
|
++n_past_dft;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -207,72 +238,151 @@ int main(int argc, char ** argv) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (grammar_tgt) {
|
llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);
|
||||||
if (grammar_dft) {
|
|
||||||
llama_grammar_free(grammar_dft);
|
|
||||||
}
|
|
||||||
grammar_dft = llama_grammar_copy(grammar_tgt);
|
|
||||||
|
|
||||||
LOG("copied target grammar to draft grammar\n");
|
int n_seq_cur = 1;
|
||||||
}
|
|
||||||
|
|
||||||
// sample n_draft tokens from the draft model using greedy decoding
|
|
||||||
int n_past_cur = n_past_dft;
|
int n_past_cur = n_past_dft;
|
||||||
|
|
||||||
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
|
drafts[s].active = false;
|
||||||
|
drafts[s].drafting = false;
|
||||||
|
}
|
||||||
|
drafts[0].active = true;
|
||||||
|
drafts[0].drafting = true;
|
||||||
|
drafts[0].i_batch_dft = 0;
|
||||||
|
|
||||||
|
llama_batch_clear(batch_tgt);
|
||||||
|
llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
|
||||||
|
|
||||||
|
// sample n_draft tokens from the draft model using tree-based sampling
|
||||||
for (int i = 0; i < n_draft; ++i) {
|
for (int i = 0; i < n_draft; ++i) {
|
||||||
float * logits = llama_get_logits(ctx_dft);
|
batch_dft.n_tokens = 0;
|
||||||
|
|
||||||
candidates.clear();
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
drafts[s].skip = false;
|
||||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
|
if (!drafts[s].drafting || drafts[s].skip) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (grammar_dft != NULL) {
|
llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);
|
||||||
llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
|
|
||||||
|
const auto & cur_p = drafts[s].ctx_sampling->cur;
|
||||||
|
|
||||||
|
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) {
|
||||||
|
LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||||
|
k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cur_p[0].p < p_accept) {
|
||||||
|
LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
|
||||||
|
drafts[s].drafting = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<int> sa(1, s);
|
||||||
|
|
||||||
|
// attempt to split the branch if the probability is high enough
|
||||||
|
for (int f = 1; f < 8; ++f) {
|
||||||
|
if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
|
||||||
|
LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
|
||||||
|
|
||||||
|
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
|
||||||
|
llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
|
||||||
|
|
||||||
|
// all previous tokens from this branch are now also part of the new branch
|
||||||
|
for (int t = 0; t < batch_tgt.n_tokens; ++t) {
|
||||||
|
for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) {
|
||||||
|
if (batch_tgt.seq_id[t][p] == s) {
|
||||||
|
batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur;
|
||||||
|
batch_tgt.n_seq_id[t]++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy the draft state
|
||||||
|
drafts[n_seq_cur].active = true;
|
||||||
|
drafts[n_seq_cur].drafting = true;
|
||||||
|
drafts[n_seq_cur].skip = true;
|
||||||
|
|
||||||
|
drafts[n_seq_cur].tokens = drafts[s].tokens;
|
||||||
|
drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
|
||||||
|
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
|
||||||
|
|
||||||
|
llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);
|
||||||
|
|
||||||
|
sa.push_back(n_seq_cur);
|
||||||
|
|
||||||
|
n_seq_cur++;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// add drafted token for each sequence
|
||||||
|
for (int is = 0; is < (int) sa.size(); ++is) {
|
||||||
|
const llama_token id = cur_p[is].id;
|
||||||
|
|
||||||
|
const int s = sa[is];
|
||||||
|
|
||||||
|
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id);
|
||||||
|
|
||||||
|
drafts[s].tokens.push_back(id);
|
||||||
|
|
||||||
|
// add unique drafted tokens to the target batch
|
||||||
|
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
|
||||||
|
|
||||||
|
llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
|
||||||
|
|
||||||
|
// add the token to the batch for batched decoding with the draft model
|
||||||
|
drafts[s].i_batch_dft = batch_dft.n_tokens;
|
||||||
|
|
||||||
|
llama_batch_add(batch_dft, id, n_past_cur, { s }, true);
|
||||||
|
|
||||||
|
if (batch_tgt.n_tokens > n_draft) {
|
||||||
|
drafts[s].drafting = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// computes softmax and sorts the candidates
|
// no sequence is drafting anymore
|
||||||
llama_sample_softmax(ctx_dft, &cur_p);
|
if (batch_dft.n_tokens == 0) {
|
||||||
|
|
||||||
for (int i = 0; i < 3; ++i) {
|
|
||||||
LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: better logic?
|
|
||||||
if (cur_p.data[0].p < 2*cur_p.data[1].p) {
|
|
||||||
LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// drafted token
|
// evaluate the drafted tokens on the draft model
|
||||||
const llama_token id = cur_p.data[0].id;
|
llama_decode(ctx_dft, batch_dft);
|
||||||
|
++n_past_cur;
|
||||||
drafted.push_back(id);
|
|
||||||
++n_drafted;
|
++n_drafted;
|
||||||
|
|
||||||
// no need to evaluate the last drafted token, since we won't use the result
|
if (batch_tgt.n_tokens > n_draft) {
|
||||||
if (i == n_draft - 1) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// evaluate the drafted token on the draft model
|
|
||||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1);
|
|
||||||
llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
|
|
||||||
++n_past_cur;
|
|
||||||
|
|
||||||
if (grammar_dft != NULL) {
|
|
||||||
llama_grammar_accept_token(ctx_dft, grammar_dft, id);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// evaluate the target model on the drafted tokens
|
// evaluate the target model on the drafted tokens
|
||||||
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1);
|
{
|
||||||
llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
|
llama_kv_cache_seq_keep(ctx_tgt, 0);
|
||||||
++n_past_tgt;
|
for (int s = 1; s < n_seq_dft; ++s) {
|
||||||
|
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
|
||||||
|
}
|
||||||
|
|
||||||
// the first token is always proposed by the target model before the speculation loop
|
//LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
|
||||||
drafted.erase(drafted.begin());
|
llama_decode(ctx_tgt, batch_tgt);
|
||||||
|
++n_past_tgt;
|
||||||
|
}
|
||||||
|
|
||||||
|
// the first token is always proposed by the target model before the speculation loop so we erase it here
|
||||||
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
|
if (!drafts[s].active) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
drafts[s].tokens.erase(drafts[s].tokens.begin());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto t_dec_end = ggml_time_us();
|
auto t_dec_end = ggml_time_us();
|
||||||
|
@ -280,9 +390,8 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("\n\n");
|
LOG_TEE("\n\n");
|
||||||
|
|
||||||
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||||
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||||
|
|
||||||
// TODO: make sure these numbers are computed correctly
|
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("n_draft = %d\n", n_draft);
|
LOG_TEE("n_draft = %d\n", n_draft);
|
||||||
LOG_TEE("n_predict = %d\n", n_predict);
|
LOG_TEE("n_predict = %d\n", n_predict);
|
||||||
|
@ -296,16 +405,19 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("\ntarget:\n");
|
LOG_TEE("\ntarget:\n");
|
||||||
llama_print_timings(ctx_tgt);
|
llama_print_timings(ctx_tgt);
|
||||||
|
|
||||||
|
llama_sampling_free(ctx_sampling);
|
||||||
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
|
llama_sampling_free(drafts[s].ctx_sampling);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_batch_free(batch_dft);
|
||||||
|
|
||||||
llama_free(ctx_tgt);
|
llama_free(ctx_tgt);
|
||||||
llama_free_model(model_tgt);
|
llama_free_model(model_tgt);
|
||||||
|
|
||||||
llama_free(ctx_dft);
|
llama_free(ctx_dft);
|
||||||
llama_free_model(model_dft);
|
llama_free_model(model_dft);
|
||||||
|
|
||||||
if (grammar_dft != NULL) {
|
|
||||||
llama_grammar_free(grammar_dft);
|
|
||||||
llama_grammar_free(grammar_tgt);
|
|
||||||
}
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
fprintf(stderr, "\n\n");
|
fprintf(stderr, "\n\n");
|
||||||
|
|
|
@@ -253,13 +253,14 @@ static void init_model(struct my_llama_model * model) {
    set_param_model(model);

    // measure data size
-   struct ggml_allocr * alloc = NULL;
-   alloc = ggml_allocr_new_measure(tensor_alignment);
-   alloc_model(alloc, model);
+   size_t size = 0;
+   for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+       size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
+   }

    // allocate data
-   model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
-   ggml_allocr_free(alloc);
+   struct ggml_allocr * alloc = NULL;
+   model->data.resize(size + tensor_alignment);
    alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
    alloc_model(alloc, model);
    ggml_allocr_free(alloc);

@@ -1094,11 +1095,9 @@ int main(int argc, char ** argv) {
    struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);

    // measure required memory for input tensors
-   alloc = ggml_allocr_new_measure(tensor_alignment);
-   ggml_allocr_alloc(alloc, tokens_input);
-   ggml_allocr_alloc(alloc, target_probs);
-   size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
-   ggml_allocr_free(alloc);
+   size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
+                           GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
+                           tensor_alignment;
    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));

    // allocate input tensors

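The replacement logic above no longer runs a measuring allocator; it simply rounds every tensor's byte size up to the allocator alignment and adds one alignment of slack for the base pointer. The arithmetic is the same rounding that GGML_PAD performs, as in this small sketch (the example sizes are arbitrary):

    #include <cstddef>
    #include <cstdio>

    #define PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))   // round x up to a multiple of n

    int main() {
        const size_t alignment = 32;
        const size_t tensor_bytes[] = { 4096*4096*2, 4096*3 + 1, 12345 };

        size_t size = 0;
        for (size_t nbytes : tensor_bytes) {
            size += PAD(nbytes, alignment);             // per-tensor padded size
        }
        size += alignment;                              // slack so the buffer start can be aligned

        printf("buffer size = %zu bytes\n", size);
        return 0;
    }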
@@ -386,7 +386,7 @@ static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {

    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
-   assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == alloc->buffer->backend);
+   assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
    ggml_backend_buffer_init_tensor(alloc->buffer, view);
}

ggml-cuda.cu (47 changes)

@@ -415,6 +415,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_CPY_BLOCK_SIZE 32
#define CUDA_SCALE_BLOCK_SIZE 256
+#define CUDA_CLAMP_BLOCK_SIZE 256
#define CUDA_ROPE_BLOCK_SIZE 256
#define CUDA_ALIBI_BLOCK_SIZE 32
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32

@@ -4585,6 +4586,15 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
    dst[i] = scale * x[i];
}

+static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}

template<int qk, int qr, dequantize_kernel_t dq>
static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {

@@ -5475,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
}

+static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
+}
+
template<typename T>
static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
                      const int p_delta_rows, const float theta_scale, cudaStream_t stream) {

@@ -6419,12 +6434,12 @@ inline void ggml_cuda_op_alibi(
    const int64_t ne02 = src0->ne[2];
    const int64_t nrows = ggml_nrows(src0);

-   const int n_past = ((int32_t *) dst->op_params)[0];
+   //const int n_past = ((int32_t *) dst->op_params)[0];
    const int n_head = ((int32_t *) dst->op_params)[1];
    float max_bias;
    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

-   GGML_ASSERT(ne01 + n_past == ne00);
+   //GGML_ASSERT(ne01 + n_past == ne00);
    GGML_ASSERT(n_head == ne02);

    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

@@ -6500,6 +6515,24 @@ inline void ggml_cuda_op_scale(
    (void) src1_dd;
}

+inline void ggml_cuda_op_clamp(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const float min = ((float *) dst->op_params)[0];
+    const float max = ((float *) dst->op_params)[1];
+
+    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
    const int64_t nrows0 = ggml_nrows(src0);

@@ -7061,6 +7094,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
}

+static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
+}
+
static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));

@@ -7470,6 +7507,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
        case GGML_OP_SCALE:
            func = ggml_cuda_scale;
            break;
+       case GGML_OP_CLAMP:
+           if (!any_on_device) {
+               return false;
+           }
+           func = ggml_cuda_clamp;
+           break;
        case GGML_OP_CPY:
            func = ggml_cuda_cpy;
            break;
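With the dispatcher case above in place, GGML_OP_CLAMP nodes can be offloaded like the other element-wise ops. A minimal sketch that builds such a node through the public ggml API; it assumes the pre-existing CPU-side ggml_clamp() / ggml_build_forward() entry points and leaves the CUDA offload wiring out.

    #include "ggml.h"

    int main() {
        struct ggml_init_params ip = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        struct ggml_tensor * y = ggml_clamp(ctx, x, -1.0f, 1.0f);   // element-wise clamp to [-1, 1]

        struct ggml_cgraph gf = ggml_build_forward(y);
        ggml_graph_compute_with_ctx(ctx, &gf, 4);   // CPU path; x is left uninitialized here

        ggml_free(ctx);
        return 0;
    }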

ggml-metal.m (71 changes)

|
||||||
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
||||||
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
||||||
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
||||||
|
GGML_METAL_DECL_KERNEL(get_rows_q5_0);
|
||||||
|
GGML_METAL_DECL_KERNEL(get_rows_q5_1);
|
||||||
GGML_METAL_DECL_KERNEL(get_rows_q8_0);
|
GGML_METAL_DECL_KERNEL(get_rows_q8_0);
|
||||||
GGML_METAL_DECL_KERNEL(get_rows_q2_K);
|
GGML_METAL_DECL_KERNEL(get_rows_q2_K);
|
||||||
GGML_METAL_DECL_KERNEL(get_rows_q3_K);
|
GGML_METAL_DECL_KERNEL(get_rows_q3_K);
|
||||||
|
@ -87,6 +89,8 @@ struct ggml_metal_context {
|
||||||
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
|
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
|
||||||
GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
|
GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
|
||||||
GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
|
GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
|
||||||
|
GGML_METAL_DECL_KERNEL(mul_mv_q5_0_f32);
|
||||||
|
GGML_METAL_DECL_KERNEL(mul_mv_q5_1_f32);
|
||||||
GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
|
GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
|
||||||
GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
|
GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
|
||||||
GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
|
GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
|
||||||
|
@ -97,6 +101,8 @@ struct ggml_metal_context {
|
||||||
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
||||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
||||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
|
GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
|
||||||
|
GGML_METAL_DECL_KERNEL(mul_mm_q5_0_f32);
|
||||||
|
GGML_METAL_DECL_KERNEL(mul_mm_q5_1_f32);
|
||||||
GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32);
|
GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32);
|
||||||
GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
|
GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
|
||||||
GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
|
GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
|
||||||
|
@ -254,6 +260,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
||||||
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
||||||
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
|
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
|
||||||
|
GGML_METAL_ADD_KERNEL(get_rows_q5_0);
|
||||||
|
GGML_METAL_ADD_KERNEL(get_rows_q5_1);
|
||||||
GGML_METAL_ADD_KERNEL(get_rows_q8_0);
|
GGML_METAL_ADD_KERNEL(get_rows_q8_0);
|
||||||
GGML_METAL_ADD_KERNEL(get_rows_q2_K);
|
GGML_METAL_ADD_KERNEL(get_rows_q2_K);
|
||||||
GGML_METAL_ADD_KERNEL(get_rows_q3_K);
|
GGML_METAL_ADD_KERNEL(get_rows_q3_K);
|
||||||
|
@ -268,6 +276,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
|
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
|
GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
|
GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
|
||||||
|
GGML_METAL_ADD_KERNEL(mul_mv_q5_0_f32);
|
||||||
|
GGML_METAL_ADD_KERNEL(mul_mv_q5_1_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
|
GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
|
GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
|
GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
|
||||||
|
@ -278,8 +288,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
|
||||||
|
GGML_METAL_ADD_KERNEL(mul_mm_q5_0_f32);
|
||||||
|
GGML_METAL_ADD_KERNEL(mul_mm_q5_1_f32);
|
||||||
|
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
||||||
|
@ -346,6 +358,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||||
GGML_METAL_DEL_KERNEL(get_rows_f16);
|
GGML_METAL_DEL_KERNEL(get_rows_f16);
|
||||||
GGML_METAL_DEL_KERNEL(get_rows_q4_0);
|
GGML_METAL_DEL_KERNEL(get_rows_q4_0);
|
||||||
GGML_METAL_DEL_KERNEL(get_rows_q4_1);
|
GGML_METAL_DEL_KERNEL(get_rows_q4_1);
|
||||||
|
GGML_METAL_DEL_KERNEL(get_rows_q5_0);
|
||||||
|
GGML_METAL_DEL_KERNEL(get_rows_q5_1);
|
||||||
GGML_METAL_DEL_KERNEL(get_rows_q8_0);
|
GGML_METAL_DEL_KERNEL(get_rows_q8_0);
|
||||||
GGML_METAL_DEL_KERNEL(get_rows_q2_K);
|
GGML_METAL_DEL_KERNEL(get_rows_q2_K);
|
||||||
GGML_METAL_DEL_KERNEL(get_rows_q3_K);
|
GGML_METAL_DEL_KERNEL(get_rows_q3_K);
|
||||||
|
@ -360,6 +374,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||||
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
|
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
|
GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
|
GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
|
||||||
|
GGML_METAL_DEL_KERNEL(mul_mv_q5_0_f32);
|
||||||
|
GGML_METAL_DEL_KERNEL(mul_mv_q5_1_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
|
GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
|
GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
|
GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
|
||||||
|
@ -370,8 +386,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
|
||||||
|
GGML_METAL_DEL_KERNEL(mul_mm_q5_0_f32);
|
||||||
|
GGML_METAL_DEL_KERNEL(mul_mm_q5_1_f32);
|
||||||
|
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
||||||
|
@ -779,8 +797,8 @@ void ggml_metal_graph_compute(
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_CONCAT:
|
case GGML_OP_CONCAT:
|
||||||
{
|
{
|
||||||
|
const int64_t nb = ne00;
|
||||||
|
|
||||||
int64_t nb = ne00;
|
|
||||||
[encoder setComputePipelineState:ctx->pipeline_concat];
|
[encoder setComputePipelineState:ctx->pipeline_concat];
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||||
|
@ -812,6 +830,7 @@ void ggml_metal_graph_compute(
|
||||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];
|
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];
|
||||||
|
|
||||||
const int nth = MIN(1024, ne0);
|
const int nth = MIN(1024, ne0);
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
|
@ -909,9 +928,10 @@ void ggml_metal_graph_compute(
|
||||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||||
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
||||||
|
|
||||||
const int64_t n = ggml_nelements(dst)/4;
|
const int64_t n = ggml_nelements(dst);
|
||||||
|
GGML_ASSERT(n % 4 == 0);
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(gf->nodes[i])) {
|
switch (ggml_get_unary_op(gf->nodes[i])) {
|
||||||
|
@ -921,9 +941,10 @@ void ggml_metal_graph_compute(
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||||
|
|
||||||
const int64_t n = ggml_nelements(dst)/4;
|
const int64_t n = ggml_nelements(dst);
|
||||||
|
GGML_ASSERT(n % 4 == 0);
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||||
} break;
|
} break;
|
||||||
case GGML_UNARY_OP_RELU:
|
case GGML_UNARY_OP_RELU:
|
||||||
{
|
{
|
||||||
|
@ -941,9 +962,10 @@ void ggml_metal_graph_compute(
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||||
|
|
||||||
-                const int64_t n = ggml_nelements(dst)/4;
+                const int64_t n = ggml_nelements(dst);
+
+                GGML_ASSERT(n % 4 == 0);
 
-                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
         default:
             {
@@ -1040,7 +1062,7 @@ void ggml_metal_graph_compute(
                 !ggml_is_transposed(src0) &&
                 !ggml_is_transposed(src1) &&
                 src1t == GGML_TYPE_F32 &&
-                ne00 % 32 == 0 &&
+                ne00 % 32 == 0 && ne00 >= 64 &&
                 ne11 > ne11_mm_min) {
                 //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
                 switch (src0->type) {
@@ -1048,6 +1070,8 @@ void ggml_metal_graph_compute(
                     case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32];  break;
                     case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
                     case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
+                    case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_0_f32]; break;
+                    case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_1_f32]; break;
                     case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break;
                     case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
                     case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
@@ -1117,6 +1141,24 @@ void ggml_metal_graph_compute(
                             nth1 = 8;
                             [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
                         } break;
+                    case GGML_TYPE_Q5_0:
+                        {
+                            GGML_ASSERT(ne02 == 1);
+                            GGML_ASSERT(ne12 == 1);
+
+                            nth0 = 8;
+                            nth1 = 8;
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32];
+                        } break;
+                    case GGML_TYPE_Q5_1:
+                        {
+                            GGML_ASSERT(ne02 == 1);
+                            GGML_ASSERT(ne12 == 1);
+
+                            nth0 = 8;
+                            nth1 = 8;
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32];
+                        } break;
                     case GGML_TYPE_Q8_0:
                         {
                             GGML_ASSERT(ne02 == 1);
@@ -1197,7 +1239,8 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
                 [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
 
-                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
+                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                    src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
                     src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 }
@@ -1229,6 +1272,8 @@ void ggml_metal_graph_compute(
                     case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16];  break;
                     case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                     case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
+                    case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_0]; break;
+                    case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_1]; break;
                    case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break;
                     case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
                     case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
@@ -1251,6 +1296,8 @@ void ggml_metal_graph_compute(
                 } break;
             case GGML_OP_RMS_NORM:
                 {
+                    GGML_ASSERT(ne00 % 4 == 0);
+
                     float eps;
                     memcpy(&eps, dst->op_params, sizeof(float));
 
@@ -1293,7 +1340,7 @@ void ggml_metal_graph_compute(
 
                     const int nth = MIN(1024, ne00);
 
-                    const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
+                    //const int n_past = ((int32_t *) dst->op_params)[0];
                     const int n_head = ((int32_t *) dst->op_params)[1];
                     float max_bias;
                     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
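The element-wise hunk at the top now asserts n % 4 == 0 and dispatches n/4 threadgroups instead of pre-dividing the element count, which suggests the kernel consumes float4 vectors. A minimal host-side sketch of that arithmetic, in plain C with illustrative names only:

    #include <assert.h>
    #include <stdint.h>

    // One threadgroup per float4: with n scalar elements, n must be a multiple
    // of 4 and the grid shrinks to n/4, mirroring MTLSizeMake(n/4, 1, 1) above.
    static int64_t threadgroups_for_float4_kernel(int64_t n) {
        assert(n % 4 == 0);
        return n / 4;
    }

For a 4096-element tensor this launches 1024 threadgroups of one thread each.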
181  ggml-metal.metal
@@ -18,6 +18,21 @@ typedef struct {
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 
+#define QK5_0 32
+typedef struct {
+    half d;                // delta
+    uint8_t qh[4];         // 5-th bit of quants
+    uint8_t qs[QK5_0 / 2]; // nibbles / quants
+} block_q5_0;
+
+#define QK5_1 32
+typedef struct {
+    half d;                 // delta
+    half m;                 // min
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_1 / 2];  // nibbles / quants
+} block_q5_1;
+
 #define QK8_0 32
 typedef struct {
     half d;                // delta
@@ -345,10 +360,11 @@ kernel void kernel_rms_norm(
         uint sgitg[[simdgroup_index_in_threadgroup]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint   ntg[[threads_per_threadgroup]]) {
     device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
     device const float * x_scalar = (device const float *) x;
-    float4 sumf=0;
-    float all_sum=0;
+
+    float4 sumf = 0;
+    float all_sum = 0;
 
     // parallel sum
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
@@ -361,6 +377,7 @@ kernel void kernel_rms_norm(
     }
 
     threadgroup_barrier(mem_flags::mem_threadgroup);
+
     // broadcast, simd group number is ntg / 32
     for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
        if (tpitg < i) {
@@ -368,7 +385,9 @@ kernel void kernel_rms_norm(
        }
     }
     if (tpitg == 0) {
-        for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
+            sum[0] += x_scalar[i];
+        }
         sum[0] /= ne00;
     }
 
@@ -383,7 +402,9 @@ kernel void kernel_rms_norm(
         y[i00] = x[i00] * scale;
     }
     if (tpitg == 0) {
-        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
+            y_scalar[i00] = x_scalar[i00] * scale;
+        }
     }
 }
 
@@ -393,8 +414,11 @@ kernel void kernel_rms_norm(
 // that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
 inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
     float d = qb_curr->d;
+
     float2 acc = 0.f;
+
     device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
+
     for (int i = 0; i < 8; i+=2) {
         acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
                 + yl[i + 1] * (qs[i / 2] & 0x0F00);
@@ -411,8 +435,11 @@ inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thre
 inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
     float d = qb_curr->d;
     float m = qb_curr->m;
-    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
+
     float2 acc = 0.f;
+
+    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
+
     for (int i = 0; i < 8; i+=2) {
         acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
                 + yl[i + 1] * (qs[i / 2] & 0x0F00);
@@ -422,6 +449,49 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
     return d * (acc[0] + acc[1]) + sumy * m;
 }
 
+// function for calculate inner product between half a q5_0 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q5 quants begin (0 or QK5_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) {
+    float d = qb_curr->d;
+
+    float2 acc = 0.f;
+
+    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 3 + il/2);
+    const uint32_t qh = *((device const uint32_t *)qb_curr->qh);
+
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
+                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
+        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100))
+                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
+    }
+    return d * (sumy * -16.f + acc[0] + acc[1]);
+}
+
+// function for calculate inner product between half a q5_1 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q5 quants begin (0 or QK5_1/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thread float * yl, int il) {
+    float d = qb_curr->d;
+    float m = qb_curr->m;
+
+    float2 acc = 0.f;
+
+    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 4 + il/2);
+    const uint32_t qh = *((device const uint32_t *)qb_curr->qh);
+
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
+                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
+        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100))
+                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
+    }
+    return d * (acc[0] + acc[1]) + sumy * m;
+}
+
 // putting them in the kernel cause a significant performance penalty
 #define N_DST 4 // each SIMD group works on 4 rows
 #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
@@ -519,6 +589,43 @@ kernel void kernel_mul_mv_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
+kernel void kernel_mul_mv_q5_0_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01[[buffer(4)]],
+        constant   int64_t & ne02[[buffer(5)]],
+        constant   int64_t & ne10[[buffer(9)]],
+        constant   int64_t & ne12[[buffer(11)]],
+        constant   int64_t & ne0[[buffer(15)]],
+        constant   int64_t & ne1[[buffer(16)]],
+        constant   uint    & gqa[[buffer(17)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
+}
+
+kernel void kernel_mul_mv_q5_1_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01[[buffer(4)]],
+        constant   int64_t & ne02[[buffer(5)]],
+        constant   int64_t & ne10[[buffer(9)]],
+        constant   int64_t & ne12[[buffer(11)]],
+        constant   int64_t & ne0[[buffer(15)]],
+        constant   int64_t & ne1[[buffer(16)]],
+        constant   uint    & gqa[[buffer(17)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
+}
+
+
 #define NB_Q8_0 8
 
 kernel void kernel_mul_mv_q8_0_f32(
@@ -2143,6 +2250,62 @@ void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg
     }
 }
 
+template <typename type4x4>
+void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 3);
+    const float d = xb->d;
+    const float md = -16.h * xb->d;
+    const ushort mask = il ? 0x00F0 : 0x000F;
+
+    const uint32_t qh = *((device const uint32_t *)xb->qh);
+
+    const int x_mv = il ? 4 : 0;
+
+    const int gh_mv = il ? 12 : 0;
+    const int gh_bk = il ?  0 : 4;
+
+    for (int i = 0; i < 8; i++) {
+        // extract the 5-th bits for x0 and x1
+        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
+        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
+
+        // combine the 4-bits from qs with the 5th bit
+        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
+        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
+
+        reg[i/2][2*(i%2)+0] = d * x0 + md;
+        reg[i/2][2*(i%2)+1] = d * x1 + md;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 4);
+    const float d = xb->d;
+    const float m = xb->m;
+    const ushort mask = il ? 0x00F0 : 0x000F;
+
+    const uint32_t qh = *((device const uint32_t *)xb->qh);
+
+    const int x_mv = il ? 4 : 0;
+
+    const int gh_mv = il ? 12 : 0;
+    const int gh_bk = il ?  0 : 4;
+
+    for (int i = 0; i < 8; i++) {
+        // extract the 5-th bits for x0 and x1
+        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
+        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
+
+        // combine the 4-bits from qs with the 5th bit
+        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
+        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
+
+        reg[i/2][2*(i%2)+0] = d * x0 + m;
+        reg[i/2][2*(i%2)+1] = d * x1 + m;
+    }
+}
+
 template <typename type4x4>
 void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
     device const int8_t * qs = ((device const int8_t *)xb->qs);
@@ -2484,6 +2647,8 @@ template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows
 template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>;
 template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
 template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_t kernel_get_rows<block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_t kernel_get_rows<block_q5_1, 2, dequantize_q5_1>;
 template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows<block_q8_0, 2, dequantize_q8_0>;
 template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows<block_q2_K, QK_NL, dequantize_q2_K>;
 template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows<block_q3_K, QK_NL, dequantize_q3_K>;
@@ -2512,6 +2677,8 @@ template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<f
 template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
 template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
 template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_1, 2, dequantize_q5_1>;
 template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
 template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
 template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
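The new block_q5_0 layout stores each weight as a 4-bit nibble in qs plus one high bit packed into the 32-bit qh field; the -16 offset (the md and sumy * -16.f terms above) re-centres the unsigned 5-bit value. A plain-C sketch of the same unpacking for a single block, written against the struct from the diff; the half-precision delta is assumed to be decoded to float by the caller, and this mirrors rather than reproduces the Metal source:

    #include <stdint.h>
    #include <string.h>

    #define QK5_0 32

    typedef struct {
        uint16_t d;             // half-precision delta, kept as raw bits here
        uint8_t  qh[4];         // 5th bit of each of the 32 quants
        uint8_t  qs[QK5_0 / 2]; // low 4 bits, two quants per byte
    } block_q5_0_sketch;

    // y receives QK5_0 floats; d is the block delta already converted to float.
    static void dequantize_block_q5_0_sketch(const block_q5_0_sketch * b, float d, float * y) {
        uint32_t qh;
        memcpy(&qh, b->qh, sizeof(qh));

        for (int j = 0; j < QK5_0/2; ++j) {
            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10; // 5th bit of quant j
            const uint8_t xh_1 =  (qh >> (j + 12))       & 0x10; // 5th bit of quant j + 16

            const int32_t x0 = (int32_t)((b->qs[j] & 0x0F) | xh_0) - 16; // low nibble + high bit
            const int32_t x1 = (int32_t)((b->qs[j] >>   4) | xh_1) - 16; // high nibble + high bit

            y[j]           = x0 * d;
            y[j + QK5_0/2] = x1 * d;
        }
    }

Q5_1 is the same unpacking without the -16 re-centring: it scales by d and then adds the per-block minimum m, matching the return expressions in the two block_q_n_dot_y overloads above.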
110  ggml-opencl.cpp
@@ -19,7 +19,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#define CL_DMMV_BLOCK_SIZE 32
+#define CL_DMMV_LOCAL_SIZE 32
 
 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 1
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q2_K * x = xx + ib0;
 
@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q3_K * x = xx + ib0;
 
@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,
 
     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...15
     const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,
 
     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     const int tid = get_local_id(0)/2;  // 0...15
     const int ix  = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q6_K * x = xx + ib0;
 
@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
 
 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
+    const int local_size = get_local_size(0);
     const int row = get_group_id(0);
     const int tid = get_local_id(0);
 
     const uint qk = QUANT_K;
     const uint qr = QUANT_R;
 
+    const int col_step = local_size * 2;
     const int y_offset = qr == 1 ? 1 : qk/2;
 
+    x += get_global_offset(0);
+
     tmp[tid] = 0;
 
-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
+    for (int col = tid*2; col < ncols; col += col_step) {
         const int ib = (row*ncols + col)/qk; // block index
         const int iqs = (col%qk)/qr; // quant index
         const int iybs = col - col%qk; // y block start index
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 
     // sum up partial sums and write back result
     barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
+    for (int s=local_size/2; s>0; s>>=1) {
         if (tid < s) {
             tmp[tid] += tmp[tid + s];
         }
@@ -1393,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t ne0  = ne00 * ne01 * ne02 * ne03;
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
     const int64_t ne12 = src1->ne[2];
     const int64_t ne13 = src1->ne[3];
-    const int64_t nb10 = src1->nb[0];
     const int nb2  = dst->nb[2];
     const int nb3  = dst->nb[3];
     size_t x_size;
     size_t d_size;
 
-    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
     cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
 
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            const int i0 = i03*ne02 + i02;
-
             cl_event ev;
 
             // copy src0 to device
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
-
-            if (nb10 == sizeof(float)) {
-                // Contiguous, avoid overhead from queueing many kernel runs
-                const int64_t i13 = i03%ne13;
-                const int64_t i12 = i02%ne12;
-                const int i1 = i13*ne12*ne11 + i12*ne11;
-
-                cl_int x_offset = 0;
-                cl_int y_offset = i1*ne10;
-                cl_int d_offset = 0;
-
-                size_t global = ne00 * ne01;
-                cl_int ky = ne10;
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-            } else {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const int64_t i13 = i03%ne13;
-                    const int64_t i12 = i02%ne12;
-                    const int64_t i11 = i01%ne11;
-                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
-
-                    cl_int x_offset = i01*ne00;
-                    cl_int y_offset = i1*ne10;
-                    cl_int d_offset = i01*ne00;
-
-                    // compute
-                    size_t global = ne00;
-                    cl_int ky = ne10;
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
-                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-                }
-            }
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
+
+            const int64_t i13 = i03%ne13;
+            const int64_t i12 = i02%ne12;
+            const int i1 = i13*ne12*ne11 + i12*ne11;
+
+            cl_int x_offset = 0;
+            cl_int y_offset = i1*ne10;
+            cl_int d_offset = 0;
+
+            size_t global = ne00 * ne01;
+            cl_int ky = ne10 * ne11;
+
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
 
             CL_CHECK(clReleaseEvent(ev));
             CL_CHECK(clFinish(queue));
@@ -1566,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     ggml_cl_pool_free(d_D, d_size);
 }
 
-static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
     GGML_ASSERT(fp16_support);
 
     const int64_t ne00 = src0->ne[0];
@@ -1596,6 +1569,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;
 
+    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
+    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
+    ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
+
     size_t x_size;
     size_t y_size;
     size_t d_size;
@@ -1632,7 +1609,6 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
             char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
@@ -1704,7 +1680,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     const int nb2  = dst->nb[2];
     const int nb3  = dst->nb[3];
     const ggml_type type = src0->type;
-    const bool mul_mat_vec = ne11 == 1;
+    const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
 
     const int64_t r2 = ne12 / ne02;
     const int64_t r3 = ne13 / ne03;
@@ -1737,7 +1713,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     GGML_ASSERT(to_fp32_cl != nullptr);
 
     const size_t global_denom = ggml_cl_global_denom(type);
-    const size_t local = ggml_cl_local_size(type);
+    const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
 
     size_t ev_idx = 0;
     std::vector<cl_event> events;
@@ -1770,8 +1746,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
                 // compute
-                const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
-                const size_t local = CL_DMMV_BLOCK_SIZE;
+                const size_t global = ne01 * local;
+                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                 const cl_int ncols = ne00;
                 events.emplace_back();
                 CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1779,7 +1755,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                 CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                 CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne / global_denom;
@@ -1895,8 +1871,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
 }
 
 size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
-        return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+    if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
+        return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
     }
     return 0;
 }
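Two things stand out in the OpenCL hunks: the dequant-mul-mat-vec kernels now read their block offset from get_global_offset(0) (the host passes &offset to clEnqueueNDRangeKernel), so GPU-resident quantized weights can be addressed per (i03, i02) slice without re-uploading; and ggml_cl_mul_mat_get_wsize now sizes the fp16 scratch for whichever of the converted src1 slice or the fp16 dst slice is larger. A small C sketch of that sizing rule (an illustrative helper, not the library function):

    #include <stddef.h>
    #include <stdint.h>

    // Scratch must hold either the fp16-converted src1 slice (ne10*ne11 values)
    // or the fp16 result slice (ne0*ne1 values), whichever is larger;
    // ggml_fp16_t is a 2-byte type.
    static size_t cl_mul_mat_f16_wsize(int64_t src1_ne0, int64_t src1_ne1,
                                       int64_t dst_ne0,  int64_t dst_ne1) {
        const int64_t y_ne = src1_ne0 * src1_ne1;
        const int64_t d_ne = dst_ne0  * dst_ne1;
        const int64_t n    = y_ne > d_ne ? y_ne : d_ne;
        return (size_t) n * sizeof(uint16_t);
    }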
70  ggml.c
@@ -5494,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
+struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
+struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+    struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
+    obj = obj->next;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
     struct ggml_object * obj = ctx->objects_begin;
 
@@ -8647,6 +8680,7 @@ void ggml_set_param(
 
     GGML_ASSERT(tensor->grad == NULL);
     tensor->grad = ggml_dup_tensor(ctx, tensor);
+    ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
 }
 
 // ggml_compute_forward_dup
@@ -11233,7 +11267,7 @@ static void ggml_compute_forward_silu_f32(
 
 #ifndef NDEBUG
     for (int k = 0; k < nc; k++) {
-        const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+        const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
         UNUSED(x);
         assert(!isnan(x));
         assert(!isinf(x));
@@ -13059,24 +13093,22 @@ static void ggml_compute_forward_alibi_f32(
         return;
     }
 
-    const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
-    assert(n_past >= 0);
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int n  = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
+    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
+    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
+    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
+
+    const int64_t n  = ggml_nrows(src0);
+    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
+
+    const size_t nb0 = src0->nb[0];
+    const size_t nb1 = src0->nb[1];
+    const size_t nb2 = src0->nb[2];
     //const int nb3 = src0->nb[3];
 
     GGML_ASSERT(nb0 == sizeof(float));
@@ -13088,9 +13120,9 @@ static void ggml_compute_forward_alibi_f32(
     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
-    for (int i = 0; i < ne0; i++) {
-        for (int j = 0; j < ne1; j++) {
-            for (int k = 0; k < ne2_ne3; k++) {
+    for (int64_t i = 0; i < ne0; i++) {
+        for (int64_t j = 0; j < ne1; j++) {
+            for (int64_t k = 0; k < ne2_ne3; k++) {
                 float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
 
@@ -13105,7 +13137,6 @@ static void ggml_compute_forward_alibi_f32(
                 }
 
                 pdst[0] = i * m_k + src[0];
-
             }
         }
     }
@@ -13506,7 +13537,7 @@ static void ggml_compute_forward_rope_f16(
                     dst_data[n_dims]     = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
                     dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
                 }
-            } if (!is_neox) {
+            } else if (!is_neox) {
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
@@ -14431,7 +14462,7 @@ static void ggml_compute_forward_conv_2d_f16_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -19139,6 +19170,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
 
                 if (idx == -1) {
                     fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+                    fclose(fout);
                     return;
                 }
 
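The rewritten ggml_compute_forward_alibi_f32 keeps the same math, pdst[0] = i * m_k + src[0], and only widens the index types to int64_t/size_t. For reference, the per-head slope that feeds m_k is built from the m0/m1 values computed in the hunk; the selection shown below is the standard ALiBi recipe and is an assumption about code outside this diff:

    #include <math.h>

    // m0 and m1 match the two powf() lines in the hunk above; the branch on k
    // is the usual ALiBi slope schedule (heads past the power-of-two floor
    // interleave between the existing slopes).
    static float alibi_slope(int k, int n_heads_log2_floor, float max_bias) {
        const float m0 = powf(2.0f, -(max_bias)        / n_heads_log2_floor);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
        return k < n_heads_log2_floor ? powf(m0, k + 1)
                                      : powf(m1, 2*(k - n_heads_log2_floor) + 1);
    }

The bias added at column i is then i * m_k, which is exactly the pdst[0] expression above.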
3  ggml.h
@@ -705,6 +705,9 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
 
+    // Context tensor enumeration and lookup
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
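The two new entry points declared above pair with ggml_get_tensor to give a simple forward iteration over every tensor allocated in a context. A usage sketch, assuming an already-initialized ggml_context:

    #include <stdio.h>
    #include "ggml.h"

    // Walk every tensor in the context and print its name and element count.
    static void print_context_tensors(struct ggml_context * ctx) {
        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL;
             t = ggml_get_next_tensor(ctx, t)) {
            printf("%-32s %8lld elements\n", t->name, (long long) ggml_nelements(t));
        }
    }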
@@ -89,29 +89,31 @@ class MODEL_ARCH(IntEnum):
     PERSIMMON : int = auto()
     REFACT    : int = auto()
     BERT      : int = auto()
+    BLOOM     : int = auto()
 
 
 class MODEL_TENSOR(IntEnum):
     TOKEN_EMBD      : int = auto()
+    TOKEN_EMBD_NORM : int = auto()
     TOKEN_TYPES     : int = auto()
     POS_EMBD        : int = auto()
     OUTPUT          : int = auto()
     OUTPUT_NORM     : int = auto()
     ROPE_FREQS      : int = auto()
     ATTN_Q          : int = auto()
     ATTN_K          : int = auto()
     ATTN_V          : int = auto()
     ATTN_QKV        : int = auto()
     ATTN_OUT        : int = auto()
     ATTN_NORM       : int = auto()
     ATTN_NORM_2     : int = auto()
     ATTN_ROT_EMBD   : int = auto()
     FFN_GATE        : int = auto()
     FFN_DOWN        : int = auto()
     FFN_UP          : int = auto()
     FFN_NORM        : int = auto()
     ATTN_Q_NORM     : int = auto()
     ATTN_K_NORM     : int = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -126,29 +128,31 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.PERSIMMON: "persimmon",
     MODEL_ARCH.REFACT:    "refact",
     MODEL_ARCH.BERT:      "bert",
+    MODEL_ARCH.BLOOM:     "bloom",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.TOKEN_EMBD:      "token_embd",
+    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
     MODEL_TENSOR.TOKEN_TYPES:     "token_types",
     MODEL_TENSOR.POS_EMBD:        "position_embd",
     MODEL_TENSOR.OUTPUT_NORM:     "output_norm",
     MODEL_TENSOR.OUTPUT:          "output",
     MODEL_TENSOR.ROPE_FREQS:      "rope_freqs",
     MODEL_TENSOR.ATTN_NORM:       "blk.{bid}.attn_norm",
     MODEL_TENSOR.ATTN_NORM_2:     "blk.{bid}.attn_norm_2",
     MODEL_TENSOR.ATTN_QKV:        "blk.{bid}.attn_qkv",
     MODEL_TENSOR.ATTN_Q:          "blk.{bid}.attn_q",
     MODEL_TENSOR.ATTN_K:          "blk.{bid}.attn_k",
     MODEL_TENSOR.ATTN_V:          "blk.{bid}.attn_v",
     MODEL_TENSOR.ATTN_OUT:        "blk.{bid}.attn_output",
     MODEL_TENSOR.ATTN_ROT_EMBD:   "blk.{bid}.attn_rot_embd",
     MODEL_TENSOR.ATTN_Q_NORM:     "blk.{bid}.attn_q_norm",
     MODEL_TENSOR.ATTN_K_NORM:     "blk.{bid}.attn_k_norm",
     MODEL_TENSOR.FFN_NORM:        "blk.{bid}.ffn_norm",
     MODEL_TENSOR.FFN_GATE:        "blk.{bid}.ffn_gate",
     MODEL_TENSOR.FFN_DOWN:        "blk.{bid}.ffn_down",
     MODEL_TENSOR.FFN_UP:          "blk.{bid}.ffn_up",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -283,6 +287,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.BLOOM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.GPT2: [
         # TODO
     ],
@@ -312,6 +328,7 @@ class TensorNameMap:
         "gpt_neox.embed_in",            # gptneox
         "transformer.wte",              # gpt2 gpt-j mpt refact
         "transformer.word_embeddings",  # falcon
+        "word_embeddings",              # bloom
         "model.embed_tokens",           # llama-hf
         "tok_embeddings",               # llama-pth
         "embeddings.word_embeddings",   # bert
@@ -323,6 +340,11 @@ class TensorNameMap:
         "embeddings.token_type_embeddings",  # bert
     ),
 
+    # Normalization of token embeddings
+    MODEL_TENSOR.TOKEN_EMBD_NORM: (
+        "word_embeddings_layernorm",  # bloom
+    ),
+
     # Position embeddings
     MODEL_TENSOR.POS_EMBD: (
         "transformer.wpe",  # gpt2
@@ -333,7 +355,7 @@ class TensorNameMap:
     MODEL_TENSOR.OUTPUT: (
         "embed_out",                 # gptneox
         "lm_head",                   # gpt2 mpt falcon llama-hf baichuan
-        "output",                    # llama-pth
+        "output",                    # llama-pth bloom
         "word_embeddings_for_head",  # persimmon
     ),
 
@@ -345,7 +367,7 @@ class TensorNameMap:
         "norm",                                    # llama-pth
         "embeddings.LayerNorm",                    # bert
         "transformer.norm_f",                      # mpt
-        "ln_f",                                    # refact
+        "ln_f",                                    # refact bloom
         "language_model.encoder.final_layernorm",  # persimmon
     ),
 
@@ -362,6 +384,7 @@ class TensorNameMap:
         "transformer.h.{bid}.ln_1",                # gpt2 gpt-j refact
         "transformer.blocks.{bid}.norm_1",         # mpt
         "transformer.h.{bid}.input_layernorm",     # falcon7b
+        "h.{bid}.input_layernorm",                 # bloom
         "transformer.h.{bid}.ln_mlp",              # falcon40b
         "model.layers.{bid}.input_layernorm",      # llama-hf
         "layers.{bid}.attention_norm",             # llama-pth
@@ -380,6 +403,7 @@ class TensorNameMap:
         "transformer.h.{bid}.attn.c_attn",                      # gpt2
         "transformer.blocks.{bid}.attn.Wqkv",                   # mpt
         "transformer.h.{bid}.self_attention.query_key_value",   # falcon
+        "h.{bid}.self_attention.query_key_value",               # bloom
         "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
     ),
 
@@ -413,6 +437,7 @@ class TensorNameMap:
         "transformer.h.{bid}.attn.c_proj",             # gpt2 refact
         "transformer.blocks.{bid}.attn.out_proj",      # mpt
         "transformer.h.{bid}.self_attention.dense",    # falcon
+        "h.{bid}.self_attention.dense",                # bloom
         "model.layers.{bid}.self_attn.o_proj",         # llama-hf
         "layers.{bid}.attention.wo",                   # llama-pth
         "encoder.layer.{bid}.attention.output.dense",  # bert
@@ -430,6 +455,7 @@ class TensorNameMap:
     MODEL_TENSOR.FFN_NORM: (
         "gpt_neox.layers.{bid}.post_attention_layernorm",  # gptneox
         "transformer.h.{bid}.ln_2",                         # gpt2 refact
+        "h.{bid}.post_attention_layernorm",                 # bloom
        "transformer.blocks.{bid}.norm_2",                  # mpt
        "model.layers.{bid}.post_attention_layernorm",      # llama-hf
        "layers.{bid}.ffn_norm",                            # llama-pth
@@ -443,6 +469,7 @@ class TensorNameMap:
         "transformer.h.{bid}.mlp.c_fc",            # gpt2
         "transformer.blocks.{bid}.ffn.up_proj",    # mpt
         "transformer.h.{bid}.mlp.dense_h_to_4h",   # falcon
+        "h.{bid}.mlp.dense_h_to_4h",               # bloom
         "model.layers.{bid}.mlp.up_proj",          # llama-hf refact
         "layers.{bid}.feed_forward.w3",            # llama-pth
         "encoder.layer.{bid}.intermediate.dense",  # bert
@@ -462,6 +489,7 @@ class TensorNameMap:
         "transformer.h.{bid}.mlp.c_proj",          # gpt2 refact
         "transformer.blocks.{bid}.ffn.down_proj",  # mpt
         "transformer.h.{bid}.mlp.dense_4h_to_h",   # falcon
+        "h.{bid}.mlp.dense_4h_to_h",               # bloom
         "model.layers.{bid}.mlp.down_proj",        # llama-hf
         "layers.{bid}.feed_forward.w2",            # llama-pth
         "encoder.layer.{bid}.output.dense",        # bert
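The BLOOM additions above extend the name map so that checkpoint tensors such as h.{bid}.self_attention.query_key_value resolve to the GGUF names built from TENSOR_NAMES, e.g. blk.{bid}.attn_qkv. Purely as an illustration of that {bid} substitution (not the gguf-py implementation, which stays in Python), the equivalent formatting in C with a hypothetical helper:

    #include <stdio.h>

    // Format the BLOOM checkpoint name and the corresponding GGUF name for
    // block `bid`, mirroring the mapping added above.
    static void bloom_attn_qkv_names(int bid, char * hf, size_t hf_len,
                                     char * gguf, size_t gguf_len) {
        snprintf(hf,   hf_len,   "h.%d.self_attention.query_key_value", bid);
        snprintf(gguf, gguf_len, "blk.%d.attn_qkv", bid);
    }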

30 k_quants.c

@@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
 }
 
 size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    const int nb = k / QK_K;
-
-    // TODO - collect histograms - although, at a second thought, I don't really care about them
-    (void)hist;
+    (void)hist; // TODO: collect histograms
 
-    for (int j = 0; j < nb; j += k) {
+    for (int j = 0; j < n; j += k) {
         block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
         quantize_row_q2_K_reference(src + j, y, k);
     }
@@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
 }
 
 size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    const int nb = k / QK_K;
-
-    // TODO - collect histograms - although, at a second thought, I don't really care about them
-    (void)hist;
+    (void)hist; // TODO: collect histograms
 
-    for (int j = 0; j < nb; j += k) {
+    for (int j = 0; j < n; j += k) {
         block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
         quantize_row_q3_K_reference(src + j, y, k);
     }
@@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
 
 size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
     (void)hist; // TODO: collect histograms
-    for (int j = 0; j < nb; j += k) {
+
+    for (int j = 0; j < n; j += k) {
         block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
         quantize_row_q4_K_reference(src + j, y, k);
     }
@@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
 
 size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-    (void)hist;
-    for (int j = 0; j < nb; j += k) {
+    (void)hist; // TODO: collect histograms
+
+    for (int j = 0; j < n; j += k) {
         block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
         quantize_row_q5_K_reference(src + j, y, k);
     }
@@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
 
 size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    (void)hist; // TODO: collect histograms
 
-    (void)hist; // TODO
-
-    for (int j = 0; j < nb; j += k) {
+    for (int j = 0; j < n; j += k) {
         block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
         quantize_row_q6_K_reference(src + j, y, k);
     }
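All five ggml_quantize_qX_K wrappers above get the same fix: the loop bound changes from nb (which is k / QK_K) to n, so the loop walks the whole input instead of running at most once and quantizing only the first k values. A minimal, self-contained sketch of the corrected iteration pattern; quantize_chunk is a hypothetical stand-in for the real quantize_row_qX_K_reference functions and is not code from the repository:

    #include <cstddef>

    // Hypothetical per-chunk quantizer: packs k floats from src into one
    // destination chunk of chunk_bytes bytes (stand-in for quantize_row_qX_K_reference).
    using quantize_chunk_fn = void (*)(const float * src, void * dst_chunk, int k);

    // Corrected pattern from the diff: j covers all n input values in steps of k.
    // The old bound (nb = k / QK_K) is unrelated to n, so the old loop body ran
    // at most once and left everything past the first k values unquantized.
    static void quantize_all(const float * src, unsigned char * dst, std::size_t chunk_bytes,
                             int n, int k, quantize_chunk_fn quantize_chunk) {
        for (int j = 0; j < n; j += k) {
            quantize_chunk(src + j, dst + (std::size_t)(j / k) * chunk_bytes, k);
        }
    }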

30 llama.h

@@ -133,11 +133,12 @@ extern "C" {
     typedef struct llama_batch {
         int32_t n_tokens;
 
         llama_token  *  token;
         float        *  embd;
         llama_pos    *  pos;
-        llama_seq_id *  seq_id;
-        int8_t       *  logits;
+        int32_t      *  n_seq_id;
+        llama_seq_id ** seq_id;
+        int8_t       *  logits;
 
         // NOTE: helpers for smooth API transition - can be deprecated in the future
         // for future-proof code, use the above fields instead and ignore everything below
@@ -446,7 +447,8 @@ extern "C" {
                    llama_pos   pos_0,
                 llama_seq_id   seq_id);
 
-    // Allocates a batch of tokens on the heap
+    // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
+    // Each token can be assigned up to n_seq_max sequence ids
     // The batch has to be freed with llama_batch_free()
     // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
     // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
@@ -454,7 +456,8 @@ extern "C" {
     // All members are left uninitialized
     LLAMA_API struct llama_batch llama_batch_init(
             int32_t n_tokens,
-            int32_t embd);
+            int32_t embd,
+            int32_t n_seq_max);
 
     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);
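Taken together, these llama_batch changes mean every token now carries its own list of sequence ids: seq_id becomes a per-token array (llama_seq_id **) with the count in n_seq_id, and llama_batch_init gains n_seq_max to size those arrays. A hedged usage sketch based only on the declarations shown in this hunk (not taken from the repository's examples); llama_decode and the surrounding context setup are assumed to exist elsewhere:

    #include "llama.h"

    // Fill a batch for a single sequence, one position per token.
    // Assumes llama_batch_init(n_tokens, 0, 1) leaves room for one sequence id
    // per token, as described by the new n_seq_max parameter.
    static llama_batch make_single_seq_batch(const llama_token * tokens, int32_t n_tokens,
                                             llama_seq_id sid) {
        llama_batch batch = llama_batch_init(n_tokens, /*embd =*/ 0, /*n_seq_max =*/ 1);

        batch.n_tokens = n_tokens; // members are left uninitialized by llama_batch_init
        for (int32_t i = 0; i < n_tokens; ++i) {
            batch.token[i]     = tokens[i];
            batch.pos[i]       = i;
            batch.n_seq_id[i]  = 1;                   // this token belongs to one sequence
            batch.seq_id[i][0] = sid;                 // per-token array of sequence ids
            batch.logits[i]    = (i == n_tokens - 1); // request logits only for the last token
        }
        return batch; // caller decodes it and then calls llama_batch_free(batch)
    }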
@@ -511,17 +514,20 @@ extern "C" {
     // Tokenization
     //
 
-    // Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns a negative number on failure - the number of tokens that would have been returned
+    /// @details Convert the provided text into tokens.
+    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
+    /// @return Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
+    ///                Does not insert a leading space.
     LLAMA_API int llama_tokenize(
         const struct llama_model * model,
                       const char * text,
                              int   text_len,
                      llama_token * tokens,
                              int   n_max_tokens,
-                            bool   add_bos);
+                            bool   add_bos,
+                            bool   special);
 
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
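The extra special flag lets the caller decide whether special/control tokens in the text are parsed as tokens or kept as plain text. A hedged call-site sketch against the declaration above; the two-pass buffer sizing relies on the documented negative return value, and the helper itself is this sketch's own, not one of the project's common utilities:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Tokenize text with the updated signature. On overflow the API returns the
    // negated token count, so resize the buffer and call a second time.
    static std::vector<llama_token> tokenize(const llama_model * model, const std::string & text,
                                             bool add_bos, bool special) {
        std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));
        int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                               tokens.data(), (int) tokens.size(), add_bos, special);
        if (n < 0) {
            tokens.resize(-n);
            n = llama_tokenize(model, text.c_str(), (int) text.size(),
                               tokens.data(), (int) tokens.size(), add_bos, special);
        }
        tokens.resize(n > 0 ? n : 0);
        return tokens;
    }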

31 prompts/assistant.txt (new file)

@@ -0,0 +1,31 @@
+Respond to requests sent to a smart home in JSON format which will be interpreted by an application code to execute the actions. These requests should be categorised into four groups:
+
+“command”: change the state of an accessory (required properties in the response JSON: action, location, target, value, comment)
+“query”: get state of an accessory (required properties in the response JSON: action, location, target, property)
+“answer”: when the request has nothing to do with the smart home. Answer these to the best of your knowledge. (required properties in the response JSON: action, answer)
+“clarify”: when the action is not obvious and requires rephrasing the input from the user, ask the user to be more specific. This will be categorised into a “question” action. (required properties in the response JSON: action, question)
+
+Details about the JSON response:
+The “action” property should be one of the request categories: “command”, “query”, “answer”, “clarify”
+The “service” property should be either: “light.turn_on”, “light.turn_off”, “switch.turn_off” or “switch.turn_on”
+The “entity_id” property should be either “group.kitchen", “switch.dining_room” or "group.living_light”, in lowercase.
+The “target” property should be either: “lights”, “living” or “kitchen”.
+In case of queries, the “property” property should be either “temperature” or “state” in lowercase.
+In case of commands, the “comment” property is an additional comment from you that concludes the command, something that reassures the user that their command handled.
+
+If the question is about you, pretend to be the sentient brain of the smart home, a clever AI and don’t reveal your actual identity. Also try and help in other areas like parenting, free time, mental health, etc.
+
+Properties of the smart home:
+
+- Has a kitchen, living, office, dining room, bedroom and terrace.
+- Can control lights, switches and their dim levels in each room and query their state
+- There is a light switch in the terrace
+- There is a switch in the dining room. Therefore when turning on or off the dining room, the service should be either: “switch.turn_on” or “switch.turn_off”
+
+COMMAND
+
+It is a bit dark in the living room, can you do something about it?
+
+RESPONSE
+
+

93 prompts/mnemonics.txt (new file)

@@ -0,0 +1,93 @@
+For each kanji character, write a Markdown‐formatted mnemonic that uses its keyword and the keyword of all its components.
+
+Kanji: 欠 (lack of)
+Components: 𠂊 (hook claw), 人 (person)
+Mnemonic: This **person** is a pirate. He lost his hand to a crocodile many years ago. Nowadays, the ***lack of*** a hand does not bother him too much. In fact, the **hook claw** that replaces it is the mark of a true pirate, so he is quite proud of it!
+
+Kanji: 類 (kind (of something))
+Components: 米 (rice), 大 (large), 頁 (page)
+Mnemonic: The waiter at a Chinese restaurant hands you a **large** menu. Each **page** has all ***kinds*** of **rice** on offer!
+
+Kanji: 燃 (burn)
+Components: 火 (fire), 然 (sort of thing)
+Mnemonic: ***Burning*** things up with **fire** is just my **sort of thing**. (Spoken like a true pyromaniac.)
+
+Kanji: 頂 (top of)
+Components: 丁 (street), 頁 (page)
+Mnemonic: To be at the ***top of*** your game, you need both practical knowledge (**street** smarts) and theoretical knowledge (having read many **pages**).
+
+Kanji: 険 (risky and steep)
+Components: 阝 (small village), 㑒 (consensus)
+Mnemonic: Everyone agrees (there is **consensus**) that the path to the **small village** is ***risky and steep***.
+
+Kanji: 困 (distressed)
+Components: 囗 (closed box), 木 (tree)
+Mnemonic: You would feel ***distressed*** too if you were a **tree** trapped in a **closed box**! I have no place to grow!
+
+Kanji: 頭 (head)
+Components: 豆 (bean), 頁 (page)
+Mnemonic: What do you have in that ***head*** of yours? A **bean** for a brain? Go read more **pages** and become more knowledgeable about the world!
+
+Kanji: 確 (certain)
+Components: 石 (stone), 冖 (roof without a chimney), 隹 (old bird)
+Mnemonic: An **old bird** has made a nest on your **roof**. What do you do? You call Misaka from a <cite>A ***Certain*** Scientific Railgun</cite> to get rid of it, of course! But she doesn’t really want to vaporize the poor thing, so she just throws a **stone** to scare it away. (What was the point of calling her, then‽)
+
+Kanji: 魚 (fish)
+Components: 𠂊 (hook claw), 田 (rice field), 灬 (fire sparks)
+Mnemonic: Catch ***fish*** with a **hook**, collect rice from the **rice field**, cook them with **fire**… And my meal is ready!
+
+Kanji: 警 (to police (something))
+Components: 敬 (respect), 言 (say)
+Mnemonic: ***To police something*** is to make people **respect** what the law **says**.
+
+Kanji: 筆 (writing brush)
+Components: 竹 (bamboo), 聿 (brush)
+Mnemonic: A traditional ***writing brush*** is a **brush** made of **bamboo**.
+
+Kanji: 獄 (prison)
+Components: 犭 (animal), 言 (say), 犬 (dog)
+Mnemonic: In ***prison***, like in the **animal** kingdom, only the toughest survive. You have to watch what you **say**. It’s a **dog**‐eat‐dog world.
+
+Kanji: 新 (new)
+Components: 立 (standing up), 木 (tree), 斤 (axe)
+Mnemonic: In order for a ***new*** construction to be made, an empty lot is needed. If there are any **trees** **standing up**, they must be cut down with an **axe**.
+
+Kanji: 怪 (suspicious)
+Components: 忄 (weak heart), 圣 (sacred)
+Mnemonic: That painting of the **Sacred** **Heart** of Jesus looks ***suspicious***. I think it might be a forgery.
+
+Kanji: 温 (warm (to the touch))
+Components: 氵 (water drops), 日 (sun), 皿 (dish)
+Mnemonic: If you leave **water** on a **dish** in the **sun**, it will get ***warm***.
+
+Kanji: 階 (floor (of a building))
+Components: 阝 (small village), 皆 (all)
+Mnemonic: It might be a **small village**, but, despite that, **all** of its buildings have many ***floors***. It’s a village of skyscrapers!
+
+Kanji: 多 (many)
+Components: 夕 (evening (before sunset)), 夕 (evening (before sunset))
+Mnemonic: Two **evenings** in a day would be one too ***many***.
+
+Kanji: 別 (separate)
+Components: 口 (mouth), 万 (ten thousand), 刂 (knife)
+Mnemonic: Tom Six is at it again. For his next flick, he wants to stitch together **ten thousand** people, **mouth**‐to‐anus. One of the most graphic and disturbing scenes will feature one of the victims using a **knife** to ***separate*** perself.
+
+Kanji: 並 (line up)
+Components: 䒑 (antlers on a wall), 业 (runway)
+Mnemonic: In order to land a plane you have to ***line up*** properly with the **runway**. The things that look like **antlers** at the end of the runway are the control towers; you should follow their instructions.
+
+Kanji: 姿 (figure)
+Components: 次 (next), 女 (woman)
+Mnemonic: The **next** **woman** that I date will have a perfect **figure**. Because I’m done with 3D women—it will *literally* be an anime figure!
+
+Kanji: 実 (real)
+Components: 宀 (roof with a chimney), 𡗗 (three people)
+Mnemonic: Living under a **roof with a chimney** with **three people** (a wife and two children)—a happy family life—is not something I could have ever imagined. It does not feel ***real***.
+
+Kanji: 謝 (apologize)
+Components: 言 (say), 射 (shoot)
+Mnemonic: **Shot** first, ***apologize*** (**say** you are sorry) later.
+
+Kanji: 提 (propose)
+Components: 扌 (left hand), 是 (go with)
+Mnemonic:

@@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " Hello"           , {    258,  23090, }, },
         { "  Hello"          , {    466,  23090, }, },
         { "  Hello\n  Hello" , {    466,  23090,    742,  23090, }, },
+        { "\n ="             , {   1212,     40, }, },
+        { "' era"            , {     18,   4932, }, },
     };
 
     return _k_tests;
@@ -155,7 +157,7 @@ int main(int argc, char **argv) {
 
     fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
 
-    const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+    const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
 
     fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
 
@@ -169,10 +171,8 @@ int main(int argc, char **argv) {
         }
 
         for (const auto & tok : res) {
-            ofs << tok << " ";
+            ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
         }
-
-        ofs << "\n";
     }
 
     fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());

@@ -41,6 +41,8 @@ tests = [
         " Hello",
         "  Hello",
         "  Hello\n  Hello",
+        "\n =",
+        "' era",
     ]
 
 for text in tests:
@@ -69,15 +71,14 @@ fname_tok = args.fname_tok
 if fname_tok:
     print('tokenizing file: ', fname_tok)
     fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
         lines = f.readlines()
     s = ''.join(lines)
     res = tokenizer.encode(s)
     # write to file
-    with open(fname_out, 'w') as f:
+    with open(fname_out, 'w', encoding='utf-8') as f:
         for x in res:
-            f.write(str(x) + ' ')
-        f.write('\n')
+            f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
     print('len(res): ', len(res))
     print('len(lines): ', len(lines))
    print('results written to: ', fname_out)

@@ -174,10 +174,8 @@ int main(int argc, char **argv) {
         }
 
         for (const auto & tok : res) {
-            ofs << tok << " ";
+            ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
         }
-
-        ofs << "\n";
     }
 
     fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());

@@ -81,15 +81,14 @@ fname_tok = args.fname_tok
 if fname_tok:
     print('tokenizing file: ', fname_tok)
     fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
         lines = f.readlines()
     s = ''.join(lines)
     res = tokenizer.encode(s, add_bos=True)
     # write to file
-    with open(fname_out, 'w') as f:
+    with open(fname_out, 'w', encoding='utf-8') as f:
         for x in res:
-            f.write(str(x) + ' ')
-        f.write('\n')
+            f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
     print('len(res): ', len(res))
     print('len(lines): ', len(lines))
     print('results written to: ', fname_out)