Merge master and fix conflicts

commit 1403d87cca
27 changed files with 1157 additions and 362 deletions
.github/workflows/build.yml (vendored): 5 changes

@@ -276,6 +276,11 @@ jobs:
       run: |
         xcodebuild -scheme llama -destination "${{ matrix.destination }}"
 
+    - name: Build Swift Example
+      id: make_build_swift_example
+      run: |
+        make swift
+
   windows-latest-cmake:
     runs-on: windows-latest
 
.gitignore (vendored): 1 change

@@ -56,6 +56,7 @@ models-mnt
 /server
 /simple
 /batched
+/batched-bench
 /export-lora
 /finetune
 /speculative
Makefile: 108 changes

@@ -1,8 +1,14 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
+BUILD_TARGETS = \
+	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+	simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search \
+	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
 
 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
+TEST_TARGETS = \
+	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
+	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -172,6 +178,24 @@ else
 MK_CPPFLAGS += -DNDEBUG
 endif
 
+ifdef LLAMA_SANITIZE_THREAD
+MK_CFLAGS   += -fsanitize=thread -g
+MK_CXXFLAGS += -fsanitize=thread -g
+MK_LDFLAGS  += -fsanitize=thread -g
+endif
+
+ifdef LLAMA_SANITIZE_ADDRESS
+MK_CFLAGS   += -fsanitize=address -fno-omit-frame-pointer -g
+MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
+MK_LDFLAGS  += -fsanitize=address -fno-omit-frame-pointer -g
+endif
+
+ifdef LLAMA_SANITIZE_UNDEFINED
+MK_CFLAGS   += -fsanitize=undefined -g
+MK_CXXFLAGS += -fsanitize=undefined -g
+MK_LDFLAGS  += -fsanitize=undefined -g
+endif
+
 ifdef LLAMA_SERVER_VERBOSE
 MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif
@@ -520,7 +544,13 @@ OBJS += ggml-alloc.o ggml-backend.o
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-common.o: common/common.cpp common/common.h build-info.h common/log.h
+COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
+COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o
+
+common.o: common/common.cpp $(COMMON_H_DEPS)
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 console.o: common/console.cpp common/console.h
@@ -542,19 +572,22 @@ clean:
 # Examples
 #
 
-main: examples/main/main.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-infill: examples/infill/infill.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
+infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-batched: examples/batched/batched.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
@@ -563,56 +596,56 @@ quantize: examples/quantize/quantize.cpp build-info.h ggml.
 quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
-$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
 
 
-embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS)
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h examples/llava/stb_image.h ggml.o llama.o common.o $(OBJS)
+llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h examples/llava/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o train.o $(OBJS)
+baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS)
+finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 ifdef LLAMA_METAL
@@ -620,6 +653,11 @@ metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 endif
 
+ifeq ($(UNAME_S),Darwin)
+swift: examples/batched.swift
+	(cd examples/batched.swift; make build)
+endif
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh $(CC) > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
@@ -640,7 +678,7 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 run-benchmark-matmult: benchmark-matmult
 	./$@
 
-.PHONY: run-benchmark-matmult
+.PHONY: run-benchmark-matmult swift
 
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@@ -648,40 +686,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 tests/test-c.o: tests/test-c.c llama.h
build.zig

@@ -128,17 +128,18 @@ pub fn build(b: *std.build.Builder) !void {
     const llama = make.obj("llama", "llama.cpp");
     const common = make.obj("common", "common/common.cpp");
     const console = make.obj("console", "common/console.cpp");
+    const sampling = make.obj("sampling", "common/sampling.cpp");
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
     const train = make.obj("train", "common/train.cpp");
 
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, console, grammar_parser });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
     _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
     _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
     _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
     _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, grammar_parser });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
common/CMakeLists.txt

@@ -5,6 +5,8 @@ set(TARGET common)
 add_library(${TARGET} OBJECT
     common.h
     common.cpp
+    sampling.h
+    sampling.cpp
     console.h
     console.cpp
     grammar-parser.h
common/common.cpp

@@ -107,6 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     std::string arg;
     gpt_params default_params;
     const std::string arg_prefix = "--";
+    llama_sampling_params & sparams = params.sampling_params;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -184,7 +185,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-           params.top_k = std::stoi(argv[i]);
+           sparams.top_k = std::stoi(argv[i]);
        } else if (arg == "-c" || arg == "--ctx-size") {
            if (++i >= argc) {
                invalid_param = true;
@@ -216,73 +217,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-           params.top_p = std::stof(argv[i]);
+           sparams.top_p = std::stof(argv[i]);
        } else if (arg == "--temp") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           params.temp = std::stof(argv[i]);
+           sparams.temp = std::stof(argv[i]);
        } else if (arg == "--tfs") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           params.tfs_z = std::stof(argv[i]);
+           sparams.tfs_z = std::stof(argv[i]);
        } else if (arg == "--typical") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           params.typical_p = std::stof(argv[i]);
+           sparams.typical_p = std::stof(argv[i]);
        } else if (arg == "--repeat-last-n") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           params.repeat_last_n = std::stoi(argv[i]);
+           sparams.repeat_last_n = std::stoi(argv[i]);
        } else if (arg == "--repeat-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           params.repeat_penalty = std::stof(argv[i]);
+           sparams.repeat_penalty = std::stof(argv[i]);
        } else if (arg == "--frequency-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           params.frequency_penalty = std::stof(argv[i]);
+           sparams.frequency_penalty = std::stof(argv[i]);
        } else if (arg == "--presence-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           params.presence_penalty = std::stof(argv[i]);
+           sparams.presence_penalty = std::stof(argv[i]);
        } else if (arg == "--mirostat") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           params.mirostat = std::stoi(argv[i]);
+           sparams.mirostat = std::stoi(argv[i]);
        } else if (arg == "--mirostat-lr") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           params.mirostat_eta = std::stof(argv[i]);
+           sparams.mirostat_eta = std::stof(argv[i]);
        } else if (arg == "--mirostat-ent") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           params.mirostat_tau = std::stof(argv[i]);
+           sparams.mirostat_tau = std::stof(argv[i]);
        } else if (arg == "--cfg-negative-prompt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           params.cfg_negative_prompt = argv[i];
+           sparams.cfg_negative_prompt = argv[i];
        } else if (arg == "--cfg-negative-prompt-file") {
            if (++i >= argc) {
                invalid_param = true;
@@ -294,16 +295,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-           std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
-           if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
-               params.cfg_negative_prompt.pop_back();
+           std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
+           if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
+               sparams.cfg_negative_prompt.pop_back();
            }
        } else if (arg == "--cfg-scale") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           params.cfg_scale = std::stof(argv[i]);
+           sparams.cfg_scale = std::stof(argv[i]);
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
@@ -524,7 +525,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "--ignore-eos") {
            params.ignore_eos = true;
        } else if (arg == "--no-penalize-nl") {
-           params.penalize_nl = false;
+           sparams.penalize_nl = false;
        } else if (arg == "-l" || arg == "--logit-bias") {
            if (++i >= argc) {
                invalid_param = true;
@@ -536,7 +537,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            std::string value_str;
            try {
                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                   params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                   sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
                } else {
                    throw std::exception();
                }
@@ -639,6 +640,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    const llama_sampling_params & sparams = params.sampling_params;
+
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
     printf("options:\n");
@@ -671,19 +674,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
+    printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
+    printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
+    printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
+    printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
+    printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
+    printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
     printf(" --mirostat N use Mirostat sampling.\n");
     printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
+    printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
+    printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
     printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
     printf(" modifies the likelihood of token appearing in the completion,\n");
     printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
@@ -694,7 +697,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" negative prompt to use for guidance. (default: empty)\n");
     printf(" --cfg-negative-prompt-file FNAME\n");
     printf(" negative prompt file to use for guidance. (default: empty)\n");
-    printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
     printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
     printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
     printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
|
||||||
printf(" --no-penalize-nl do not penalize newline token\n");
|
printf(" --no-penalize-nl do not penalize newline token\n");
|
||||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||||
printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
|
printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
|
||||||
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
||||||
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||||
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
||||||
|
@ -854,7 +857,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.ignore_eos) {
|
if (params.ignore_eos) {
|
||||||
params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
|
params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -946,127 +949,6 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// Sampling utils
|
|
||||||
//
|
|
||||||
|
|
||||||
llama_token llama_sample_token(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
struct llama_context * ctx_guidance,
|
|
||||||
struct llama_grammar * grammar,
|
|
||||||
const struct gpt_params & params,
|
|
||||||
const std::vector<llama_token> & last_tokens,
|
|
||||||
std::vector<llama_token_data> & candidates,
|
|
||||||
int idx) {
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
|
||||||
|
|
||||||
const float temp = params.temp;
|
|
||||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
|
||||||
const float top_p = params.top_p;
|
|
||||||
const float tfs_z = params.tfs_z;
|
|
||||||
const float typical_p = params.typical_p;
|
|
||||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
|
||||||
const float repeat_penalty = params.repeat_penalty;
|
|
||||||
const float alpha_presence = params.presence_penalty;
|
|
||||||
const float alpha_frequency = params.frequency_penalty;
|
|
||||||
const int mirostat = params.mirostat;
|
|
||||||
const float mirostat_tau = params.mirostat_tau;
|
|
||||||
const float mirostat_eta = params.mirostat_eta;
|
|
||||||
const bool penalize_nl = params.penalize_nl;
|
|
||||||
|
|
||||||
llama_token id = 0;
|
|
||||||
|
|
||||||
float * logits = llama_get_logits_ith(ctx, idx);
|
|
||||||
|
|
||||||
// Apply params.logit_bias map
|
|
||||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
|
||||||
logits[it->first] += it->second;
|
|
||||||
}
|
|
||||||
|
|
||||||
candidates.clear();
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
|
|
||||||
|
|
||||||
if (ctx_guidance) {
|
|
||||||
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
|
|
||||||
}
|
|
||||||
|
|
||||||
// apply penalties
|
|
||||||
if (!last_tokens.empty()) {
|
|
||||||
const float nl_logit = logits[llama_token_nl(ctx)];
|
|
||||||
const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
|
|
||||||
|
|
||||||
llama_sample_repetition_penalty(ctx, &cur_p,
|
|
||||||
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
|
||||||
last_n_repeat, repeat_penalty);
|
|
||||||
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
|
|
||||||
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
|
||||||
last_n_repeat, alpha_frequency, alpha_presence);
|
|
||||||
|
|
||||||
if (!penalize_nl) {
|
|
||||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
|
||||||
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
|
|
||||||
cur_p.data[idx].logit = nl_logit;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (grammar != NULL) {
|
|
||||||
llama_sample_grammar(ctx, &cur_p, grammar);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (temp <= 0) {
|
|
||||||
// Greedy sampling
|
|
||||||
id = llama_sample_token_greedy(ctx, &cur_p);
|
|
||||||
} else {
|
|
||||||
if (mirostat == 1) {
|
|
||||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
|
||||||
const int mirostat_m = 100;
|
|
||||||
llama_sample_temp(ctx, &cur_p, temp);
|
|
||||||
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
|
||||||
} else if (mirostat == 2) {
|
|
||||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
|
||||||
llama_sample_temp(ctx, &cur_p, temp);
|
|
||||||
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
|
||||||
} else {
|
|
||||||
// Temperature sampling
|
|
||||||
size_t min_keep = std::max(1, params.n_probs);
|
|
||||||
llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
|
|
||||||
llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
|
|
||||||
llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
|
|
||||||
llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
|
|
||||||
llama_sample_temp(ctx, &cur_p, temp);
|
|
||||||
|
|
||||||
{
|
|
||||||
const int n_top = 10;
|
|
||||||
LOG("top %d candidates:\n", n_top);
|
|
||||||
|
|
||||||
for (int i = 0; i < n_top; i++) {
|
|
||||||
const llama_token id = cur_p.data[i].id;
|
|
||||||
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
id = llama_sample_token(ctx, &cur_p);
|
|
||||||
|
|
||||||
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// printf("`%d`", candidates_p.size);
|
|
||||||
|
|
||||||
if (grammar != NULL) {
|
|
||||||
llama_grammar_accept_token(ctx, grammar, id);
|
|
||||||
}
|
|
||||||
|
|
||||||
return id;
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// YAML utils
|
// YAML utils
|
||||||
//
|
//
|
||||||
|
@@ -1218,6 +1100,8 @@ std::string get_sortable_timestamp() {
 
 void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
                                const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+    const llama_sampling_params & sparams = params.sampling_params;
+
     fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
     fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
     fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||||
|
|
||||||
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
||||||
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
||||||
dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
|
dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
|
||||||
fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
|
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
|
||||||
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
||||||
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
||||||
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
||||||
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
||||||
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
||||||
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
|
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
|
||||||
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
|
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
|
||||||
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
||||||
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
||||||
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
||||||
|
|
||||||
const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
|
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
|
||||||
const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
||||||
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
||||||
|
|
||||||
dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
||||||
|
@ -1291,7 +1175,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
||||||
|
|
||||||
fprintf(stream, "logit_bias:\n");
|
fprintf(stream, "logit_bias:\n");
|
||||||
for (std::pair<llama_token, float> lb : params.logit_bias) {
|
for (std::pair<llama_token, float> lb : sparams.logit_bias) {
|
||||||
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -1315,30 +1199,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
||||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
||||||
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
||||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
|
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
||||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
|
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
||||||
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
|
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
||||||
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
||||||
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
|
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
|
||||||
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
||||||
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
||||||
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
||||||
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
||||||
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
|
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
|
||||||
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
||||||
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
|
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
|
||||||
fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
|
fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
|
||||||
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
|
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
|
||||||
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
||||||
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
||||||
fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
|
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
|
||||||
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
||||||
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
||||||
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
||||||
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
||||||
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
||||||
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
||||||
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
|
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
|
||||||
|
|
||||||
fprintf(stream, "reverse_prompt:\n");
|
fprintf(stream, "reverse_prompt:\n");
|
||||||
for (std::string ap : params.antiprompt) {
|
for (std::string ap : params.antiprompt) {
|
||||||
|
@ -1356,15 +1240,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
|
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
|
||||||
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
||||||
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
||||||
fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
|
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
||||||
|
|
||||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
||||||
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
||||||
|
|
||||||
fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
|
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
||||||
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
|
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
|
||||||
fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
|
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||||
fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
|
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||||
fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
|
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
|
||||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||||
}
|
}
|
||||||
|
|
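For orientation, here is a minimal sketch (not part of the commit) of the call-site pattern the common.cpp changes above apply: sampling options now live on gpt_params::sampling_params and are reached through a reference alias rather than sitting directly on gpt_params. The helper function name below is hypothetical; the field names and default values are the ones visible in this diff.

// Hypothetical helper illustrating the new access pattern (not part of this commit).
#include "common.h" // common.h now includes sampling.h (see the common/common.h hunk below)

static void apply_sampling_overrides(gpt_params & params) {
    // same alias style gpt_params_parse() uses above
    llama_sampling_params & sparams = params.sampling_params;

    sparams.top_k          = 40;    // was params.top_k before this change
    sparams.top_p          = 0.95f; // was params.top_p
    sparams.temp           = 0.80f; // was params.temp
    sparams.repeat_penalty = 1.10f; // was params.repeat_penalty

    // logit biases moved as well (was params.logit_bias);
    // 15043 is the example token id from the --logit-bias help text
    sparams.logit_bias[15043] += 1.0f;
}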
common/common.h

@@ -4,6 +4,8 @@
 
 #include "llama.h"
 
+#include "sampling.h"
+
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
@@ -49,31 +51,12 @@ struct gpt_params {
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
 
-    // sampling parameters
-    int32_t top_k = 40; // <= 0 to use vocab size
-    float top_p = 0.95f; // 1.0 = disabled
-    float tfs_z = 1.00f; // 1.0 = disabled
-    float typical_p = 1.00f; // 1.0 = disabled
-    float temp = 0.80f; // 1.0 = disabled
-    float repeat_penalty = 1.10f; // 1.0 = disabled
-    int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float frequency_penalty = 0.00f; // 0.0 = disabled
-    float presence_penalty = 0.00f; // 0.0 = disabled
-    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float mirostat_tau = 5.00f; // target entropy
-    float mirostat_eta = 0.10f; // learning rate
-
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-
-    // Classifier-Free Guidance
-    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt; // string to help guidance
-    float cfg_scale = 1.f; // How strong is guidance
+    // // sampling parameters
+    struct llama_sampling_params sampling_params;
 
     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_draft = ""; // draft model for speculative decoding
@ -115,7 +98,6 @@ struct gpt_params {
|
||||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||||
bool ignore_eos = false; // ignore generated EOS tokens
|
bool ignore_eos = false; // ignore generated EOS tokens
|
||||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
|
||||||
bool logits_all = false; // return logits for all tokens in the batch
|
bool logits_all = false; // return logits for all tokens in the batch
|
||||||
bool use_mmap = true; // use mmap for faster loads
|
bool use_mmap = true; // use mmap for faster loads
|
||||||
bool use_mlock = false; // use mlock to keep model in memory
|
bool use_mlock = false; // use mlock to keep model in memory
|
||||||
|
@ -184,36 +166,6 @@ std::string llama_detokenize_bpe(
|
||||||
llama_context * ctx,
|
llama_context * ctx,
|
||||||
const std::vector<llama_token> & tokens);
|
const std::vector<llama_token> & tokens);
|
||||||
|
|
||||||
//
|
|
||||||
// Sampling utils
|
|
||||||
//
|
|
||||||
|
|
||||||
// this is a common sampling function used across the examples for convenience
|
|
||||||
// it can serve as a starting point for implementing your own sampling function
|
|
||||||
//
|
|
||||||
// required:
|
|
||||||
// - ctx: context to use for sampling
|
|
||||||
// - params: sampling parameters
|
|
||||||
//
|
|
||||||
// optional:
|
|
||||||
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
|
|
||||||
// - grammar: grammar to use for sampling, ignore if NULL
|
|
||||||
// - last_tokens: needed for repetition penalty, ignore if empty
|
|
||||||
// - idx: sample from llama_get_logits_ith(ctx, idx)
|
|
||||||
//
|
|
||||||
// returns:
|
|
||||||
// - token: sampled token
|
|
||||||
// - candidates: vector of candidate tokens
|
|
||||||
//
|
|
||||||
llama_token llama_sample_token(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
struct llama_context * ctx_guidance,
|
|
||||||
struct llama_grammar * grammar,
|
|
||||||
const struct gpt_params & params,
|
|
||||||
const std::vector<llama_token> & last_tokens,
|
|
||||||
std::vector<llama_token_data> & candidates,
|
|
||||||
int idx = 0);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// YAML utils
|
// YAML utils
|
||||||
//
|
//
|
||||||
|
|
166
common/sampling.cpp
Normal file
166
common/sampling.cpp
Normal file
|
@ -0,0 +1,166 @@
|
||||||
|
#include "sampling.h"
|
||||||
|
|
||||||
|
llama_sampling_context::~llama_sampling_context() {
|
||||||
|
for (auto & it : sequence_contexts) {
|
||||||
|
if (it.second.grammar != NULL) {
|
||||||
|
llama_grammar_free(it.second.grammar);
|
||||||
|
it.second.grammar = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_sampling_context llama_sampling_context_init(
|
||||||
|
const struct gpt_params & params,
|
||||||
|
llama_grammar * grammar) {
|
||||||
|
llama_sampling_context result;
|
||||||
|
|
||||||
|
result.params = params.sampling_params;
|
||||||
|
result.grammar = grammar;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note: Creates the context if it doesn't exist, so this always return something.
|
||||||
|
llama_sampler_sequence_context & llama_sampling_get_sequence_context(
|
||||||
|
llama_sampling_context & ctx_sampling,
|
||||||
|
const llama_seq_id seq) {
|
||||||
|
const auto it = ctx_sampling.sequence_contexts.find(seq);
|
||||||
|
if (it != ctx_sampling.sequence_contexts.end()) {
|
||||||
|
return it->second;
|
||||||
|
}
|
||||||
|
llama_sampler_sequence_context new_ctx = {
|
||||||
|
2.0f * ctx_sampling.params.mirostat_tau,
|
||||||
|
ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL,
|
||||||
|
};
|
||||||
|
return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llama_sampling_context_reset(
|
||||||
|
llama_sampling_context & ctx_sampling,
|
||||||
|
const llama_seq_id seq) {
|
||||||
|
const auto it = ctx_sampling.sequence_contexts.find(seq);
|
||||||
|
if (it == ctx_sampling.sequence_contexts.end()) return false;
|
||||||
|
if (it->second.grammar != NULL) {
|
||||||
|
llama_grammar_free(it->second.grammar);
|
||||||
|
it->second.grammar = NULL;
|
||||||
|
}
|
||||||
|
ctx_sampling.sequence_contexts.erase(it);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token llama_sampling_sample(
|
||||||
|
struct llama_context * ctx,
|
||||||
|
struct llama_context * ctx_guidance,
|
||||||
|
struct llama_sampling_context & ctx_sampling,
|
||||||
|
const std::vector<llama_token> & last_tokens,
|
||||||
|
std::vector<llama_token_data> & candidates,
|
||||||
|
const int idx,
|
||||||
|
llama_seq_id seq) {
|
||||||
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
|
|
||||||
|
const llama_sampling_params & params = ctx_sampling.params;
|
||||||
|
const float temp = params.temp;
|
||||||
|
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||||
|
const float top_p = params.top_p;
|
||||||
|
const float tfs_z = params.tfs_z;
|
||||||
|
const float typical_p = params.typical_p;
|
||||||
|
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||||
|
const float repeat_penalty = params.repeat_penalty;
|
||||||
|
const float alpha_presence = params.presence_penalty;
|
||||||
|
const float alpha_frequency = params.frequency_penalty;
|
||||||
|
const int mirostat = params.mirostat;
|
||||||
|
const float mirostat_tau = params.mirostat_tau;
|
||||||
|
const float mirostat_eta = params.mirostat_eta;
|
||||||
|
const bool penalize_nl = params.penalize_nl;
|
||||||
|
|
||||||
|
llama_token id = 0;
|
||||||
|
|
||||||
|
float * logits = llama_get_logits_ith(ctx, idx);
|
||||||
|
|
||||||
|
// Apply params.logit_bias map
|
||||||
|
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||||
|
logits[it->first] += it->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
candidates.clear();
|
||||||
|
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||||
|
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
|
||||||
|
|
||||||
|
if (ctx_guidance) {
|
||||||
|
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
// apply penalties
|
||||||
|
if (!last_tokens.empty()) {
|
||||||
|
const float nl_logit = logits[llama_token_nl(ctx)];
|
||||||
|
const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
|
||||||
|
|
||||||
|
llama_sample_repetition_penalty(ctx, &cur_p,
|
||||||
|
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
||||||
|
last_n_repeat, repeat_penalty);
|
||||||
|
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
|
||||||
|
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
||||||
|
last_n_repeat, alpha_frequency, alpha_presence);
|
||||||
|
|
||||||
|
if (!penalize_nl) {
|
||||||
|
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||||
|
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
|
||||||
|
cur_p.data[idx].logit = nl_logit;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq);
|
||||||
|
|
||||||
|
if (ctx_seq.grammar != NULL) {
|
||||||
|
llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (temp <= 0) {
|
||||||
|
// Greedy sampling
|
||||||
|
id = llama_sample_token_greedy(ctx, &cur_p);
|
||||||
|
} else {
|
||||||
|
if (mirostat == 1) {
|
||||||
|
const int mirostat_m = 100;
|
||||||
|
llama_sample_temp(ctx, &cur_p, temp);
|
||||||
|
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu);
|
||||||
|
} else if (mirostat == 2) {
|
||||||
|
llama_sample_temp(ctx, &cur_p, temp);
|
||||||
|
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu);
|
||||||
|
} else {
|
||||||
|
// Temperature sampling
|
||||||
|
size_t min_keep = std::max(1, params.n_probs);
|
||||||
|
llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
|
||||||
|
llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
|
||||||
|
llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
|
||||||
|
llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
|
||||||
|
llama_sample_temp(ctx, &cur_p, temp);
|
||||||
|
|
||||||
|
{
|
||||||
|
const int n_top = 10;
|
||||||
|
LOG("top %d candidates:\n", n_top);
|
||||||
|
|
||||||
|
for (int i = 0; i < n_top; i++) {
|
||||||
|
const llama_token id = cur_p.data[i].id;
|
||||||
|
(void)id; // To avoid a warning that id is unused when logging is disabled.
|
||||||
|
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
id = llama_sample_token(ctx, &cur_p);
|
||||||
|
|
||||||
|
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctx_seq.grammar != NULL) {
|
||||||
|
llama_grammar_accept_token(ctx, ctx_seq.grammar, id);
|
||||||
|
}
|
||||||
|
|
||||||
|
return id;
|
||||||
|
}
|
108
common/sampling.h
Normal file
108
common/sampling.h
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
// sampling parameters
|
||||||
|
typedef struct llama_sampling_params {
|
||||||
|
int32_t top_k = 40; // <= 0 to use vocab size
|
||||||
|
float top_p = 0.95f; // 1.0 = disabled
|
||||||
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
|
float typical_p = 1.00f; // 1.0 = disabled
|
||||||
|
float temp = 0.80f; // 1.0 = disabled
|
||||||
|
float repeat_penalty = 1.10f; // 1.0 = disabled
|
||||||
|
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||||
|
float frequency_penalty = 0.00f; // 0.0 = disabled
|
||||||
|
float presence_penalty = 0.00f; // 0.0 = disabled
|
||||||
|
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||||
|
float mirostat_tau = 5.00f; // target entropy
|
||||||
|
float mirostat_eta = 0.10f; // learning rate
|
||||||
|
|
||||||
|
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||||
|
|
||||||
|
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||||
|
|
||||||
|
// Classifier-Free Guidance
|
||||||
|
// https://arxiv.org/abs/2306.17806
|
||||||
|
std::string cfg_negative_prompt; // string to help guidance
|
||||||
|
float cfg_scale = 1.f; // How strong is guidance
|
||||||
|
|
||||||
|
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||||
|
|
||||||
|
} llama_sampling_params;
|
||||||
|
|
||||||
|
// per-sequence sampler context
|
||||||
|
typedef struct llama_sampler_sequence_context {
|
||||||
|
float mirostat_mu; // mirostat sampler state
|
||||||
|
llama_grammar * grammar;
|
||||||
|
} llama_sampler_sequence_context;
|
||||||
|
|
||||||
|
// general sampler context
|
||||||
|
typedef struct llama_sampling_context {
|
||||||
|
~llama_sampling_context();
|
||||||
|
|
||||||
|
// parameters that will be used for sampling and when creating
|
||||||
|
// new llama_sampler_sequence_context instances
|
||||||
|
llama_sampling_params params;
|
||||||
|
|
||||||
|
// map of sequence ids to sampler contexts
|
||||||
|
std::unordered_map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts;
|
||||||
|
|
||||||
|
// when non-NULL, new instances of llama_sampler_sequence_context
|
||||||
|
// will get a copy of the grammar here
|
||||||
|
// note: only the pointer is stored here, it is not a copy of
|
||||||
|
// the grammar and shouldn't be freed
|
||||||
|
llama_grammar * grammar;
|
||||||
|
} llama_sampling_context;
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
// Create a new sampling context instance.
|
||||||
|
llama_sampling_context llama_sampling_context_init(
|
||||||
|
const struct gpt_params & params,
|
||||||
|
llama_grammar * grammar = NULL);
|
||||||
|
|
||||||
|
// Fetches the sampler context for the specified sequence id (defaults to 0).
|
||||||
|
// If the context for that sequence id doesn't already exist, it will be created with
|
||||||
|
// default values based on the parameters in the ctx_sampling argument.
|
||||||
|
llama_sampler_sequence_context & llama_sampling_get_sequence_context(
|
||||||
|
llama_sampling_context & ctx_sampling,
|
||||||
|
const llama_seq_id seq = 0);
|
||||||
|
|
||||||
|
// Reset the sampler context for the supplied sequence id (defaults to 0).
|
||||||
|
// This is necessary to reuse a sequence id or free memory used by sequences
|
||||||
|
// that are no longer required.
|
||||||
|
bool llama_sampling_context_reset(
|
||||||
|
llama_sampling_context & ctx_sampling,
|
||||||
|
const llama_seq_id seq = 0);
|
||||||
|
|
||||||
|
// this is a common sampling function used across the examples for convenience
|
||||||
|
// it can serve as a starting point for implementing your own sampling function
|
||||||
|
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||||
|
// llama_sampling_context_reset when a sequence ends
|
||||||
|
//
|
||||||
|
// required:
|
||||||
|
// - ctx: context to use for sampling
|
||||||
|
// - ctx_sampling: sampling-specific context
|
||||||
|
//
|
||||||
|
// optional:
|
||||||
|
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
|
||||||
|
// - last_tokens: needed for repetition penalty, ignore if empty
|
||||||
|
// - idx: sample from llama_get_logits_ith(ctx, idx)
|
||||||
|
// - seq: sequence id to associate sampler state with
|
||||||
|
//
|
||||||
|
// returns:
|
||||||
|
// - token: sampled token
|
||||||
|
// - candidates: vector of candidate tokens
|
||||||
|
//
|
||||||
|
llama_token llama_sampling_sample(
|
||||||
|
struct llama_context * ctx,
|
||||||
|
struct llama_context * ctx_guidance,
|
||||||
|
struct llama_sampling_context & ctx_sampling,
|
||||||
|
const std::vector<llama_token> & last_tokens,
|
||||||
|
std::vector<llama_token_data> & candidates,
|
||||||
|
const int idx = 0,
|
||||||
|
llama_seq_id seq = 0);
|
|
@ -25,6 +25,7 @@ else()
|
||||||
add_subdirectory(convert-llama2c-to-ggml)
|
add_subdirectory(convert-llama2c-to-ggml)
|
||||||
add_subdirectory(simple)
|
add_subdirectory(simple)
|
||||||
add_subdirectory(batched)
|
add_subdirectory(batched)
|
||||||
|
add_subdirectory(batched-bench)
|
||||||
add_subdirectory(speculative)
|
add_subdirectory(speculative)
|
||||||
add_subdirectory(parallel)
|
add_subdirectory(parallel)
|
||||||
add_subdirectory(embd-input)
|
add_subdirectory(embd-input)
|
||||||
|
|
5
examples/batched-bench/CMakeLists.txt
Normal file
5
examples/batched-bench/CMakeLists.txt
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
set(TARGET batched-bench)
|
||||||
|
add_executable(${TARGET} batched-bench.cpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
51
examples/batched-bench/README.md
Normal file
51
examples/batched-bench/README.md
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
# llama.cpp/example/batched-bench
|
||||||
|
|
||||||
|
Benchmark the batched decoding performance of `llama.cpp`
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
There are 2 modes of operation:
|
||||||
|
|
||||||
|
- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
|
||||||
|
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
|
||||||
|
|
||||||
|
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
||||||
|
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
|
||||||
|
|
||||||
|
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
||||||
|
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
|
||||||
|
|
||||||
|
# custom set of batches
|
||||||
|
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
|
||||||
|
```
|
||||||
|
|
||||||
|
## Sample results
|
||||||
|
|
||||||
|
- `PP` - prompt tokens per batch
|
||||||
|
- `TG` - generated tokens per batch
|
||||||
|
- `B` - number of batches
|
||||||
|
- `N_KV` - required KV cache size
|
||||||
|
- `T_PP` - prompt processing time (i.e. time to first token)
|
||||||
|
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
|
||||||
|
- `T_TG` - time to generate all batches
|
||||||
|
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
|
||||||
|
- `T` - total time
|
||||||
|
- `S` - total speed (i.e. all tokens / total time)
|
||||||
|
|
||||||
|
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||||
|
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||||
|
| 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 |
|
||||||
|
| 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 |
|
||||||
|
| 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 |
|
||||||
|
| 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 |
|
||||||
|
| 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 |
|
||||||
|
| 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 |
|
||||||
|
| 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 |
|
||||||
|
| 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 |
|
||||||
|
| 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 |
|
||||||
|
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
|
||||||
|
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
|
||||||
|
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
|
251
examples/batched-bench/batched-bench.cpp
Normal file
251
examples/batched-bench/batched-bench.cpp
Normal file
|
@ -0,0 +1,251 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// mutates the input string
|
||||||
|
static std::vector<int> parse_list(char * p) {
|
||||||
|
std::vector<int> ret;
|
||||||
|
|
||||||
|
char * q = p;
|
||||||
|
|
||||||
|
while (*p) {
|
||||||
|
if (*p == ',') {
|
||||||
|
*p = '\0';
|
||||||
|
ret.push_back(std::atoi(q));
|
||||||
|
q = p + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
++p;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.push_back(std::atoi(q));
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
gpt_params params;
|
||||||
|
|
||||||
|
if (argc == 1 || argv[1][0] == '-') {
|
||||||
|
printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
|
||||||
|
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
|
||||||
|
printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
|
||||||
|
return 1 ;
|
||||||
|
}
|
||||||
|
|
||||||
|
int n_kv_max = 2048;
|
||||||
|
int is_pp_shared = 0;
|
||||||
|
int n_gpu_layers = 0;
|
||||||
|
int mmq = 0;
|
||||||
|
|
||||||
|
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
|
||||||
|
std::vector<int> n_tg = { 128, 256, };
|
||||||
|
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
|
||||||
|
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
|
||||||
|
|
||||||
|
if (argc >= 2) {
|
||||||
|
params.model = argv[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 3) {
|
||||||
|
n_kv_max = std::atoi(argv[2]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 4) {
|
||||||
|
is_pp_shared = std::atoi(argv[3]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 5) {
|
||||||
|
n_gpu_layers = std::atoi(argv[4]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 6) {
|
||||||
|
mmq = std::atoi(argv[5]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 7) {
|
||||||
|
n_pp = parse_list(argv[6]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 8) {
|
||||||
|
n_tg = parse_list(argv[7]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 9) {
|
||||||
|
n_pl = parse_list(argv[8]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// init LLM
|
||||||
|
|
||||||
|
llama_backend_init(params.numa);
|
||||||
|
|
||||||
|
// initialize the model
|
||||||
|
|
||||||
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
|
||||||
|
model_params.n_gpu_layers = n_gpu_layers;
|
||||||
|
|
||||||
|
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||||
|
|
||||||
|
if (model == NULL) {
|
||||||
|
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
|
|
||||||
|
ctx_params.seed = 1234;
|
||||||
|
ctx_params.n_ctx = n_kv_max;
|
||||||
|
ctx_params.n_batch = 512;
|
||||||
|
ctx_params.mul_mat_q = mmq;
|
||||||
|
|
||||||
|
ctx_params.n_threads = params.n_threads;
|
||||||
|
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||||
|
|
||||||
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
|
if (ctx == NULL) {
|
||||||
|
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_batch batch = llama_batch_init(n_kv_max, 0);
|
||||||
|
|
||||||
|
// decode in batches of ctx_params.n_batch tokens
|
||||||
|
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
|
||||||
|
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||||
|
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||||
|
|
||||||
|
llama_batch batch_view = {
|
||||||
|
n_tokens,
|
||||||
|
batch.token + i,
|
||||||
|
nullptr,
|
||||||
|
batch.pos + i,
|
||||||
|
batch.seq_id + i,
|
||||||
|
batch.logits + i,
|
||||||
|
0, 0, 0, // unused
|
||||||
|
};
|
||||||
|
|
||||||
|
const int ret = llama_decode(ctx, batch_view);
|
||||||
|
if (ret != 0) {
|
||||||
|
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
// warm up
|
||||||
|
{
|
||||||
|
batch.n_tokens = 16;
|
||||||
|
|
||||||
|
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
batch.token[i] = 0;
|
||||||
|
batch.pos[i] = i;
|
||||||
|
batch.seq_id[i] = 0;
|
||||||
|
batch.logits[i] = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
|
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||||
|
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||||
|
|
||||||
|
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
|
||||||
|
for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
|
||||||
|
for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
|
||||||
|
const int pp = n_pp[i_pp];
|
||||||
|
const int tg = n_tg[i_tg];
|
||||||
|
const int pl = n_pl[i_pl];
|
||||||
|
|
||||||
|
const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
|
||||||
|
|
||||||
|
if (n_ctx_req > n_kv_max) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
batch.n_tokens = is_pp_shared ? pp : pl*pp;
|
||||||
|
|
||||||
|
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
batch.token[i] = 0;
|
||||||
|
batch.pos[i] = i;
|
||||||
|
batch.seq_id[i] = 0;
|
||||||
|
batch.logits[i] = false;
|
||||||
|
}
|
||||||
|
batch.logits[batch.n_tokens - 1] = true;
|
||||||
|
|
||||||
|
const auto t_pp_start = ggml_time_us();
|
||||||
|
|
||||||
|
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||||
|
|
||||||
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
|
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is_pp_shared) {
|
||||||
|
for (int32_t i = 1; i < pl; ++i) {
|
||||||
|
llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto t_pp_end = ggml_time_us();
|
||||||
|
|
||||||
|
const auto t_tg_start = ggml_time_us();
|
||||||
|
|
||||||
|
for (int i = 0; i < tg; ++i) {
|
||||||
|
batch.n_tokens = pl;
|
||||||
|
|
||||||
|
for (int j = 0; j < pl; ++j) {
|
||||||
|
batch.token[j] = 0;
|
||||||
|
batch.pos[j] = pp + i;
|
||||||
|
batch.seq_id[j] = j;
|
||||||
|
batch.logits[j] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
|
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto t_tg_end = ggml_time_us();
|
||||||
|
|
||||||
|
const int32_t n_kv = n_ctx_req;
|
||||||
|
|
||||||
|
const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
|
||||||
|
const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
|
||||||
|
const float t = t_pp + t_tg;
|
||||||
|
|
||||||
|
const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
|
||||||
|
const float speed_tg = pl*tg / t_tg;
|
||||||
|
const float speed = n_kv / t;
|
||||||
|
|
||||||
|
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_print_timings(ctx);
|
||||||
|
|
||||||
|
llama_batch_free(batch);
|
||||||
|
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
fprintf(stderr, "\n\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
9
examples/batched.swift/.gitignore
vendored
Normal file
9
examples/batched.swift/.gitignore
vendored
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
.DS_Store
|
||||||
|
/.build
|
||||||
|
/Packages
|
||||||
|
xcuserdata/
|
||||||
|
DerivedData/
|
||||||
|
.swiftpm/configuration/registries.json
|
||||||
|
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
|
||||||
|
.netrc
|
||||||
|
batched_swift
|
6
examples/batched.swift/Makefile
Executable file
6
examples/batched.swift/Makefile
Executable file
|
@ -0,0 +1,6 @@
|
||||||
|
.PHONY: build
|
||||||
|
|
||||||
|
build:
|
||||||
|
xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
|
||||||
|
rm -f ./batched_swift
|
||||||
|
ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
|
22
examples/batched.swift/Package.swift
Normal file
22
examples/batched.swift/Package.swift
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
// swift-tools-version: 5.5
|
||||||
|
// The swift-tools-version declares the minimum version of Swift required to build this package.
|
||||||
|
|
||||||
|
import PackageDescription
|
||||||
|
|
||||||
|
let package = Package(
|
||||||
|
name: "batched_swift",
|
||||||
|
platforms: [.macOS(.v12)],
|
||||||
|
dependencies: [
|
||||||
|
.package(name: "llama", path: "../../"),
|
||||||
|
],
|
||||||
|
targets: [
|
||||||
|
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
||||||
|
// Targets can depend on other targets in this package and products from dependencies.
|
||||||
|
.executableTarget(
|
||||||
|
name: "batched_swift",
|
||||||
|
dependencies: ["llama"],
|
||||||
|
path: "Sources",
|
||||||
|
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
4
examples/batched.swift/README.md
Normal file
4
examples/batched.swift/README.md
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
This is a swift clone of `examples/batched`.
|
||||||
|
|
||||||
|
$ `make`
|
||||||
|
$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
|
255
examples/batched.swift/Sources/main.swift
Normal file
255
examples/batched.swift/Sources/main.swift
Normal file
|
@ -0,0 +1,255 @@
|
||||||
|
import Foundation
|
||||||
|
import llama
|
||||||
|
|
||||||
|
let arguments = CommandLine.arguments
|
||||||
|
|
||||||
|
// Check that we have at least one argument (the model path)
|
||||||
|
guard arguments.count > 1 else {
|
||||||
|
print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
let modelPath: String = arguments[1]
|
||||||
|
let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
|
||||||
|
let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1
|
||||||
|
|
||||||
|
// total length of the sequences including the prompt
|
||||||
|
let n_len: Int = 32
|
||||||
|
|
||||||
|
// init LLM
|
||||||
|
llama_backend_init(false)
|
||||||
|
defer {
|
||||||
|
llama_backend_free()
|
||||||
|
}
|
||||||
|
|
||||||
|
let model_params = llama_model_default_params()
|
||||||
|
guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
|
||||||
|
print("Failed to load model")
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
defer {
|
||||||
|
llama_free_model(model)
|
||||||
|
}
|
||||||
|
|
||||||
|
var tokens = tokenize(text: prompt, add_bos: true)
|
||||||
|
|
||||||
|
let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
|
||||||
|
|
||||||
|
var context_params = llama_context_default_params()
|
||||||
|
context_params.seed = 1234
|
||||||
|
context_params.n_ctx = n_kv_req
|
||||||
|
context_params.n_batch = UInt32(max(n_len, n_parallel))
|
||||||
|
context_params.n_threads = 8
|
||||||
|
context_params.n_threads_batch = 8
|
||||||
|
|
||||||
|
let context = llama_new_context_with_model(model, context_params)
|
||||||
|
guard context != nil else {
|
||||||
|
print("Failed to initialize context")
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
defer {
|
||||||
|
llama_free(context)
|
||||||
|
}
|
||||||
|
|
||||||
|
let n_ctx = llama_n_ctx(context)
|
||||||
|
|
||||||
|
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
|
||||||
|
|
||||||
|
if n_kv_req > n_ctx {
|
||||||
|
print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
var buffer: [CChar] = []
|
||||||
|
for id: llama_token in tokens {
|
||||||
|
print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
|
||||||
|
}
|
||||||
|
|
||||||
|
print("\n")
|
||||||
|
|
||||||
|
var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0)
|
||||||
|
defer {
|
||||||
|
llama_batch_free(batch)
|
||||||
|
}
|
||||||
|
|
||||||
|
// evaluate the initial prompt
|
||||||
|
batch.n_tokens = Int32(tokens.count)
|
||||||
|
|
||||||
|
for (i, token) in tokens.enumerated() {
|
||||||
|
batch.token[i] = token
|
||||||
|
batch.pos[i] = Int32(i)
|
||||||
|
batch.seq_id[i] = 0
|
||||||
|
batch.logits[i] = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// llama_decode will output logits only for the last token of the prompt
|
||||||
|
batch.logits[Int(batch.n_tokens) - 1] = 1
|
||||||
|
|
||||||
|
if llama_decode(context, batch) != 0 {
|
||||||
|
print("llama_decode() failed")
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in 1 ..< n_parallel {
|
||||||
|
llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
|
||||||
|
}
|
||||||
|
|
||||||
|
if n_parallel > 1 {
|
||||||
|
print("generating \(n_parallel) sequences ...\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
var streams: [String] = .init(repeating: "", count: n_parallel)
|
||||||
|
var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
|
||||||
|
var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
|
||||||
|
|
||||||
|
var n_cur = batch.n_tokens
|
||||||
|
var n_decode = 0
|
||||||
|
|
||||||
|
let t_main_start = ggml_time_us()
|
||||||
|
|
||||||
|
while n_cur <= n_len {
|
||||||
|
// prepare the next batch
|
||||||
|
batch.n_tokens = 0
|
||||||
|
|
||||||
|
// sample the next token for each parallel sequence / stream
|
||||||
|
for i in 0 ..< n_parallel {
|
||||||
|
if i_batch[i] < 0 {
|
||||||
|
// the stream has already finished
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
var n_vocab = llama_n_vocab(model)
|
||||||
|
var logits = llama_get_logits_ith(context, i_batch[i])
|
||||||
|
|
||||||
|
var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
|
||||||
|
|
||||||
|
for token_id in 0 ..< n_vocab {
|
||||||
|
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
|
||||||
|
}
|
||||||
|
|
||||||
|
var candidates_p: llama_token_data_array = .init(
|
||||||
|
data: &candidates,
|
||||||
|
size: candidates.count,
|
||||||
|
sorted: false
|
||||||
|
)
|
||||||
|
|
||||||
|
let top_k: Int32 = 40
|
||||||
|
let top_p: Float = 0.9
|
||||||
|
let temp: Float = 0.4
|
||||||
|
|
||||||
|
llama_sample_top_k(context, &candidates_p, top_k, 1)
|
||||||
|
llama_sample_top_p(context, &candidates_p, top_p, 1)
|
||||||
|
llama_sample_temp(context, &candidates_p, temp)
|
||||||
|
|
||||||
|
let new_token_id = llama_sample_token(context, &candidates_p)
|
||||||
|
|
||||||
|
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||||
|
|
||||||
|
// is it an end of stream? -> mark the stream as finished
|
||||||
|
if new_token_id == llama_token_eos(context) || n_cur == n_len {
|
||||||
|
i_batch[i] = -1
|
||||||
|
// print("")
|
||||||
|
if n_parallel > 1 {
|
||||||
|
print("stream \(i) finished at n_cur = \(n_cur)")
|
||||||
|
}
|
||||||
|
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
|
||||||
|
|
||||||
|
// if there is only one stream, we print immediately to stdout
|
||||||
|
if n_parallel == 1 {
|
||||||
|
print(nextStringPiece, terminator: "")
|
||||||
|
}
|
||||||
|
streams[i] += nextStringPiece
|
||||||
|
|
||||||
|
// push this new token for next evaluation
|
||||||
|
batch.token[Int(batch.n_tokens)] = new_token_id
|
||||||
|
batch.pos[Int(batch.n_tokens)] = n_cur
|
||||||
|
batch.seq_id[Int(batch.n_tokens)] = Int32(i)
|
||||||
|
batch.logits[Int(batch.n_tokens)] = 1
|
||||||
|
|
||||||
|
i_batch[i] = batch.n_tokens
|
||||||
|
|
||||||
|
batch.n_tokens += 1
|
||||||
|
|
||||||
|
n_decode += 1
|
||||||
|
}
|
||||||
|
|
||||||
|
// all streams are finished
|
||||||
|
if batch.n_tokens == 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
n_cur += 1
|
||||||
|
|
||||||
|
// evaluate the current batch with the transformer model
|
||||||
|
if llama_decode(context, batch) != 0 {
|
||||||
|
print("llama_decode() failed")
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if n_parallel > 1 {
|
||||||
|
print("\n")
|
||||||
|
for (i, stream) in streams.enumerated() {
|
||||||
|
print("sequence \(i):\n\n\(prompt)\(stream)\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let t_main_end = ggml_time_us()
|
||||||
|
|
||||||
|
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
|
||||||
|
|
||||||
|
llama_print_timings(context)
|
||||||
|
|
||||||
|
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||||
|
let n_tokens = text.count + (add_bos ? 1 : 0)
|
||||||
|
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
|
||||||
|
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
|
||||||
|
var swiftTokens: [llama_token] = []
|
||||||
|
for i in 0 ..< tokenCount {
|
||||||
|
swiftTokens.append(tokens[Int(i)])
|
||||||
|
}
|
||||||
|
tokens.deallocate()
|
||||||
|
return swiftTokens
|
||||||
|
}
|
||||||
|
|
||||||
|
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
|
||||||
|
var result = [CChar](repeating: 0, count: 8)
|
||||||
|
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
|
||||||
|
if nTokens < 0 {
|
||||||
|
if result.count >= -Int(nTokens) {
|
||||||
|
result.removeLast(-Int(nTokens))
|
||||||
|
} else {
|
||||||
|
result.removeAll()
|
||||||
|
}
|
||||||
|
let check = llama_token_to_piece(
|
||||||
|
model,
|
||||||
|
token,
|
||||||
|
&result,
|
||||||
|
Int32(result.count)
|
||||||
|
)
|
||||||
|
assert(check == nTokens)
|
||||||
|
} else {
|
||||||
|
result.removeLast(result.count - Int(nTokens))
|
||||||
|
}
|
||||||
|
if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
|
||||||
|
return utfString
|
||||||
|
} else {
|
||||||
|
buffer.append(contentsOf: result)
|
||||||
|
let data = Data(buffer.map { UInt8(bitPattern: $0) })
|
||||||
|
if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
|
||||||
|
buffer = []
|
||||||
|
}
|
||||||
|
guard let bufferString = String(data: data, encoding: .utf8) else {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
buffer = []
|
||||||
|
return bufferString
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
|
||||||
ctx_params.seed = 1234;
|
ctx_params.seed = 1234;
|
||||||
ctx_params.n_ctx = n_kv_req;
|
ctx_params.n_ctx = n_kv_req;
|
||||||
ctx_params.n_batch = std::max(n_len, n_parallel);
|
ctx_params.n_batch = std::max(n_len, n_parallel);
|
||||||
ctx_params.n_threads = params.n_threads;
|
ctx_params.n_threads = params.n_threads;
|
||||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||||
|
|
||||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
|
@ -128,21 +128,22 @@ bool eval_string(struct MyModel * mymodel,const char* str){
|
||||||
llama_token sampling_id(struct MyModel* mymodel) {
|
llama_token sampling_id(struct MyModel* mymodel) {
|
||||||
llama_context* ctx = mymodel->ctx;
|
llama_context* ctx = mymodel->ctx;
|
||||||
gpt_params params = mymodel->params;
|
gpt_params params = mymodel->params;
|
||||||
|
llama_sampling_params & sparams = params.sampling_params;
|
||||||
// int n_ctx = llama_n_ctx(ctx);
|
// int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
// out of user input, sample next token
|
// out of user input, sample next token
|
||||||
const float temp = params.temp;
|
const float temp = sparams.temp;
|
||||||
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
|
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
|
||||||
const float top_p = params.top_p;
|
const float top_p = sparams.top_p;
|
||||||
const float tfs_z = params.tfs_z;
|
const float tfs_z = sparams.tfs_z;
|
||||||
const float typical_p = params.typical_p;
|
const float typical_p = sparams.typical_p;
|
||||||
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||||
// const float repeat_penalty = params.repeat_penalty;
|
// const float repeat_penalty = params.repeat_penalty;
|
||||||
// const float alpha_presence = params.presence_penalty;
|
// const float alpha_presence = params.presence_penalty;
|
||||||
// const float alpha_frequency = params.frequency_penalty;
|
// const float alpha_frequency = params.frequency_penalty;
|
||||||
const int mirostat = params.mirostat;
|
const int mirostat = sparams.mirostat;
|
||||||
const float mirostat_tau = params.mirostat_tau;
|
const float mirostat_tau = sparams.mirostat_tau;
|
||||||
const float mirostat_eta = params.mirostat_eta;
|
const float mirostat_eta = sparams.mirostat_eta;
|
||||||
// const bool penalize_nl = params.penalize_nl;
|
// const bool penalize_nl = params.penalize_nl;
|
||||||
|
|
||||||
llama_token id = 0;
|
llama_token id = 0;
|
||||||
|
@ -151,7 +152,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
|
||||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
|
|
||||||
// Apply params.logit_bias map
|
// Apply params.logit_bias map
|
||||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
|
||||||
logits[it->first] += it->second;
|
logits[it->first] += it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -104,6 +104,7 @@ static void sigint_handler(int signo) {
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
llama_sampling_params & sparams = params.sampling_params;
|
||||||
g_params = ¶ms;
|
g_params = ¶ms;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
@ -206,7 +207,7 @@ int main(int argc, char ** argv) {
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
if (params.cfg_scale > 1.f) {
|
if (sparams.cfg_scale > 1.f) {
|
||||||
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
||||||
ctx_guidance = llama_new_context_with_model(model, lparams);
|
ctx_guidance = llama_new_context_with_model(model, lparams);
|
||||||
}
|
}
|
||||||
|
@ -269,9 +270,9 @@ int main(int argc, char ** argv) {
|
||||||
int guidance_offset = 0;
|
int guidance_offset = 0;
|
||||||
int original_prompt_len = 0;
|
int original_prompt_len = 0;
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
|
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||||
|
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
|
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
|
@ -312,7 +313,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
||||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||||
|
@ -358,7 +359,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||||
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
|
||||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||||
LOG_TEE("\n\n");
|
LOG_TEE("\n\n");
|
||||||
|
|
||||||
|
@ -376,8 +377,8 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
|
|
||||||
{
|
{
|
||||||
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
auto it = sparams.logit_bias.find(llama_token_eos(ctx));
|
||||||
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
|
||||||
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -434,6 +435,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const int n_vocab = llama_n_vocab(model);
|
const int n_vocab = llama_n_vocab(model);
|
||||||
|
|
||||||
|
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
|
||||||
std::vector<llama_token_data> candidates;
|
std::vector<llama_token_data> candidates;
|
||||||
candidates.reserve(n_vocab);
|
candidates.reserve(n_vocab);
|
||||||
|
|
||||||
|
@ -552,7 +554,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||||
|
|
||||||
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
|
const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
|
||||||
|
|
||||||
last_tokens.erase(last_tokens.begin());
|
last_tokens.erase(last_tokens.begin());
|
||||||
last_tokens.push_back(id);
|
last_tokens.push_back(id);
|
||||||
|
|
|
@ -55,19 +55,19 @@ bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch,
|
||||||
|
|
||||||
llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
||||||
// out of user input, sample next token
|
// out of user input, sample next token
|
||||||
const float temp = params.temp;
|
const float temp = params.sampling_params.temp;
|
||||||
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.top_k;
|
const int32_t top_k = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
|
||||||
const float top_p = params.top_p;
|
const float top_p = params.sampling_params.top_p;
|
||||||
const float tfs_z = params.tfs_z;
|
const float tfs_z = params.sampling_params.tfs_z;
|
||||||
const float typical_p = params.typical_p;
|
const float typical_p = params.sampling_params.typical_p;
|
||||||
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
// const int32_t repeat_last_n = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n;
|
||||||
// const float repeat_penalty = params.repeat_penalty;
|
// const float repeat_penalty = params.sampling_params.repeat_penalty;
|
||||||
// const float alpha_presence = params.presence_penalty;
|
// const float alpha_presence = params.sampling_params.presence_penalty;
|
||||||
// const float alpha_frequency = params.frequency_penalty;
|
// const float alpha_frequency = params.sampling_params.frequency_penalty;
|
||||||
const int mirostat = params.mirostat;
|
const int mirostat = params.sampling_params.mirostat;
|
||||||
const float mirostat_tau = params.mirostat_tau;
|
const float mirostat_tau = params.sampling_params.mirostat_tau;
|
||||||
const float mirostat_eta = params.mirostat_eta;
|
const float mirostat_eta = params.sampling_params.mirostat_eta;
|
||||||
// const bool penalize_nl = params.penalize_nl;
|
// const bool penalize_nl = params.sampling_params.penalize_nl;
|
||||||
|
|
||||||
llama_token id = 0;
|
llama_token id = 0;
|
||||||
{
|
{
|
||||||
|
@ -75,7 +75,7 @@ llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
||||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
|
auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
|
||||||
|
|
||||||
// Apply params.logit_bias map
|
// Apply params.logit_bias map
|
||||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) {
|
||||||
logits[it->first] += it->second;
|
logits[it->first] += it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -109,6 +109,7 @@ int main(int argc, char ** argv) {
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
         return 1;
     }
 
+    llama_sampling_params & sparams = params.sampling_params;
 
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("main", "log"));

@@ -179,7 +180,7 @@ int main(int argc, char ** argv) {
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (params.cfg_scale > 1.f) {
+    if (sparams.cfg_scale > 1.f) {
         struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
         ctx_guidance = llama_new_context_with_model(model, lparams);
     }

@@ -257,9 +258,9 @@ int main(int argc, char ** argv) {
     int guidance_offset = 0;
     int original_prompt_len = 0;
     if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
+        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
 
-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
 
         std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);

@@ -343,7 +344,7 @@ int main(int argc, char ** argv) {
 
     if (ctx_guidance) {
         LOG_TEE("\n");
-        LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+        LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
         LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
         for (int i = 0; i < (int) guidance_inp.size(); i++) {
             LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());

@@ -395,7 +396,7 @@ int main(int argc, char ** argv) {
         }
     }
     LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
-        params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
+        sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     LOG_TEE("\n\n");
 

@@ -413,8 +414,8 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
 
         {
-            auto it = params.logit_bias.find(llama_token_eos(ctx));
-            if (it != params.logit_bias.end() && it->second == -INFINITY) {
+            auto it = sparams.logit_bias.find(llama_token_eos(ctx));
+            if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
                 LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
             }
         }

@@ -469,6 +470,7 @@ int main(int argc, char ** argv) {
 
     const int n_vocab = llama_n_vocab(model);
 
+    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
     std::vector<llama_token_data> candidates;
     candidates.reserve(n_vocab);
 

@@ -625,7 +627,7 @@ int main(int argc, char ** argv) {
                 LOG("saved session to %s\n", path_session.c_str());
             }
 
-            const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
+            const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
 
             last_tokens.erase(last_tokens.begin());
             last_tokens.push_back(id);
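The main example now routes everything through a reusable sampling context instead of passing the grammar and raw parameters on every call. A minimal sketch of the resulting flow, using only the names that appear in the hunks above (not a verbatim excerpt of the example):

    // sampling options now live in params.sampling_params
    llama_sampling_params & sparams = params.sampling_params;
    // the grammar and sampler state are bundled into one context object
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);

    std::vector<llama_token_data> candidates;
    candidates.reserve(llama_n_vocab(model));

    // previously: llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates)
    const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
    last_tokens.erase(last_tokens.begin());
    last_tokens.push_back(id);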
@@ -125,6 +125,8 @@ int main(int argc, char ** argv) {
     params.logits_all = true;
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
+    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
+
     // load the prompts from an external file if there are any
     if (params.prompt.empty()) {
         printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");

@@ -339,7 +341,7 @@ int main(int argc, char ** argv) {
             //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
             //    client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
 
-            const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
+            const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);
 
             if (client.n_decoded == 1) {
                 // start measuring generation time after the first token to make sure all concurrent clients

@@ -384,7 +386,7 @@ int main(int argc, char ** argv) {
 
                 n_total_prompt += client.n_prompt;
                 n_total_gen += client.n_decoded;
-
+                llama_sampling_context_reset(ctx_sampling, client.seq_id);
                 client.seq_id = -1;
             }
 
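In the parallel example the shared sampling context is keyed by sequence id: the sampling call now receives the client's seq_id, and the per-sequence state is cleared when a client finishes. A rough sketch assuming the signatures shown above:

    // one context for all clients; per-sequence state is selected by seq_id
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);

    // sample for this client's slot in the batch
    const llama_token id = llama_sampling_sample(
        ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);

    // when the client finishes, clear its per-sequence sampling state before the slot is reused
    llama_sampling_context_reset(ctx_sampling, client.seq_id);
    client.seq_id = -1;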
@@ -8,9 +8,10 @@
 
 int main(int argc, char ** argv) {
     gpt_params params;
+    llama_sampling_params & sparams = params.sampling_params;
     params.seed = 42;
     params.n_threads = 4;
-    params.repeat_last_n = 64;
+    sparams.repeat_last_n = 64;
     params.prompt = "The quick brown fox";
 
     if (!gpt_params_parse(argc, argv, params)) {

@@ -24,7 +25,7 @@ int main(int argc, char ** argv) {
     }
 
     auto n_past = 0;
-    auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
+    auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
 
     // init
     llama_model * model;
@@ -200,6 +200,7 @@ struct llama_server_context
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
     gpt_params params;
+    llama_sampling_context ctx_sampling;
     int n_ctx;
 
     grammar_parser::parse_state parsed_grammar;

@@ -254,6 +255,7 @@ struct llama_server_context
         if (grammar != nullptr) {
             llama_grammar_free(grammar);
             grammar = nullptr;
+            ctx_sampling = llama_sampling_context_init(params, NULL);
         }
     }
 

@@ -329,8 +331,8 @@ struct llama_server_context
             grammar_parser::print_grammar(stderr, parsed_grammar);
 
             {
-                auto it = params.logit_bias.find(llama_token_eos(ctx));
-                if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx));
+                if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) {
                     LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
                 }
             }

@@ -339,6 +341,7 @@ struct llama_server_context
             grammar = llama_grammar_init(
                 grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
         }
+        ctx_sampling = llama_sampling_context_init(params, grammar);
         return true;
     }
 

@@ -550,12 +553,12 @@ struct llama_server_context
         std::vector<llama_token_data> candidates;
         candidates.reserve(llama_n_vocab(model));
 
-        result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
+        result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates);
 
         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
-        const int32_t n_probs = params.n_probs;
-        if (params.temp <= 0 && n_probs > 0)
+        const int32_t n_probs = params.sampling_params.n_probs;
+        if (params.sampling_params.temp <= 0 && n_probs > 0)
         {
             // For llama_sample_token_greedy we need to sort candidates
             llama_sample_softmax(ctx, &candidates_p);

@@ -630,7 +633,7 @@ struct llama_server_context
         const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
         generated_text += token_text;
 
-        if (params.n_probs > 0)
+        if (params.sampling_params.n_probs > 0)
         {
             generated_token_probs.push_back(token_with_probs);
         }

@@ -711,15 +714,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("usage: %s [options]\n", argv0);
     printf("\n");
     printf("options:\n");
     printf("  -h, --help                show this help message and exit\n");
     printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
     printf("  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
     printf("  --rope-freq-base N        RoPE base frequency (default: loaded from model)\n");
     printf("  --rope-freq-scale N       RoPE frequency scaling factor (default: loaded from model)\n");
     printf("  -b N, --batch-size N      batch size for prompt processing (default: %d)\n", params.n_batch);
     printf("  --memory-f32              use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                            not recommended: doubles context memory required and no measurable increase in quality\n");
     if (llama_mlock_supported())
     {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");

@@ -864,6 +868,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_threads = std::stoi(argv[i]);
        }
+        else if (arg == "--threads-batch" || arg == "-tb")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads_batch = std::stoi(argv[i]);
+        }
        else if (arg == "-b" || arg == "--batch-size")
        {
            if (++i >= argc)

@@ -1018,34 +1031,35 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 
 static json format_generation_settings(llama_server_context &llama)
 {
-    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
-    const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
+    const auto & sparams = llama.params.sampling_params;
+    const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
+    const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
                             eos_bias->second < 0.0f && std::isinf(eos_bias->second);
 
     return json{
         {"n_ctx", llama.n_ctx},
         {"model", llama.params.model_alias},
         {"seed", llama.params.seed},
-        {"temp", llama.params.temp},
-        {"top_k", llama.params.top_k},
-        {"top_p", llama.params.top_p},
-        {"tfs_z", llama.params.tfs_z},
-        {"typical_p", llama.params.typical_p},
-        {"repeat_last_n", llama.params.repeat_last_n},
-        {"repeat_penalty", llama.params.repeat_penalty},
-        {"presence_penalty", llama.params.presence_penalty},
-        {"frequency_penalty", llama.params.frequency_penalty},
-        {"mirostat", llama.params.mirostat},
-        {"mirostat_tau", llama.params.mirostat_tau},
-        {"mirostat_eta", llama.params.mirostat_eta},
-        {"penalize_nl", llama.params.penalize_nl},
+        {"temp", sparams.temp},
+        {"top_k", sparams.top_k},
+        {"top_p", sparams.top_p},
+        {"tfs_z", sparams.tfs_z},
+        {"typical_p", sparams.typical_p},
+        {"repeat_last_n", sparams.repeat_last_n},
+        {"repeat_penalty", sparams.repeat_penalty},
+        {"presence_penalty", sparams.presence_penalty},
+        {"frequency_penalty", sparams.frequency_penalty},
+        {"mirostat", sparams.mirostat},
+        {"mirostat_tau", sparams.mirostat_tau},
+        {"mirostat_eta", sparams.mirostat_eta},
+        {"penalize_nl", sparams.penalize_nl},
         {"stop", llama.params.antiprompt},
         {"n_predict", llama.params.n_predict},
         {"n_keep", llama.params.n_keep},
         {"ignore_eos", ignore_eos},
         {"stream", llama.stream},
-        {"logit_bias", llama.params.logit_bias},
-        {"n_probs", llama.params.n_probs},
+        {"logit_bias", sparams.logit_bias},
+        {"n_probs", sparams.n_probs},
         {"grammar", llama.params.grammar},
     };
 }

@@ -1094,7 +1108,7 @@ static json format_final_response(llama_server_context &llama, const std::string
         {"timings", format_timings(llama)},
     };
 
-    if (llama.params.n_probs > 0)
+    if (llama.params.sampling_params.n_probs > 0)
     {
         res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
     }

@@ -1110,7 +1124,7 @@ static json format_partial_response(
         {"stop", false},
     };
 
-    if (llama.params.n_probs > 0)
+    if (llama.params.sampling_params.n_probs > 0)
     {
         res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
     }

@@ -1142,26 +1156,28 @@ static T json_value(const json &body, const std::string &key, const T &default_v
 static void parse_options_completion(const json &body, llama_server_context &llama)
 {
     gpt_params default_params;
+    const auto & default_sparams = default_params.sampling_params;
+    auto & sparams = llama.params.sampling_params;
 
     llama.stream = json_value(body, "stream", false);
     llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
-    llama.params.top_k = json_value(body, "top_k", default_params.top_k);
-    llama.params.top_p = json_value(body, "top_p", default_params.top_p);
-    llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
-    llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
-    llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
-    llama.params.temp = json_value(body, "temperature", default_params.temp);
-    llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
-    llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
-    llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
-    llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
-    llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
-    llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
-    llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
+    sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
+    sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
+    sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
+    sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p);
+    sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n);
+    sparams.temp = json_value(body, "temperature", default_sparams.temp);
+    sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty);
+    sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty);
+    sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty);
+    sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat);
+    sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
+    sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
+    sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
     llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
     llama.params.seed = json_value(body, "seed", default_params.seed);
     llama.params.grammar = json_value(body, "grammar", default_params.grammar);
-    llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
+    sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);
 
     if (body.count("prompt") != 0)
     {

@@ -1172,10 +1188,10 @@ static void parse_options_completion(const json &body, llama_server_context &lla
         llama.prompt = "";
     }
 
-    llama.params.logit_bias.clear();
+    sparams.logit_bias.clear();
     if (json_value(body, "ignore_eos", false))
     {
-        llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
+        sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
     }
 
     const auto &logit_bias = body.find("logit_bias");

@@ -1191,11 +1207,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla
             {
                 if (el[1].is_number())
                 {
-                    llama.params.logit_bias[tok] = el[1].get<float>();
+                    sparams.logit_bias[tok] = el[1].get<float>();
                 }
                 else if (el[1].is_boolean() && !el[1].get<bool>())
                 {
-                    llama.params.logit_bias[tok] = -INFINITY;
+                    sparams.logit_bias[tok] = -INFINITY;
                 }
             }
         }

@@ -1215,6 +1231,8 @@ static void parse_options_completion(const json &body, llama_server_context &lla
         }
     }
 
+    llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar);
+
     LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
 }
 

@@ -1423,7 +1441,7 @@ int main(int argc, char **argv)
             }
 
             auto probs = llama.generated_token_probs;
-            if (llama.params.n_probs > 0 && llama.stopped_word) {
+            if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
                 const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
                 probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
             }

@@ -1475,7 +1493,7 @@ int main(int argc, char **argv)
 
                 std::vector<completion_token_output> probs_output = {};
 
-                if (llama.params.n_probs > 0) {
+                if (llama.params.sampling_params.n_probs > 0) {
                     const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                     size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                     size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());

@@ -1596,7 +1614,7 @@ int main(int argc, char **argv)
 
                 std::vector<completion_token_output> probs_output = {};
 
-                if (llama.params.n_probs > 0) {
+                if (llama.params.sampling_params.n_probs > 0) {
                     const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                     size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                     size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
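In the server, llama_server_context gains a ctx_sampling member that is rebuilt whenever the grammar or the request options change, and the per-request JSON fields now land in params.sampling_params. A hedged sketch of the request-handling sequence implied by these hunks:

    // parse_options_completion(): request fields write into the sampling params
    auto & sparams = llama.params.sampling_params;
    sparams.temp  = json_value(body, "temperature", default_sparams.temp);
    sparams.top_k = json_value(body, "top_k", default_sparams.top_k);

    // then the sampling context is re-initialised for the new settings and grammar
    llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar);

    // and per-token sampling goes through that context instead of raw params
    result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates);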
@@ -125,6 +125,8 @@ int main(int argc, char ** argv) {
         grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
     }
 
+    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt);
+
     const auto t_dec_start = ggml_time_us();
 
     while (true) {

@@ -134,7 +136,7 @@ int main(int argc, char ** argv) {
 
         while (true) {
             // sample from the target model
-            llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
+            llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft);
 
             // remember which tokens were sampled - used for repetition penalties during sampling
             last_tokens.erase(last_tokens.begin());

@@ -211,7 +213,13 @@ int main(int argc, char ** argv) {
             if (grammar_dft) {
                 llama_grammar_free(grammar_dft);
             }
-            grammar_dft = llama_grammar_copy(grammar_tgt);
+            // Note: Hardcoded to sequence id 0, if this ever supports parallel generation
+            // that will need to change.
+            auto it = ctx_sampling.sequence_contexts.find(0);
+            GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
+            // This is necessary because each sequence id in sequence_contexts
+            // uses a copy of the original grammar.
+            grammar_dft = llama_grammar_copy(it->second.grammar);
 
             LOG("copied target grammar to draft grammar\n");
         }
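For speculative decoding, the draft grammar is no longer copied straight from grammar_tgt; it is cloned from the target's per-sequence sampling state (hardcoded to sequence 0, as the new comment notes). A sketch of the refresh step, restating the hunk above:

    if (grammar_dft) {
        llama_grammar_free(grammar_dft);               // drop the previous draft copy
    }
    auto it = ctx_sampling.sequence_contexts.find(0);  // per-sequence state, sequence id 0 only
    GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
    grammar_dft = llama_grammar_copy(it->second.grammar);  // each sequence keeps its own grammar copy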