diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 815db6a2d..c7fa2203a 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -1,5 +1,6 @@
{
lib,
+ glibc,
config,
stdenv,
mkShell,
@@ -30,6 +31,11 @@
useRocm ? config.rocmSupport,
useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
+
+ # It's necessary to consistently use backendStdenv when building with CUDA support,
+ # otherwise we get libstdc++ errors downstream.
+ effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
+ enableStatic ? effectiveStdenv.hostPlatform.isStatic
}@inputs:
let
@@ -41,10 +47,7 @@ let
versionOlder
;
- # It's necessary to consistently use backendStdenv when building with CUDA support,
- # otherwise we get libstdc++ errors downstream.
stdenv = throw "Use effectiveStdenv instead";
- effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
suffices =
lib.optionals useBlas [ "BLAS" ]
@@ -167,6 +170,9 @@ effectiveStdenv.mkDerivation (
# TODO: Replace with autoAddDriverRunpath
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
cudaPackages.autoAddOpenGLRunpathHook
+ ]
+ ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
+ glibc.static
];
buildInputs =
@@ -181,7 +187,7 @@ effectiveStdenv.mkDerivation (
[
(cmakeBool "LLAMA_NATIVE" false)
(cmakeBool "LLAMA_BUILD_SERVER" true)
- (cmakeBool "BUILD_SHARED_LIBS" true)
+ (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
@@ -190,6 +196,7 @@ effectiveStdenv.mkDerivation (
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan)
+ (cmakeBool "LLAMA_STATIC" enableStatic)
]
++ optionals useCuda [
(
diff --git a/.devops/nix/sif.nix b/.devops/nix/sif.nix
index 7535ca0f3..7a5e1dd0f 100644
--- a/.devops/nix/sif.nix
+++ b/.devops/nix/sif.nix
@@ -7,7 +7,7 @@
}:
let
- optionalInt = cond: x: if cond then x else 0;
+ optionalInt = cond: x: if cond then x else 0;
in
singularity-tools.buildImage rec {
inherit (llama-cpp) name;
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 66ad85938..9144f9266 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -145,6 +145,28 @@ jobs:
cd build
ctest -L main --verbose
+ ubuntu-22-cmake-vulkan:
+ runs-on: ubuntu-22.04
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+
+ - name: Dependencies
+ id: depends
+ run: |
+ sudo apt-get update
+ sudo apt-get install build-essential libvulkan-dev
+
+ - name: Build
+ id: cmake_build
+ run: |
+ mkdir build
+ cd build
+ cmake -DLLAMA_VULKAN=ON ..
+ cmake --build . --config Release -j $(nproc)
+
ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04
diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml
index 92e1108b3..b82205992 100644
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -3,12 +3,14 @@ name: Python check requirements.txt
on:
push:
paths:
+ - '.github/workflows/python-check-requirements.yml'
- 'scripts/check-requirements.sh'
- 'convert*.py'
- 'requirements.txt'
- 'requirements/*.txt'
pull_request:
paths:
+ - '.github/workflows/python-check-requirements.yml'
- 'scripts/check-requirements.sh'
- 'convert*.py'
- 'requirements.txt'
@@ -26,4 +28,4 @@ jobs:
with:
python-version: "3.11"
- name: Run check-requirements.sh script
- run: bash scripts/check-requirements.sh nocleanup
+ run: bash scripts/check-requirements.sh
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 0b6f6669b..f9aeefaa8 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -3,6 +3,11 @@ name: Server
on:
workflow_dispatch: # allows manual triggering
+ inputs:
+ slow_tests:
+ description: 'Run slow tests'
+ required: true
+ type: boolean
push:
branches:
- master
@@ -10,6 +15,8 @@ on:
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
+ schedule:
+ - cron: '0 0 * * *'
jobs:
server:
@@ -51,7 +58,8 @@ jobs:
cmake \
python3-pip \
wget \
- psmisc
+ psmisc \
+ language-pack-en
- name: Build
id: cmake_build
@@ -70,14 +78,15 @@ jobs:
run: |
pip install -r examples/server/tests/requirements.txt
- - name: Download models
- id: download_models
- run: |
- cd examples/server/tests
- ../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf
-
- name: Tests
- id: server_integration_test
+ id: server_integration_tests
run: |
cd examples/server/tests
PORT=8888 ./tests.sh
+
+ - name: Slow tests
+ id: server_integration_tests_slow
+ if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }}
+ run: |
+ cd examples/server/tests
+ PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
diff --git a/Makefile b/Makefile
index 241bbeb5a..3459c5470 100644
--- a/Makefile
+++ b/Makefile
@@ -724,10 +724,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
- $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
- $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
diff --git a/README-sycl.md b/README-sycl.md
index dd5bf9dea..85eb16f2b 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -1,6 +1,7 @@
# llama.cpp for SYCL
- [Background](#background)
+- [News](#news)
- [OS](#os)
- [Intel GPU](#intel-gpu)
- [Docker](#docker)
@@ -25,6 +26,21 @@ The llama.cpp for SYCL is used to support Intel GPUs.
For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
+## News
+
+- 2024.3
+ - Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet, it is still under development.
+ - Support assigning the main GPU with **--main-gpu**, replacing $GGML_SYCL_DEVICE.
+ - Support detecting all GPUs with level-zero that have the same top **Max compute units**.
+ - Support OPs
+ - hardsigmoid
+ - hardswish
+ - pool2d
+
+- 2024.1
+ - Create SYCL backend for Intel GPU.
+ - Support Windows build
+
## OS
|OS|Status|Verified|
@@ -449,6 +465,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
+|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support getting the free memory of the GPU via sycl::aspect::ext_intel_free_memory. Recommended when --split-mode = layer|
## Known Issue
@@ -458,6 +475,10 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
Solution: add **--no-mmap** or **--mmap 0**.
+- Split-mode: [row] is not supported
+
+ It is still under development.
+
## Q&A
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
diff --git a/README.md b/README.md
index 5401e197f..f754022de 100644
--- a/README.md
+++ b/README.md
@@ -8,8 +8,14 @@
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
+### Recent API changes
+
+- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
+- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
+
### Hot topics
+- The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761))
- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
@@ -785,7 +791,7 @@ And after 4.45 hours, you will have the final perplexity.
### Interactive mode
If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
-In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
+In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
Here is an example of a few-shot interaction, invoked with the command
@@ -849,7 +855,7 @@ Sample run:
```
== Running in interactive mode. ==
- Press Ctrl+C to interject at any time.
- - Press Return to return control to LLaMa.
+ - Press Return to return control to LLaMA.
- If you want to submit another line, end your input in '\'.
Below is an instruction that describes a task. Write a response that appropriately completes the request.
diff --git a/ci/run.sh b/ci/run.sh
index f3a29c2e9..51f4c74cc 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -45,7 +45,8 @@ fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then
- echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
+ echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
+ echo "source /opt/intel/oneapi/setvars.sh"
exit 1
fi
@@ -272,19 +273,19 @@ function gg_run_open_llama_3b_v2 {
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
- (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
- (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
- (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
- (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
- (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
- (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
- (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
- (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
- (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
- (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
- (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+ (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+ (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+ (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+ (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+ (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+ (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+ (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+ (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+ (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+ (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+ (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
- (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+ (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
@@ -343,17 +344,17 @@ function gg_run_open_llama_3b_v2 {
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
- (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
- (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+ (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+ (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0
- (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
- (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+ (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+ (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
- (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+ (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 3859bba81..82a19d0de 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -19,7 +19,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
endif()
endif()
- set(GIT_INDEX "${GIT_DIR}/index")
+ if(EXISTS "${GIT_DIR}/index")
+ set(GIT_INDEX "${GIT_DIR}/index")
+ else()
+ message(WARNING "Git index not found in git repository.")
+ set(GIT_INDEX "")
+ endif()
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
set(GIT_INDEX "")
diff --git a/common/common.cpp b/common/common.cpp
index 18289755c..c244db644 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -335,6 +335,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.yarn_beta_slow = std::stof(argv[i]);
+ } else if (arg == "--pooling") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ std::string value(argv[i]);
+ /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+ else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+ else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+ else { invalid_param = true; break; }
} else if (arg == "--defrag-thold" || arg == "-dt") {
if (++i >= argc) {
invalid_param = true;
@@ -503,12 +513,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_sequences = std::stoi(argv[i]);
- } else if (arg == "--p-accept" || arg == "-pa") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.p_accept = std::stof(argv[i]);
} else if (arg == "--p-split" || arg == "-ps") {
if (++i >= argc) {
invalid_param = true;
@@ -640,6 +644,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
} else if (arg_next == "layer") {
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
} else if (arg_next == "row") {
+#ifdef GGML_USE_SYCL
+ fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
+ exit(1);
+#endif // GGML_USE_SYCL
params.split_mode = LLAMA_SPLIT_MODE_ROW;
} else {
invalid_param = true;
@@ -1010,12 +1018,14 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
+ printf(" --pooling {none,mean,cls}\n");
+ printf(" pooling type for embeddings, use model default if unspecified\n");
printf(" -dt N, --defrag-thold N\n");
printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
printf(" --no-penalize-nl do not penalize newline token\n");
printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
- printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
+ printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n");
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
@@ -1028,7 +1038,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
- printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
@@ -1281,10 +1290,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_batch = params.n_batch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
- cparams.mul_mat_q = params.mul_mat_q;
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
- cparams.embedding = params.embedding;
+ cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;
cparams.rope_freq_base = params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale;
@@ -1293,6 +1301,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.yarn_beta_fast = params.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
+ cparams.pooling_type = params.pooling_type;
cparams.defrag_thold = params.defrag_thold;
cparams.offload_kqv = !params.no_kv_offload;
@@ -1725,7 +1734,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
- fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
diff --git a/common/common.h b/common/common.h
index 25003df26..977ce419f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -43,7 +43,7 @@ extern char const *LLAMA_BUILD_TARGET;
int32_t get_num_physical_cores();
struct gpt_params {
- uint32_t seed = -1; // RNG seed
+ uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_threads_draft = -1;
@@ -53,11 +53,10 @@ struct gpt_params {
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
- int32_t n_draft = 8; // number of tokens to draft during speculative decoding
+ int32_t n_draft = 5; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
- float p_accept = 0.5f; // speculative decoding accept probability
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
@@ -76,8 +75,11 @@ struct gpt_params {
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
- int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
- ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
+
+ ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
+
+ llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+ llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
// // sampling parameters
struct llama_sampling_params sparams;
@@ -115,7 +117,6 @@ struct gpt_params {
bool kl_divergence = false; // compute KL-divergence
- bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
diff --git a/common/log.h b/common/log.h
index e4e1b9f4f..eb111e784 100644
--- a/common/log.h
+++ b/common/log.h
@@ -297,7 +297,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#ifndef _MSC_VER
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
- #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
+ #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// Main TEE macro.
@@ -311,7 +311,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#ifndef _MSC_VER
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
- #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
+ #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
@@ -319,8 +319,8 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else
- #define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
- #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
+ #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
+ #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
#endif
// INTERNAL, DO NOT USE
diff --git a/common/sampling.cpp b/common/sampling.cpp
index b0b960b73..5e8588b19 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -300,6 +300,77 @@ static llama_token llama_sampling_sample_impl(
return id;
}
+static llama_token_data_array llama_sample_probability_distribution_impl(
+ struct llama_sampling_context * ctx_sampling,
+ struct llama_context * ctx_main,
+ struct llama_context * ctx_cfg,
+ const int idx) {
+ const llama_sampling_params & params = ctx_sampling->params;
+
+ const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
+
+ const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
+ const float penalty_repeat = params.penalty_repeat;
+ const float penalty_freq = params.penalty_freq;
+ const float penalty_present = params.penalty_present;
+ const bool penalize_nl = params.penalize_nl;
+
+ auto & prev = ctx_sampling->prev;
+ auto & cur = ctx_sampling->cur;
+
+ // Get a pointer to the logits
+ float * logits = llama_get_logits_ith(ctx_main, idx);
+
+ // Declare original_logits at the beginning of the function scope
+ std::vector<float> original_logits;
+
+ // apply params.logit_bias map
+ for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+ logits[it->first] += it->second;
+ }
+
+ if (ctx_cfg) {
+ float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
+ llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
+ }
+
+ cur.clear();
+
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+ cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+ }
+
+ llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+
+ // apply penalties
+ const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
+ const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
+ if (penalty_tokens_used_size) {
+ const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
+
+ llama_sample_repetition_penalties(ctx_main, &cur_p,
+ penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
+ penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
+
+ if (!penalize_nl) {
+ for (size_t idx = 0; idx < cur_p.size; idx++) {
+ if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
+ cur_p.data[idx].logit = nl_logit;
+ break;
+ }
+ }
+ }
+ }
+
+ // apply grammar checks
+ if (ctx_sampling->grammar != NULL) {
+ llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
+ }
+
+ llama_sample_softmax(ctx_main, &cur_p);
+ return cur_p;
+}
+
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
@@ -309,6 +380,14 @@ llama_token llama_sampling_sample(
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
}
+llama_token_data_array llama_sampling_probability_distribution(
+ struct llama_sampling_context * ctx_sampling,
+ struct llama_context * ctx_main,
+ struct llama_context * ctx_cfg,
+ const int idx) {
+ return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx);
+}
+
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
diff --git a/common/sampling.h b/common/sampling.h
index 95d875394..48b2459d1 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -131,6 +131,13 @@ llama_token llama_sampling_sample(
struct llama_context * ctx_cfg,
int idx = 0);
+// returns the probability that token of given id will be sampled
+llama_token_data_array llama_sampling_probability_distribution(
+ struct llama_sampling_context * ctx_sampling,
+ struct llama_context * ctx_main,
+ struct llama_context * ctx_cfg,
+ int idx = 0);
+
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index ae30b2a76..f6369af38 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -8,9 +8,10 @@ import json
import os
import re
import sys
+from abc import ABC, abstractmethod
from enum import IntEnum
from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast
+from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
import numpy as np
import torch
@@ -36,7 +37,12 @@ class SentencePieceTokenTypes(IntEnum):
BYTE = 6
-class Model:
+AnyModel = TypeVar("AnyModel", bound="type[Model]")
+
+
+class Model(ABC):
+ _model_classes: dict[str, type[Model]] = {}
+
def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
self.dir_model = dir_model
self.ftype = ftype
@@ -47,10 +53,14 @@ class Model:
self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
self.part_names = self._get_part_names()
self.hparams = Model.load_hparams(self.dir_model)
- self.model_arch = self._get_model_architecture()
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+ @property
+ @abstractmethod
+ def model_arch(self) -> gguf.MODEL_ARCH:
+ pass
+
def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
key = next((k for k in keys if k in self.hparams), None)
if key is not None:
@@ -96,9 +106,11 @@ class Model:
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
self.gguf_writer.add_head_count_kv(n_head_kv)
+ if (rope_theta := self.hparams.get("rope_theta")) is not None:
+ self.gguf_writer.add_rope_freq_base(rope_theta)
if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
- if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None:
+ if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
if (n_experts := self.hparams.get("num_local_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
@@ -174,53 +186,22 @@ class Model:
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
return json.load(f)
- @staticmethod
- def from_model_architecture(model_architecture):
- if model_architecture == "GPTNeoXForCausalLM":
- return GPTNeoXModel
- if model_architecture == "BloomForCausalLM":
- return BloomModel
- if model_architecture == "MPTForCausalLM":
- return MPTModel
- if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
- return BaichuanModel
- if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
- return FalconModel
- if model_architecture == "GPTBigCodeForCausalLM":
- return StarCoderModel
- if model_architecture == "GPTRefactForCausalLM":
- return RefactModel
- if model_architecture == "PersimmonForCausalLM":
- return PersimmonModel
- if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
- return StableLMModel
- if model_architecture == "QWenLMHeadModel":
- return QwenModel
- if model_architecture == "Qwen2ForCausalLM":
- return Model
- if model_architecture == "MixtralForCausalLM":
- return MixtralModel
- if model_architecture == "GPT2LMHeadModel":
- return GPT2Model
- if model_architecture == "PhiForCausalLM":
- return Phi2Model
- if model_architecture == "PlamoForCausalLM":
- return PlamoModel
- if model_architecture == "CodeShellForCausalLM":
- return CodeShellModel
- if model_architecture == "OrionForCausalLM":
- return OrionModel
- if model_architecture == "InternLM2ForCausalLM":
- return InternLM2Model
- if model_architecture == "MiniCPMForCausalLM":
- return MiniCPMModel
- if model_architecture == "BertModel":
- return BertModel
- if model_architecture == "NomicBertModel":
- return NomicBertModel
- if model_architecture == "GemmaForCausalLM":
- return GemmaModel
- return Model
+ @classmethod
+ def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
+ assert names
+
+ def func(modelcls: type[Model]):
+ for name in names:
+ cls._model_classes[name] = modelcls
+ return modelcls
+ return func
+
+ @classmethod
+ def from_model_architecture(cls, arch):
+ try:
+ return cls._model_classes[arch]
+ except KeyError:
+ raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
def _is_model_safetensors(self) -> bool:
return Model.count_model_parts(self.dir_model, ".safetensors") > 0
@@ -235,55 +216,6 @@ class Model:
return ("pytorch_model.bin",)
return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
- def _get_model_architecture(self) -> gguf.MODEL_ARCH:
- arch = self.hparams["architectures"][0]
- if arch == "GPTNeoXForCausalLM":
- return gguf.MODEL_ARCH.GPTNEOX
- if arch == "BloomForCausalLM":
- return gguf.MODEL_ARCH.BLOOM
- if arch == "MPTForCausalLM":
- return gguf.MODEL_ARCH.MPT
- if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
- return gguf.MODEL_ARCH.BAICHUAN
- if arch in ("FalconForCausalLM", "RWForCausalLM"):
- return gguf.MODEL_ARCH.FALCON
- if arch == "GPTBigCodeForCausalLM":
- return gguf.MODEL_ARCH.STARCODER
- if arch == "GPTRefactForCausalLM":
- return gguf.MODEL_ARCH.REFACT
- if arch == "PersimmonForCausalLM":
- return gguf.MODEL_ARCH.PERSIMMON
- if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
- return gguf.MODEL_ARCH.STABLELM
- if arch == "QWenLMHeadModel":
- return gguf.MODEL_ARCH.QWEN
- if arch == "Qwen2ForCausalLM":
- return gguf.MODEL_ARCH.QWEN2
- if arch == "MixtralForCausalLM":
- return gguf.MODEL_ARCH.LLAMA
- if arch == "GPT2LMHeadModel":
- return gguf.MODEL_ARCH.GPT2
- if arch == "PhiForCausalLM":
- return gguf.MODEL_ARCH.PHI2
- if arch == "PlamoForCausalLM":
- return gguf.MODEL_ARCH.PLAMO
- if arch == "CodeShellForCausalLM":
- return gguf.MODEL_ARCH.CODESHELL
- if arch == "OrionForCausalLM":
- return gguf.MODEL_ARCH.ORION
- if arch == "InternLM2ForCausalLM":
- return gguf.MODEL_ARCH.INTERNLM2
- if arch == "MiniCPMForCausalLM":
- return gguf.MODEL_ARCH.MINICPM
- if arch == "BertModel":
- return gguf.MODEL_ARCH.BERT
- if arch == "NomicBertModel":
- return gguf.MODEL_ARCH.NOMIC_BERT
- if arch == "GemmaForCausalLM":
- return gguf.MODEL_ARCH.GEMMA
-
- raise NotImplementedError(f'Architecture "{arch}" not supported!')
-
def _set_vocab_gpt2(self):
dir_model = self.dir_model
hparams = self.hparams
@@ -451,7 +383,10 @@ class Model:
special_vocab.add_to_gguf(self.gguf_writer)
+@Model.register("GPTNeoXForCausalLM")
class GPTNeoXModel(Model):
+ model_arch = gguf.MODEL_ARCH.GPTNEOX
+
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
@@ -468,7 +403,10 @@ class GPTNeoXModel(Model):
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
+@Model.register("BloomForCausalLM")
class BloomModel(Model):
+ model_arch = gguf.MODEL_ARCH.BLOOM
+
def set_gguf_parameters(self):
self.gguf_writer.add_name("Bloom")
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
@@ -560,7 +498,10 @@ class BloomModel(Model):
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+@Model.register("MPTForCausalLM")
class MPTModel(Model):
+ model_arch = gguf.MODEL_ARCH.MPT
+
def set_gguf_parameters(self):
block_count = self.hparams["n_layers"]
self.gguf_writer.add_name(self.dir_model.name)
@@ -623,7 +564,10 @@ class MPTModel(Model):
self.gguf_writer.add_tensor(new_name, data)
+@Model.register("OrionForCausalLM")
class OrionModel(Model):
+ model_arch = gguf.MODEL_ARCH.ORION
+
def set_vocab(self):
self._set_vocab_sentencepiece()
@@ -702,7 +646,10 @@ class OrionModel(Model):
self.gguf_writer.add_tensor(new_name, data)
+@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
class BaichuanModel(Model):
+ model_arch = gguf.MODEL_ARCH.BAICHUAN
+
def set_vocab(self):
self._set_vocab_sentencepiece()
@@ -817,7 +764,10 @@ class BaichuanModel(Model):
return weights[r * n_part:r * n_part + r, ...]
+@Model.register("FalconForCausalLM", "RWForCausalLM")
class FalconModel(Model):
+ model_arch = gguf.MODEL_ARCH.FALCON
+
def set_gguf_parameters(self):
block_count = self.hparams.get("num_hidden_layers")
if block_count is None:
@@ -910,7 +860,10 @@ class FalconModel(Model):
self.gguf_writer.add_tensor(new_name, data)
+@Model.register("GPTBigCodeForCausalLM")
class StarCoderModel(Model):
+ model_arch = gguf.MODEL_ARCH.STARCODER
+
def set_gguf_parameters(self):
block_count = self.hparams["n_layer"]
@@ -925,7 +878,10 @@ class StarCoderModel(Model):
self.gguf_writer.add_file_type(self.ftype)
+@Model.register("GPTRefactForCausalLM")
class RefactModel(Model):
+ model_arch = gguf.MODEL_ARCH.REFACT
+
def set_gguf_parameters(self):
hidden_dim = self.hparams["n_embd"]
inner_dim = 4 * hidden_dim
@@ -1009,7 +965,10 @@ class RefactModel(Model):
self.gguf_writer.add_tensor(new_name, data)
+@Model.register("PersimmonForCausalLM")
class PersimmonModel(Model):
+ model_arch = gguf.MODEL_ARCH.PERSIMMON
+
def set_gguf_parameters(self):
block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
head_count = self.hparams["num_attention_heads"]
@@ -1057,7 +1016,10 @@ class PersimmonModel(Model):
self.gguf_writer.add_tensor(new_name, data)
+@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
class StableLMModel(Model):
+ model_arch = gguf.MODEL_ARCH.STABLELM
+
def set_vocab(self):
if (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
@@ -1081,12 +1043,18 @@ class StableLMModel(Model):
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
+@Model.register("MixtralForCausalLM")
class MixtralModel(Model):
+ model_arch = gguf.MODEL_ARCH.LLAMA
+
def set_vocab(self):
self._set_vocab_sentencepiece()
+@Model.register("MiniCPMForCausalLM")
class MiniCPMModel(Model):
+ model_arch = gguf.MODEL_ARCH.MINICPM
+
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
self.gguf_writer.add_name("MiniCPM")
@@ -1163,7 +1131,10 @@ class MiniCPMModel(Model):
self.gguf_writer.add_tensor(new_name, data)
+@Model.register("QWenLMHeadModel")
class QwenModel(Model):
+ model_arch = gguf.MODEL_ARCH.QWEN
+
@staticmethod
def token_bytes_to_string(b):
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
@@ -1243,7 +1214,15 @@ class QwenModel(Model):
self.gguf_writer.add_tensor(new_name, data)
+@Model.register("Qwen2ForCausalLM")
+class Qwen2Model(Model):
+ model_arch = gguf.MODEL_ARCH.QWEN2
+
+
+@Model.register("GPT2LMHeadModel")
class GPT2Model(Model):
+ model_arch = gguf.MODEL_ARCH.GPT2
+
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.hparams["n_layer"])
@@ -1305,7 +1284,10 @@ class GPT2Model(Model):
self.gguf_writer.add_tensor("output.weight", data)
+@Model.register("PhiForCausalLM")
class Phi2Model(Model):
+ model_arch = gguf.MODEL_ARCH.PHI2
+
def set_gguf_parameters(self):
block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
@@ -1327,7 +1309,10 @@ class Phi2Model(Model):
self.gguf_writer.add_add_bos_token(False)
+@Model.register("PlamoForCausalLM")
class PlamoModel(Model):
+ model_arch = gguf.MODEL_ARCH.PLAMO
+
def set_vocab(self):
self._set_vocab_sentencepiece()
@@ -1406,7 +1391,10 @@ class PlamoModel(Model):
self.gguf_writer.add_tensor(new_name, data)
+@Model.register("CodeShellForCausalLM")
class CodeShellModel(Model):
+ model_arch = gguf.MODEL_ARCH.CODESHELL
+
def set_gguf_parameters(self):
block_count = self.hparams["n_layer"]
@@ -1471,7 +1459,10 @@ class CodeShellModel(Model):
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+@Model.register("InternLM2ForCausalLM")
class InternLM2Model(Model):
+ model_arch = gguf.MODEL_ARCH.INTERNLM2
+
def set_vocab(self):
# (TODO): Is there a better way?
# Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character
@@ -1643,7 +1634,10 @@ in chat mode so that the conversation can end normally.")
self.post_write_tensors(tensor_map, name, data_torch)
+@Model.register("BertModel")
class BertModel(Model):
+ model_arch = gguf.MODEL_ARCH.BERT
+
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.vocab_size = None
@@ -1653,16 +1647,17 @@ class BertModel(Model):
self.gguf_writer.add_causal_attention(False)
# get pooling path
- with open(self.dir_model / "modules.json", encoding="utf-8") as f:
- modules = json.load(f)
pooling_path = None
- for mod in modules:
- if mod["type"] == "sentence_transformers.models.Pooling":
- pooling_path = mod["path"]
- break
+ module_path = self.dir_model / "modules.json"
+ if module_path.is_file():
+ with open(module_path, encoding="utf-8") as f:
+ modules = json.load(f)
+ for mod in modules:
+ if mod["type"] == "sentence_transformers.models.Pooling":
+ pooling_path = mod["path"]
+ break
# get pooling type
- pooling_type = gguf.PoolingType.NONE
if pooling_path is not None:
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
pooling = json.load(f)
@@ -1672,8 +1667,7 @@ class BertModel(Model):
pooling_type = gguf.PoolingType.CLS
else:
raise NotImplementedError("Only MEAN and CLS pooling types supported")
-
- self.gguf_writer.add_pooling_type(pooling_type.value)
+ self.gguf_writer.add_pooling_type(pooling_type)
def set_vocab(self):
path = self.dir_model
@@ -1749,7 +1743,10 @@ class BertModel(Model):
self.gguf_writer.add_tensor(new_name, data)
+@Model.register("NomicBertModel")
class NomicBertModel(BertModel):
+ model_arch = gguf.MODEL_ARCH.NOMIC_BERT
+
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -1786,7 +1783,10 @@ class NomicBertModel(BertModel):
yield name, data
+@Model.register("GemmaForCausalLM")
class GemmaModel(Model):
+ model_arch = gguf.MODEL_ARCH.GEMMA
+
def set_vocab(self):
self._set_vocab_sentencepiece()
@@ -1811,16 +1811,15 @@ class GemmaModel(Model):
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors():
- # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
- if name.endswith("norm.weight"):
- data_torch = data_torch + 1
-
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
+ # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+ if name.endswith("norm.weight"):
+ data_torch = data_torch + 1
data = data_torch.squeeze().numpy()
# map tensor names
@@ -1843,6 +1842,11 @@ class GemmaModel(Model):
self.gguf_writer.add_tensor(new_name, data)
+@Model.register("Starcoder2ForCausalLM")
+class StarCoder2Model(Model):
+ model_arch = gguf.MODEL_ARCH.STARCODER2
+
+
###### CONVERSION LOGIC ######
diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py
index b33108062..cd9644fcb 100755
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -373,7 +373,7 @@ def handle_metadata(cfg, hp):
raise ValueError('Unable to load metadata')
vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
vocab_factory = convert.VocabFactory(vocab_path)
- vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
+ vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
convert.check_vocab_size(params, vocab)
return params, vocab, special_vocab
@@ -398,8 +398,8 @@ def handle_args():
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path,
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
- parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
- help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
+ parser.add_argument("--vocabtype", default="spm,hfft",
+ help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
return parser.parse_args()
diff --git a/convert.py b/convert.py
index 63a0a5d78..c15f8c47e 100755
--- a/convert.py
+++ b/convert.py
@@ -1282,35 +1282,32 @@ def load_some_model(path: Path) -> ModelPlus:
class VocabFactory:
+ _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
+
def __init__(self, path: Path):
self.path = path
- self.files: dict[str, Path | None] = {
- "tokenizer.model": None,
- "vocab.json": None,
- "tokenizer.json": None,
- }
- self._detect_files()
+ self.file_paths = self._detect_files()
+ print(f"Found vocab files: {self.file_paths}")
- def _detect_files(self):
- for file in self.files.keys():
- file_path = self.path / file
- parent_file_path = self.path.parent / file
- if file_path.exists():
- self.files[file] = file_path
- elif parent_file_path.exists():
- self.files[file] = parent_file_path
- print(f"Found vocab files: {self.files}")
+ def _detect_files(self) -> dict[str, Path | None]:
+ def locate(file: str) -> Path | None:
+ if (path := self.path / file).exists():
+ return path
+ if (path := self.path.parent / file).exists():
+ return path
+ return None
- def _select_file(self, vocabtype: str | None) -> Path:
- if vocabtype in ["spm", "bpe"]:
- for file_key in self.files.keys():
- if (file := self.files[file_key]) is not None:
- return file
- raise FileNotFoundError(f"{vocabtype} vocab not found.")
- if vocabtype == "hfft":
- # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
- return self.path
- raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+ return {vt: locate(f) for vt, f in self._FILES.items()}
+
+ def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
+ for vtype in vocab_types:
+ try:
+ path = self.file_paths[vtype]
+ except KeyError:
+ raise ValueError(f"Unsupported vocabulary type {vtype}") from None
+ if path is not None:
+ return vtype, path
+ raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
load_merges = vocabtype == "bpe"
@@ -1322,30 +1319,30 @@ class VocabFactory:
n_vocab=n_vocab,
)
- def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
- path = self._select_file(vocabtype)
- print(f"Loading vocab file '{path}', type '{vocabtype}'")
+ def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+ vocab_type, path = self._select_file(vocab_types)
+ print(f"Loading vocab file {path!r}, type {vocab_type!r}")
added_tokens_path = path.parent / "added_tokens.json"
vocab: Vocab
- if vocabtype == "bpe":
+ if vocab_type == "bpe":
vocab = BpeVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
- elif vocabtype == "spm":
+ elif vocab_type == "spm":
vocab = SentencePieceVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
- elif vocabtype == "hfft":
+ elif vocab_type == "hfft":
vocab = HfVocab(
- path, added_tokens_path if added_tokens_path.exists() else None
+ path.parent, added_tokens_path if added_tokens_path.exists() else None
)
else:
- raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+ raise ValueError(vocab_type)
# FIXME: Respect --vocab-dir?
special_vocab = self._create_special_vocab(
vocab,
- vocabtype,
+ vocab_type,
model_parent_path,
)
return vocab, special_vocab
@@ -1379,15 +1376,13 @@ def main(args_in: list[str] | None = None) -> None:
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0")
- vocab_types = ["spm", "bpe", "hfft"]
- parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
- parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
+ parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
- parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
+ parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
@@ -1397,18 +1392,6 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
args = parser.parse_args(args_in)
- if args.awq_path:
- sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
- from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
- tmp_model_path = args.model / "weighted_model"
- if tmp_model_path.is_dir():
- print(f"{tmp_model_path} exists as a weighted model.")
- else:
- tmp_model_path.mkdir(parents=True, exist_ok=True)
- print("Saving new weighted model ...")
- add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
- print(f"Saved weighted model at {tmp_model_path}.")
- args.model = tmp_model_path
if args.dump_single:
model_plus = lazy_load_file(args.model)
@@ -1448,7 +1431,7 @@ def main(args_in: list[str] | None = None) -> None:
model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
vocab_factory = VocabFactory(vocab_path)
- vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path)
+ vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
if args.vocab_only:
if not args.outfile:
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index b4b8a38e1..19aff18ae 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -32,16 +32,15 @@ int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] \n" , argv[0]);
+ printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] \n" , argv[0]);
printf(" , and PL are comma-separated lists of numbers without spaces\n\n");
- printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+ printf(" example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ;
}
int n_kv_max = 2048;
int is_pp_shared = 0;
int n_gpu_layers = 0;
- int mmq = 0;
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
std::vector<int> n_tg = { 128, 256, };
@@ -65,19 +64,15 @@ int main(int argc, char ** argv) {
}
if (argc >= 6) {
- mmq = std::atoi(argv[5]);
+ n_pp = parse_list(argv[5]);
}
if (argc >= 7) {
- n_pp = parse_list(argv[6]);
+ n_tg = parse_list(argv[6]);
}
if (argc >= 8) {
- n_tg = parse_list(argv[7]);
- }
-
- if (argc >= 9) {
- n_pl = parse_list(argv[8]);
+ n_pl = parse_list(argv[7]);
}
// init LLM
@@ -106,7 +101,6 @@ int main(int argc, char ** argv) {
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = 512;
- ctx_params.mul_mat_q = mmq;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -159,7 +153,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
- LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+ LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index acff715e9..ff5883da6 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -19,11 +19,11 @@ static std::vector<std::string> split_lines(const std::string & s) {
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
for (size_t i = 0; i < tokens.size(); i++) {
- llama_batch_add(batch, tokens[i], i, { seq_id }, false);
+ llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
}
}
-static void normalize(float * vec, float * out, int n) {
+static void normalize(const float * vec, float * out, int n) {
float norm = 0;
for (int i = 0; i < n; i++) {
norm += vec[i] * vec[i];
@@ -45,10 +45,23 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
}
// normalize on copy
- for (int k = 0; k < n_seq; k++) {
- float * emb = llama_get_embeddings_ith(ctx, k);
- float * out = output + k * n_embd;
- normalize(emb, out, n_embd);
+ for (int i = 0; i < batch.n_tokens; i++) {
+ if (!batch.logits[i]) {
+ continue;
+ }
+
+ // try to get sequence embeddings - supported only when pooling_type is not NONE
+ const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+ if (embd == NULL) {
+ embd = llama_get_embeddings_ith(ctx, i);
+ if (embd == NULL) {
+ fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
+ continue;
+ }
+ }
+
+ float * out = output + batch.seq_id[i][0] * n_embd;
+ normalize(embd, out, n_embd);
}
}
@@ -132,7 +145,7 @@ int main(int argc, char ** argv) {
// initialize batch
const int n_prompts = prompts.size();
- struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);
+ struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
// allocate output
const int n_embd = llama_n_embd(model);
@@ -145,6 +158,7 @@ int main(int argc, char ** argv) {
for (int k = 0; k < n_prompts; k++) {
// clamp to n_batch tokens
auto & inp = inputs[k];
+
const uint64_t n_toks = inp.size();
// encode if at capacity
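
Note: the `batch_decode` change above fetches sequence-level embeddings when pooling is enabled (falling back to per-token embeddings) and L2-normalizes them as they are copied out. A minimal sketch, not part of this patch, of why that normalization is useful: once the vectors are unit length, cosine similarity reduces to a plain dot product.

```python
# Minimal sketch (not part of the patch): once embeddings are L2-normalized,
# cosine similarity is just a dot product, which is what the normalize()
# helper above sets up for the copied sequence embeddings.
import numpy as np

def normalize(vec: np.ndarray) -> np.ndarray:
    norm = np.linalg.norm(vec)
    return vec / norm if norm > 0 else vec

a = normalize(np.array([0.3, -1.2, 0.8]))
b = normalize(np.array([0.1, -0.9, 1.1]))

# for unit vectors, np.dot(a, b) equals np.dot(a, b) / (|a| * |b|)
print(f"cosine similarity: {float(np.dot(a, b)):.3f}")
```

Embeddings returned this way can therefore be compared downstream with a single dot product per pair.
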
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index d4b8729dd..91c39c5ae 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -378,10 +378,10 @@ int main(int argc, char ** argv) {
if (params.interactive) {
const char *control_message;
if (params.multiline_input) {
- control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+ control_message = " - To return control to LLaMA, end your input with '\\'.\n"
" - To return control without starting a new line, end your input with '/'.\n";
} else {
- control_message = " - Press Return to return control to LLaMa.\n"
+ control_message = " - Press Return to return control to LLaMA.\n"
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md
index 374e40a7d..10f37b441 100644
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -35,7 +35,6 @@ options:
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1)
- -mmq, --mul-mat-q <0|1> (default: 1)
-ts, --tensor_split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md)
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 8fec3d43d..2ff86ef6f 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -123,20 +123,15 @@ static std::string get_gpu_info() {
}
#endif
#ifdef GGML_USE_SYCL
- int device_list[GGML_SYCL_MAX_DEVICES];
- ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
-
- for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
- if (device_list[i] >0 ){
- char buf[128];
- ggml_sycl_get_device_description(i, buf, sizeof(buf));
- id += buf;
+ int count = ggml_backend_sycl_get_device_count();
+ for (int i = 0; i < count; i++) {
+ char buf[128];
+ ggml_sycl_get_device_description(i, buf, sizeof(buf));
+ id += buf;
+ if (i < count - 1) {
id += "/";
}
}
- if (id.length() >2 ) {
- id.pop_back();
- }
#endif
// TODO: other backends
return id;
@@ -176,9 +171,9 @@ struct cmd_params {
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
- std::vector<bool> mul_mat_q;
std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap;
+ std::vector<bool> embeddings;
int reps;
bool verbose;
output_formats output_format;
@@ -196,9 +191,9 @@ static const cmd_params cmd_params_defaults = {
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
- /* mul_mat_q */ {true},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
+ /* embeddings */ {false},
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
@@ -221,7 +216,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
- printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
+ printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor_split (default: 0)\n");
printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
@@ -383,13 +378,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
- } else if (arg == "-mmq" || arg == "--mul-mat-q") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- auto p = split(argv[i], split_delim);
- params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
@@ -397,6 +385,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
+ } else if (arg == "-embd" || arg == "--embeddings") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = split(argv[i], split_delim);
+ params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) {
invalid_param = true;
@@ -466,9 +461,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
- if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
+ if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
return params;
@@ -486,9 +481,9 @@ struct cmd_params_instance {
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
- bool mul_mat_q;
std::vector<float> tensor_split;
bool use_mmap;
+ bool embeddings;
llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params();
@@ -518,8 +513,8 @@ struct cmd_params_instance {
cparams.n_batch = n_batch;
cparams.type_k = type_k;
cparams.type_v = type_v;
- cparams.mul_mat_q = mul_mat_q;
cparams.offload_kqv = !no_kv_offload;
+ cparams.embeddings = embeddings;
return cparams;
}
@@ -535,10 +530,10 @@ static std::vector get_cmd_params_instances(const cmd_param
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & mmp : params.use_mmap)
+ for (const auto & embd : params.embeddings)
for (const auto & nb : params.n_batch)
for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
- for (const auto & mmq : params.mul_mat_q)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & nt : params.n_threads) {
for (const auto & n_prompt : params.n_prompt) {
@@ -557,9 +552,9 @@ static std::vector get_cmd_params_instances(const cmd_param
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
- /* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
+ /* .embeddings = */ embd,
};
instances.push_back(instance);
}
@@ -580,9 +575,9 @@ static std::vector get_cmd_params_instances(const cmd_param
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
- /* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
+ /* .embeddings = */ embd,
};
instances.push_back(instance);
}
@@ -616,9 +611,9 @@ struct test {
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
- bool mul_mat_q;
std::vector<float> tensor_split;
bool use_mmap;
+ bool embeddings;
int n_prompt;
int n_gen;
std::string test_time;
@@ -639,9 +634,9 @@ struct test {
split_mode = inst.split_mode;
main_gpu = inst.main_gpu;
no_kv_offload = inst.no_kv_offload;
- mul_mat_q = inst.mul_mat_q;
tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap;
+ embeddings = inst.embeddings;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
// RFC 3339 date-time format
@@ -713,7 +708,7 @@ struct test {
"n_batch", "n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload",
- "mul_mat_q", "tensor_split", "use_mmap",
+ "tensor_split", "use_mmap", "embeddings",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts"
@@ -733,7 +728,7 @@ struct test {
}
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
- field == "mul_mat_q" || field == "use_mmap") {
+ field == "use_mmap" || field == "embeddings") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
@@ -767,7 +762,7 @@ struct test {
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload),
- std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
+ tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -931,15 +926,15 @@ struct markdown_printer : public printer {
if (field == "n_threads") {
return "threads";
}
- if (field == "mul_mat_q") {
- return "mmq";
- }
if (field == "no_kv_offload") {
return "nkvo";
}
if (field == "use_mmap") {
return "mmap";
}
+ if (field == "embeddings") {
+ return "embd";
+ }
if (field == "tensor_split") {
return "ts";
}
@@ -974,9 +969,6 @@ struct markdown_printer : public printer {
if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
fields.emplace_back("split_mode");
}
- if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
- fields.emplace_back("mul_mat_q");
- }
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
fields.emplace_back("no_kv_offload");
}
@@ -986,6 +978,9 @@ struct markdown_printer : public printer {
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
fields.emplace_back("use_mmap");
}
+ if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
+ fields.emplace_back("embeddings");
+ }
fields.emplace_back("test");
fields.emplace_back("t/s");
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 34e84d0d4..47059e582 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -511,6 +511,14 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
+ // tokenized antiprompts
+ std::vector<std::vector<llama_token>> antiprompt_ids;
+
+ antiprompt_ids.reserve(params.antiprompt.size());
+ for (const std::string & antiprompt : params.antiprompt) {
+ antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
+ }
+
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
@@ -769,6 +777,18 @@ int main(int argc, char ** argv) {
}
}
+ // check for reverse prompt using special tokens
+ llama_token last_token = llama_sampling_last(ctx_sampling);
+ for (std::vector<llama_token> ids : antiprompt_ids) {
+ if (ids.size() == 1 && last_token == ids[0]) {
+ if (params.interactive) {
+ is_interacting = true;
+ }
+ is_antiprompt = true;
+ break;
+ }
+ }
+
if (is_antiprompt) {
LOG("found antiprompt: %s\n", last_output.c_str());
}
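
Note: the `main.cpp` change above tokenizes each reverse prompt once up front and, whenever an antiprompt maps to a single token, compares that id directly against the last sampled token. A rough Python sketch of the same check; `tokenize()` below is a hypothetical stand-in, not llama.cpp's tokenizer.

```python
# Conceptual sketch only; tokenize() is a toy stand-in for illustration.
# Antiprompts that map to a single token id are matched against the last
# sampled token instead of re-scanning the generated text.
def tokenize(text: str) -> list[int]:
    vocab = {"<|im_end|>": 32000}  # hypothetical toy vocabulary
    return [vocab[text]] if text in vocab else [ord(c) for c in text]

antiprompts = ["<|im_end|>", "User:"]
antiprompt_ids = [tokenize(a) for a in antiprompts]

def is_antiprompt(last_token: int) -> bool:
    return any(len(ids) == 1 and ids[0] == last_token for ids in antiprompt_ids)

print(is_antiprompt(32000))  # True: single special token matches directly
print(is_antiprompt(85))     # False: "User:" tokenizes to several ids here
```

Matching on token ids this way helps with reverse prompts that are special tokens and may never appear verbatim in the detokenized output.
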
diff --git a/examples/server-embd.py b/examples/server-embd.py
new file mode 100644
index 000000000..118e04271
--- /dev/null
+++ b/examples/server-embd.py
@@ -0,0 +1,34 @@
+import asyncio
+import requests
+import numpy as np
+
+n = 8
+
+result = []
+
+async def requests_post_async(*args, **kwargs):
+ return await asyncio.to_thread(requests.post, *args, **kwargs)
+
+async def main():
+ model_url = "http://127.0.0.1:6900"
+ responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
+ url= f"{model_url}/embedding",
+ json= {"content": str(0)*1024}
+ ) for i in range(n)])
+
+ for response in responses:
+ embedding = response.json()["embedding"]
+ print(embedding[-8:])
+ result.append(embedding)
+
+asyncio.run(main())
+
+# compute cosine similarity
+
+for i in range(n-1):
+ for j in range(i+1, n):
+ embedding1 = np.array(result[i])
+ embedding2 = np.array(result[j])
+ similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
+ print(f"Similarity between {i} and {j}: {similarity:.2f}")
+
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index cc13b2d63..c21eba634 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,12 +1,12 @@
set(TARGET server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
+add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
diff --git a/examples/server/README.md b/examples/server/README.md
index 582278770..d0ab9709d 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -18,6 +18,7 @@ The project is under active development, and we are [looking for feedback and co
- `--threads N`, `-t N`: Set the number of threads to use during generation.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
+- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
@@ -339,7 +340,7 @@ where:
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
-- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.
+- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc. can be used with this endpoint.
*Options:*
@@ -449,7 +450,7 @@ where:
"next_token": {
"has_next_token": true,
"n_remain": -1,
- "num_tokens_predicted": 0,
+ "n_decoded": 0,
"stopped_eos": false,
"stopped_limit": false,
"stopped_word": false,
@@ -541,20 +542,7 @@ bash chat.sh
### API like OAI
-API example using Python Flask: [api_like_OAI.py](api_like_OAI.py)
-This example must be used with server.cpp
-
-```sh
-python api_like_OAI.py
-```
-
-After running the API server, you can use it in Python by setting the API base URL.
-
-```python
-openai.api_base = "http://:port"
-```
-
-Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API
+The HTTP server supports an OAI-like API.
### Extending or building alternative Web Front End
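
Note: with `api_like_OAI.py` removed, clients can talk to the built-in `/v1/chat/completions` endpoint directly. A minimal sketch, assuming a server running on its default `127.0.0.1:8080` and using the standard `requests` package:

```python
# Minimal sketch: POST straight to the built-in OpenAI-compatible endpoint.
# Assumes a llama.cpp server running on the default host/port (127.0.0.1:8080).
import requests

resp = requests.post(
    "http://127.0.0.1:8080/v1/chat/completions",
    json={
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say hello in one short sentence."},
        ],
        "max_tokens": 64,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```
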
diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py
deleted file mode 100755
index 607fe49d3..000000000
--- a/examples/server/api_like_OAI.py
+++ /dev/null
@@ -1,228 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-from flask import Flask, jsonify, request, Response
-import urllib.parse
-import requests
-import time
-import json
-
-
-app = Flask(__name__)
-slot_id = -1
-
-parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
-parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')
-parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: 'USER: ')", default="USER: ")
-parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: 'ASSISTANT: ')", default="ASSISTANT: ")
-parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: 'ASSISTANT's RULE: ')", default="ASSISTANT's RULE: ")
-parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '')", default="")
-parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080')
-parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="")
-parser.add_argument("--host", type=str, help="Set the ip address to listen.(default: 127.0.0.1)", default='127.0.0.1')
-parser.add_argument("--port", type=int, help="Set the port to listen.(default: 8081)", default=8081)
-
-args = parser.parse_args()
-
-def is_present(json, key):
- try:
- buf = json[key]
- except KeyError:
- return False
- if json[key] == None:
- return False
- return True
-
-#convert chat to prompt
-def convert_chat(messages):
-
- system_n = args.system_name
- user_n = args.user_name
- ai_n = args.ai_name
- stop = args.stop
-
- prompt = "" + args.chat_prompt + stop
-
- for line in messages:
- if (line["role"] == "system"):
- prompt += f"{system_n}{line['content']}{stop}"
- if (line["role"] == "user"):
- prompt += f"{user_n}{line['content']}{stop}"
- if (line["role"] == "assistant"):
- prompt += f"{ai_n}{line['content']}{stop}"
- prompt += ai_n.rstrip()
-
- return prompt
-
-def make_postData(body, chat=False, stream=False):
- postData = {}
- if (chat):
- postData["prompt"] = convert_chat(body["messages"])
- else:
- postData["prompt"] = body["prompt"]
- if(is_present(body, "temperature")): postData["temperature"] = body["temperature"]
- if(is_present(body, "top_k")): postData["top_k"] = body["top_k"]
- if(is_present(body, "top_p")): postData["top_p"] = body["top_p"]
- if(is_present(body, "max_tokens")): postData["n_predict"] = body["max_tokens"]
- if(is_present(body, "presence_penalty")): postData["presence_penalty"] = body["presence_penalty"]
- if(is_present(body, "frequency_penalty")): postData["frequency_penalty"] = body["frequency_penalty"]
- if(is_present(body, "repeat_penalty")): postData["repeat_penalty"] = body["repeat_penalty"]
- if(is_present(body, "mirostat")): postData["mirostat"] = body["mirostat"]
- if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"]
- if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"]
- if(is_present(body, "seed")): postData["seed"] = body["seed"]
- if(is_present(body, "grammar")): postData["grammar"] = body["grammar"]
- if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()]
- if (args.stop != ""):
- postData["stop"] = [args.stop]
- else:
- postData["stop"] = []
- if(is_present(body, "stop")): postData["stop"] += body["stop"]
- postData["n_keep"] = -1
- postData["stream"] = stream
- postData["cache_prompt"] = True
- postData["slot_id"] = slot_id
- return postData
-
-def make_resData(data, chat=False, promptToken=[]):
- resData = {
- "id": "chatcmpl" if (chat) else "cmpl",
- "object": "chat.completion" if (chat) else "text_completion",
- "created": int(time.time()),
- "truncated": data["truncated"],
- "model": "LLaMA_CPP",
- "usage": {
- "prompt_tokens": data["tokens_evaluated"],
- "completion_tokens": data["tokens_predicted"],
- "total_tokens": data["tokens_evaluated"] + data["tokens_predicted"]
- }
- }
- if (len(promptToken) != 0):
- resData["promptToken"] = promptToken
- if (chat):
- #only one choice is supported
- resData["choices"] = [{
- "index": 0,
- "message": {
- "role": "assistant",
- "content": data["content"],
- },
- "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
- }]
- else:
- #only one choice is supported
- resData["choices"] = [{
- "text": data["content"],
- "index": 0,
- "logprobs": None,
- "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
- }]
- return resData
-
-def make_resData_stream(data, chat=False, time_now = 0, start=False):
- resData = {
- "id": "chatcmpl" if (chat) else "cmpl",
- "object": "chat.completion.chunk" if (chat) else "text_completion.chunk",
- "created": time_now,
- "model": "LLaMA_CPP",
- "choices": [
- {
- "finish_reason": None,
- "index": 0
- }
- ]
- }
- slot_id = data.get("slot_id")
- if (chat):
- if (start):
- resData["choices"][0]["delta"] = {
- "role": "assistant"
- }
- else:
- resData["choices"][0]["delta"] = {
- "content": data["content"]
- }
- if (data["stop"]):
- resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
- else:
- resData["choices"][0]["text"] = data["content"]
- if (data["stop"]):
- resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
-
- return resData
-
-
-@app.route('/chat/completions', methods=['POST', 'OPTIONS'])
-@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS'])
-def chat_completions():
- if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
- return Response(status=403)
- if request.method == 'OPTIONS':
- return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
- body = request.get_json()
- stream = False
- tokenize = False
- if(is_present(body, "stream")): stream = body["stream"]
- if(is_present(body, "tokenize")): tokenize = body["tokenize"]
- postData = make_postData(body, chat=True, stream=stream)
-
- promptToken = []
- if (tokenize):
- tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json()
- promptToken = tokenData["tokens"]
-
- if (not stream):
- data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData))
- print(data.json())
- resData = make_resData(data.json(), chat=True, promptToken=promptToken)
- return jsonify(resData)
- else:
- def generate():
- data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
- time_now = int(time.time())
- resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
- yield 'data: {}\n\n'.format(json.dumps(resData))
- for line in data.iter_lines():
- if line:
- decoded_line = line.decode('utf-8')
- resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
- yield 'data: {}\n\n'.format(json.dumps(resData))
- return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
-
-
-@app.route('/completions', methods=['POST', 'OPTIONS'])
-@app.route('/v1/completions', methods=['POST', 'OPTIONS'])
-def completion():
- if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
- return Response(status=403)
- if request.method == 'OPTIONS':
- return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
- body = request.get_json()
- stream = False
- tokenize = False
- if(is_present(body, "stream")): stream = body["stream"]
- if(is_present(body, "tokenize")): tokenize = body["tokenize"]
- postData = make_postData(body, chat=False, stream=stream)
-
- promptToken = []
- if (tokenize):
- tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json()
- promptToken = tokenData["tokens"]
-
- if (not stream):
- data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData))
- print(data.json())
- resData = make_resData(data.json(), chat=False, promptToken=promptToken)
- return jsonify(resData)
- else:
- def generate():
- data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
- time_now = int(time.time())
- for line in data.iter_lines():
- if line:
- decoded_line = line.decode('utf-8')
- resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
- yield 'data: {}\n\n'.format(json.dumps(resData))
- return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
-
-if __name__ == '__main__':
- app.run(args.host, port=args.port)
diff --git a/examples/server/oai.hpp b/examples/server/oai.hpp
deleted file mode 100644
index ff4ad6994..000000000
--- a/examples/server/oai.hpp
+++ /dev/null
@@ -1,225 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "json.hpp"
-#include "utils.hpp"
-
-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
-
-using json = nlohmann::json;
-
-inline static json oaicompat_completion_params_parse(
- const struct llama_model * model,
- const json &body, /* openai api json semantics */
- const std::string &chat_template)
-{
- json llama_params;
-
- llama_params["__oaicompat"] = true;
-
- // Map OpenAI parameters to llama.cpp parameters
- //
- // For parameters that are defined by the OpenAI documentation (e.g.
- // temperature), we explicitly specify OpenAI's intended default; we
- // need to do that because sometimes OpenAI disagrees with llama.cpp
- //
- // https://platform.openai.com/docs/api-reference/chat/create
- llama_sampling_params default_sparams;
- llama_params["model"] = json_value(body, "model", std::string("unknown"));
- llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
- llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
- llama_params["temperature"] = json_value(body, "temperature", 0.0);
- llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
- llama_params["top_p"] = json_value(body, "top_p", 1.0);
- llama_params["n_predict"] = json_value(body, "max_tokens", -1);
- llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
- llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
- llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
- llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
- llama_params["stream"] = json_value(body, "stream", false);
- llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
- llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
- llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
- llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
- llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
- llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
- llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
- llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
-
- if (body.count("grammar") != 0) {
- llama_params["grammar"] = json_value(body, "grammar", json::object());
- }
-
- // Handle 'stop' field
- if (body.contains("stop") && body["stop"].is_string()) {
- llama_params["stop"] = json::array({body["stop"].get()});
- } else {
- llama_params["stop"] = json_value(body, "stop", json::array());
- }
-
- // Ensure there is ChatML-specific end sequence among stop words
- llama_params["stop"].push_back("<|im_end|>");
-
- return llama_params;
-}
-
-inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
-{
- json result = response.result_json;
-
- bool stopped_word = result.count("stopped_word") != 0;
- bool stopped_eos = json_value(result, "stopped_eos", false);
- int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
- int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
- std::string content = json_value(result, "content", std::string(""));
-
- std::string finish_reason = "length";
- if (stopped_word || stopped_eos) {
- finish_reason = "stop";
- }
-
- json choices =
- streaming ? json::array({json{{"finish_reason", finish_reason},
- {"index", 0},
- {"delta", json::object()}}})
- : json::array({json{{"finish_reason", finish_reason},
- {"index", 0},
- {"message", json{{"content", content},
- {"role", "assistant"}}}}});
-
- std::time_t t = std::time(0);
-
- json res =
- json{{"choices", choices},
- {"created", t},
- {"model",
- json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
- {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
- {"usage",
- json{{"completion_tokens", num_tokens_predicted},
- {"prompt_tokens", num_prompt_tokens},
- {"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
- {"id", gen_chatcmplid()}};
-
- if (server_verbose) {
- res["__verbose"] = result;
- }
-
- if (result.contains("completion_probabilities")) {
- res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
- }
-
- return res;
-}
-
-// return value is vector as there is one case where we might need to generate two responses
-inline static std::vector format_partial_response_oaicompat(const task_result &response) {
- json result = response.result_json;
-
- if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
- return std::vector({response.result_json});
- }
-
- bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
- std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-
- bool stopped_word = json_value(result, "stopped_word", false);
- bool stopped_eos = json_value(result, "stopped_eos", false);
- bool stopped_limit = json_value(result, "stopped_limit", false);
- std::string content = json_value(result, "content", std::string(""));
-
- std::string finish_reason;
- if (stopped_word || stopped_eos) {
- finish_reason = "stop";
- }
- if (stopped_limit) {
- finish_reason = "length";
- }
-
- std::time_t t = std::time(0);
-
- json choices;
-
- if (!finish_reason.empty()) {
- choices = json::array({json{{"finish_reason", finish_reason},
- {"index", 0},
- {"delta", json::object()}}});
- } else {
- if (first) {
- if (content.empty()) {
- choices = json::array({json{{"finish_reason", nullptr},
- {"index", 0},
- {"delta", json{{"role", "assistant"}}}}});
- } else {
- // We have to send this as two updates to conform to openai behavior
- json initial_ret = json{{"choices", json::array({json{
- {"finish_reason", nullptr},
- {"index", 0},
- {"delta", json{
- {"role", "assistant"}
- }}}})},
- {"created", t},
- {"id", gen_chatcmplid()},
- {"model", modelname},
- {"object", "chat.completion.chunk"}};
-
- json second_ret = json{
- {"choices", json::array({json{{"finish_reason", nullptr},
- {"index", 0},
- {"delta", json{
- {"content", content}}}
- }})},
- {"created", t},
- {"id", gen_chatcmplid()},
- {"model", modelname},
- {"object", "chat.completion.chunk"}};
-
- return std::vector({initial_ret, second_ret});
- }
- } else {
- // Some idiosyncrasy in task processing logic makes several trailing calls
- // with empty content, we ignore these at the calee site.
- if (content.empty()) {
- return std::vector({json::object()});
- }
-
- choices = json::array({json{
- {"finish_reason", nullptr},
- {"index", 0},
- {"delta",
- json{
- {"content", content},
- }},
- }});
- }
- }
-
- json ret = json{{"choices", choices},
- {"created", t},
- {"id", gen_chatcmplid()},
- {"model", modelname},
- {"object", "chat.completion.chunk"}};
-
- return std::vector({ret});
-}
-
-inline static json format_embeddings_response_oaicompat(const json &request, const json &embeddings)
-{
- json res =
- json{
- {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
- {"object", "list"},
- {"usage",
- json{{"prompt_tokens", 0},
- {"total_tokens", 0}}},
- {"data", embeddings}
- };
- return res;
-}
-
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f9f93ccca..de247f12f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,13 +1,8 @@
+#include "utils.hpp"
+
#include "common.h"
#include "llama.h"
#include "grammar-parser.h"
-#include "utils.hpp"
-#include "oai.hpp"
-
-#include "../llava/clip.h"
-#include "../llava/llava.h"
-
-#include "stb_image.h"
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
@@ -25,135 +20,118 @@
#include "json-schema-to-grammar.mjs.hpp"
#include "error.h"
-#include
-#include
+#include
#include
#include
-#include
+#include
+#include
+#include
+#include
#include
using json = nlohmann::json;
-struct server_params
-{
- std::string hostname = "127.0.0.1";
- std::vector<std::string> api_keys;
- std::string public_path = "examples/server/public";
- std::string chat_template = "";
- int32_t port = 8080;
- int32_t read_timeout = 600;
- int32_t write_timeout = 600;
- bool slots_endpoint = true;
- bool metrics_endpoint = false;
-};
-
bool server_verbose = false;
bool server_log_json = true;
-static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
-{
- size_t i;
- for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
- {
- }
- return i;
-}
-
-enum stop_type
-{
- STOP_FULL,
- STOP_PARTIAL,
+enum stop_type {
+ STOP_TYPE_FULL,
+ STOP_TYPE_PARTIAL,
};
-static bool ends_with(const std::string &str, const std::string &suffix)
-{
- return str.size() >= suffix.size() &&
- 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
-}
+enum slot_state {
+ SLOT_STATE_IDLE,
+ SLOT_STATE_PROCESSING,
+};
-static size_t find_partial_stop_string(const std::string &stop,
- const std::string &text)
-{
- if (!text.empty() && !stop.empty())
- {
- const char text_last_char = text.back();
- for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
- {
- if (stop[char_index] == text_last_char)
- {
- const std::string current_partial = stop.substr(0, char_index + 1);
- if (ends_with(text, current_partial))
- {
- return text.size() - char_index - 1;
- }
- }
- }
- }
- return std::string::npos;
-}
+enum slot_command {
+ SLOT_COMMAND_NONE,
+ SLOT_COMMAND_LOAD_PROMPT,
+ SLOT_COMMAND_RELEASE,
+};
-// TODO: reuse llama_detokenize
-template <class Iter>
-static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
-{
- std::string ret;
- for (; begin != end; ++begin)
- {
- ret += llama_token_to_piece(ctx, *begin);
- }
- return ret;
-}
+enum server_state {
+ SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
+ SERVER_STATE_READY, // Server is ready and model is loaded
+ SERVER_STATE_ERROR // An error occurred, load_model failed
+};
-// format incomplete utf-8 multibyte character for output
-static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
-{
- std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
- // if the size is 1 and first bit is 1, meaning it's a partial character
- // (size > 1 meaning it's already a known token)
- if (out.size() == 1 && (out[0] & 0x80) == 0x80)
- {
- std::stringstream ss;
- ss << std::hex << (out[0] & 0xff);
- std::string res(ss.str());
- out = "byte: \\x" + res;
- }
- return out;
-}
+enum server_task_type {
+ SERVER_TASK_TYPE_COMPLETION,
+ SERVER_TASK_TYPE_CANCEL,
+ SERVER_TASK_TYPE_NEXT_RESPONSE,
+ SERVER_TASK_TYPE_METRICS
+};
-// convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
-{
- json out = json::array();
- for (const auto &prob : probs)
- {
- json probs_for_token = json::array();
- for (const auto &p : prob.probs)
- {
- std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
- probs_for_token.push_back(json
- {
- {"tok_str", tok_str},
- {"prob", p.prob},
- });
- }
- std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
- out.push_back(json{
- {"content", tok_str},
- {"probs", probs_for_token},
- });
- }
- return out;
-}
+struct server_task {
+ int id = -1; // to be filled by server_queue
+ int id_multi = -1;
+ int id_target = -1;
-struct llama_client_slot
-{
+ server_task_type type;
+ json data;
+
+ bool infill = false;
+ bool embedding = false;
+};
+
+struct server_task_result {
+ int id = -1;
+ int id_multi = -1;
+
+ json data;
+
+ bool stop;
+ bool error;
+};
+
+struct server_task_multi {
+ int id = -1;
+
+ std::set<int> subtasks_remaining;
+ std::vector<server_task_result> results;
+};
+
+struct slot_params {
+ bool stream = true;
+ bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+
+ uint32_t seed = -1; // RNG seed
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
+ int32_t n_predict = -1; // new tokens to predict
+
+ std::vector<std::string> antiprompt;
+
+ json input_prefix;
+ json input_suffix;
+};
+
+struct server_params {
+ int32_t port = 8080;
+ int32_t read_timeout = 600;
+ int32_t write_timeout = 600;
+ int32_t n_threads_http = -1;
+
+ std::string hostname = "127.0.0.1";
+ std::string public_path = "examples/server/public";
+ std::string chat_template = "";
+ std::string system_prompt = "";
+
+ std::vector<std::string> api_keys;
+
+ bool slots_endpoint = true;
+ bool metrics_endpoint = false;
+};
+
+struct server_slot {
int id;
- int task_id = -1;
+ int id_task = -1;
+ int id_multi = -1;
struct slot_params params;
- slot_state state = IDLE;
- slot_command command = NONE;
+ slot_state state = SLOT_STATE_IDLE;
+ slot_command command = SLOT_COMMAND_NONE;
// used to determine the slot that has been used the longest
int64_t t_last_used = -1;
@@ -166,31 +144,35 @@ struct llama_client_slot
int32_t i_batch = -1;
int32_t n_predict = -1;
- int32_t num_prompt_tokens = 0;
- int32_t num_prompt_tokens_processed = 0;
+ int32_t n_prompt_tokens = 0;
+ int32_t n_prompt_tokens_processed = 0;
json prompt;
+
+ // when a task is submitted, we first tokenize the prompt and store it here
+ std::vector<llama_token> prompt_tokens;
+
std::string generated_text;
- llama_token sampled;
std::vector<llama_token> cache_tokens;
std::vector<completion_token_output> generated_token_probs;
- bool infill = false;
- bool embedding = false;
+ bool infill = false;
+ bool embedding = false;
bool has_next_token = true;
- bool truncated = false;
- bool stopped_eos = false;
- bool stopped_word = false;
- bool stopped_limit = false;
+ bool truncated = false;
+ bool stopped_eos = false;
+ bool stopped_word = false;
+ bool stopped_limit = false;
bool oaicompat = false;
- std::string oaicompat_model;
+ std::string oaicompat_model;
std::string stopping_word;
// sampling
+ llama_token sampled;
struct llama_sampling_params sparams;
- llama_sampling_context *ctx_sampling = nullptr;
+ llama_sampling_context * ctx_sampling = nullptr;
int32_t ga_i = 0; // group-attention state
int32_t ga_n = 1; // group-attention factor
@@ -198,65 +180,44 @@ struct llama_client_slot
int32_t n_past_se = 0; // self-extend
- // multimodal
- std::vector<slot_image> images;
-
// stats
- size_t sent_count = 0;
- size_t sent_token_probs_index = 0;
+ size_t n_sent_text = 0; // number of sent text characters
+ size_t n_sent_token_probs = 0;
int64_t t_start_process_prompt;
- int64_t t_start_genereration;
+ int64_t t_start_generation;
double t_prompt_processing; // ms
double t_token_generation; // ms
- // multitasks
- int multitask_id = -1;
-
void reset() {
- num_prompt_tokens = 0;
- generated_text = "";
- truncated = false;
- stopped_eos = false;
- stopped_word = false;
- stopped_limit = false;
- stopping_word = "";
- n_past = 0;
- sent_count = 0;
- sent_token_probs_index = 0;
- infill = false;
- ga_i = 0;
- n_past_se = 0;
+ n_prompt_tokens = 0;
+ generated_text = "";
+ truncated = false;
+ stopped_eos = false;
+ stopped_word = false;
+ stopped_limit = false;
+ stopping_word = "";
+ n_past = 0;
+ n_sent_text = 0;
+ n_sent_token_probs = 0;
+ infill = false;
+ ga_i = 0;
+ n_past_se = 0;
generated_token_probs.clear();
-
- for (slot_image & img : images)
- {
- free(img.image_embedding);
- if (img.img_data) {
- clip_image_u8_free(img.img_data);
- }
- img.prefix_prompt = "";
- }
-
- images.clear();
}
bool has_budget(gpt_params &global_params) {
- if (params.n_predict == -1 && global_params.n_predict == -1)
- {
+ if (params.n_predict == -1 && global_params.n_predict == -1) {
return true; // limitless
}
n_remaining = -1;
- if (params.n_predict != -1)
- {
+ if (params.n_predict != -1) {
n_remaining = params.n_predict - n_decoded;
- }
- else if (global_params.n_predict != -1)
- {
+ } else if (global_params.n_predict != -1) {
n_remaining = global_params.n_predict - n_decoded;
}
@@ -264,37 +225,33 @@ struct llama_client_slot
}
bool available() const {
- return state == IDLE && command == NONE;
+ return state == SLOT_STATE_IDLE && command == SLOT_COMMAND_NONE;
}
bool is_processing() const {
- return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING;
+ return (state == SLOT_STATE_IDLE && command == SLOT_COMMAND_LOAD_PROMPT) || state == SLOT_STATE_PROCESSING;
}
- void add_token_string(const completion_token_output &token) {
- if (command == RELEASE)
- {
+ void add_token_string(const completion_token_output & token) {
+ if (command == SLOT_COMMAND_RELEASE) {
return;
}
- cache_tokens.push_back(token.tok);
generated_token_probs.push_back(token);
}
void release() {
- if (state == PROCESSING)
- {
- t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
- command = RELEASE;
+ if (state == SLOT_STATE_PROCESSING) {
+ t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
+ command = SLOT_COMMAND_RELEASE;
}
}
- json get_formated_timings() {
- return json
- {
- {"prompt_n", num_prompt_tokens_processed},
+ json get_formated_timings() const {
+ return json {
+ {"prompt_n", n_prompt_tokens_processed},
{"prompt_ms", t_prompt_processing},
- {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed},
- {"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed},
+ {"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed},
+ {"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed},
{"predicted_n", n_decoded},
{"predicted_ms", t_token_generation},
@@ -303,40 +260,74 @@ struct llama_client_slot
};
}
+ size_t find_stopping_strings(const std::string & text, const size_t last_token_size, const stop_type type) {
+ size_t stop_pos = std::string::npos;
+
+ for (const std::string & word : params.antiprompt) {
+ size_t pos;
+
+ if (type == STOP_TYPE_FULL) {
+ const size_t tmp = word.size() + last_token_size;
+ const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
+
+ pos = text.find(word, from_pos);
+ } else {
+ pos = find_partial_stop_string(word, text);
+ }
+
+ if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
+ if (type == STOP_TYPE_FULL) {
+ stopped_word = true;
+ stopping_word = word;
+ has_next_token = false;
+ }
+ stop_pos = pos;
+ }
+ }
+
+ return stop_pos;
+ }
+
void print_timings() const {
- char buffer[512];
- double t_token = t_prompt_processing / num_prompt_tokens_processed;
- double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
- sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
- t_prompt_processing, num_prompt_tokens_processed,
+ char buffer[512];
+
+ double t_token = t_prompt_processing / n_prompt_tokens_processed;
+ double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
+
+ snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+ t_prompt_processing, n_prompt_tokens_processed,
t_token, n_tokens_second);
+
LOG_INFO(buffer, {
- {"slot_id", id},
- {"task_id", task_id},
- {"t_prompt_processing", t_prompt_processing},
- {"num_prompt_tokens_processed", num_prompt_tokens_processed},
- {"t_token", t_token},
- {"n_tokens_second", n_tokens_second},
+ {"id_slot", id},
+ {"id_task", id_task},
+ {"t_prompt_processing", t_prompt_processing},
+ {"n_prompt_tokens_processed", n_prompt_tokens_processed},
+ {"t_token", t_token},
+ {"n_tokens_second", n_tokens_second},
});
t_token = t_token_generation / n_decoded;
n_tokens_second = 1e3 / t_token_generation * n_decoded;
- sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
+
+ snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
t_token_generation, n_decoded,
t_token, n_tokens_second);
+
LOG_INFO(buffer, {
- {"slot_id", id},
- {"task_id", task_id},
+ {"id_slot", id},
+ {"id_task", id_task},
{"t_token_generation", t_token_generation},
{"n_decoded", n_decoded},
{"t_token", t_token},
{"n_tokens_second", n_tokens_second},
});
- sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
+ snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
+
LOG_INFO(buffer, {
- {"slot_id", id},
- {"task_id", task_id},
+ {"id_slot", id},
+ {"id_task", id_task},
{"t_prompt_processing", t_prompt_processing},
{"t_token_generation", t_token_generation},
{"t_total", t_prompt_processing + t_token_generation},
@@ -344,29 +335,32 @@ struct llama_client_slot
}
};
-struct llama_metrics {
+struct server_metrics {
+ const int64_t t_start = ggml_time_us();
+
uint64_t n_prompt_tokens_processed_total = 0;
+ uint64_t t_prompt_processing_total = 0;
uint64_t n_tokens_predicted_total = 0;
+ uint64_t t_tokens_generation_total = 0;
uint64_t n_prompt_tokens_processed = 0;
uint64_t t_prompt_processing = 0;
- uint64_t n_tokens_predicted = 0;
- uint64_t t_tokens_generation = 0;
+ uint64_t n_tokens_predicted = 0;
+ uint64_t t_tokens_generation = 0;
-
- void on_prompt_eval(const llama_client_slot &slot) {
- n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
-
- n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
- t_prompt_processing += slot.t_prompt_processing;
+ void on_prompt_eval(const server_slot &slot) {
+ n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
+ n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
+ t_prompt_processing += slot.t_prompt_processing;
+ t_prompt_processing_total += slot.t_prompt_processing;
}
- void on_prediction(const llama_client_slot &slot) {
- n_tokens_predicted_total += slot.n_decoded;
-
- n_tokens_predicted += slot.n_decoded;
- t_tokens_generation += slot.t_token_generation;
+ void on_prediction(const server_slot &slot) {
+ n_tokens_predicted_total += slot.n_decoded;
+ n_tokens_predicted += slot.n_decoded;
+ t_tokens_generation += slot.t_token_generation;
+ t_tokens_generation_total += slot.t_token_generation;
}
void reset_bucket() {
@@ -377,23 +371,261 @@ struct llama_metrics {
}
};
-struct llama_server_context
-{
- llama_model *model = nullptr;
- llama_context *ctx = nullptr;
+struct server_queue {
+ int id = 0;
+ bool running;
- clip_ctx *clp_ctx = nullptr;
+ // queues
+ std::vector<server_task> queue_tasks;
+ std::vector<server_task> queue_tasks_deferred;
+
+ std::vector<server_task_multi> queue_multitasks;
+
+ std::mutex mutex_tasks;
+ std::condition_variable condition_tasks;
+
+ // callback functions
+ std::function<void(server_task &)> callback_new_task;
+ std::function<void(server_task_multi &)> callback_finish_multitask;
+ std::function<void(void)> callback_run_slots;
+
+ // Add a new task to the end of the queue
+ int post(server_task task) {
+ std::unique_lock lock(mutex_tasks);
+ if (task.id == -1) {
+ task.id = id++;
+ LOG_VERBOSE("new task id", {{"new_id", task.id}});
+ }
+ queue_tasks.push_back(std::move(task));
+ condition_tasks.notify_one();
+ return task.id;
+ }
+
+ // Add a new task, but defer until one slot is available
+ void defer(server_task task) {
+ std::unique_lock lock(mutex_tasks);
+ queue_tasks_deferred.push_back(std::move(task));
+ }
+
+ // Get the next id for creating a new task
+ int get_new_id() {
+ std::unique_lock lock(mutex_tasks);
+ int new_id = id++;
+ LOG_VERBOSE("new task id", {{"new_id", new_id}});
+ return new_id;
+ }
+
+ // Register function to process a new task
+ void on_new_task(std::function<void(server_task &)> callback) {
+ callback_new_task = std::move(callback);
+ }
+
+ // Register function to process a multitask when it is finished
+ void on_finish_multitask(std::function<void(server_task_multi &)> callback) {
+ callback_finish_multitask = std::move(callback);
+ }
+
+ // Register the function to be called when all slots data is ready to be processed
+ void on_run_slots(std::function<void(void)> callback) {
+ callback_run_slots = std::move(callback);
+ }
+
+ // Call when the state of one slot is changed
+ void notify_slot_changed() {
+ // move deferred tasks back to main loop
+ std::unique_lock lock(mutex_tasks);
+ for (auto & task : queue_tasks_deferred) {
+ queue_tasks.push_back(std::move(task));
+ }
+ queue_tasks_deferred.clear();
+ }
+
+ // end the start_loop routine
+ void terminate() {
+ std::unique_lock lock(mutex_tasks);
+ running = false;
+ condition_tasks.notify_all();
+ }
+
+ /**
+ * Main loop consists of these steps:
+ * - Wait until a new task arrives
+ * - Process the task (i.e. maybe copy data into slot)
+ * - Check if multitask is finished
+ * - Run all slots
+ */
+ void start_loop() {
+ running = true;
+
+ while (true) {
+ LOG_VERBOSE("new task may arrive", {});
+
+ while (true) {
+ std::unique_lock lock(mutex_tasks);
+ if (queue_tasks.empty()) {
+ lock.unlock();
+ break;
+ }
+ server_task task = queue_tasks.front();
+ queue_tasks.erase(queue_tasks.begin());
+ lock.unlock();
+ LOG_VERBOSE("callback_new_task", {{"id_task", task.id}});
+ callback_new_task(task);
+ }
+
+ LOG_VERBOSE("update_multitasks", {});
+
+ // check if we have any finished multitasks
+ auto queue_iterator = queue_multitasks.begin();
+ while (queue_iterator != queue_multitasks.end()) {
+ if (queue_iterator->subtasks_remaining.empty()) {
+ // all subtasks done == multitask is done
+ server_task_multi current_multitask = *queue_iterator;
+ callback_finish_multitask(current_multitask);
+ // remove this multitask
+ queue_iterator = queue_multitasks.erase(queue_iterator);
+ } else {
+ ++queue_iterator;
+ }
+ }
+
+ // all tasks in the current loop are processed, slots data is now ready
+ LOG_VERBOSE("callback_run_slots", {});
+
+ callback_run_slots();
+
+ LOG_VERBOSE("wait for new task", {});
+ {
+ std::unique_lock lock(mutex_tasks);
+ if (queue_tasks.empty()) {
+ if (!running) {
+ LOG_VERBOSE("ending start_loop", {});
+ return;
+ }
+ condition_tasks.wait(lock, [&]{
+ return (!queue_tasks.empty() || !running);
+ });
+ }
+ }
+ }
+ }
+
+ //
+ // functions to manage multitasks
+ //
+
+ // add a multitask by specifying the id of all subtask (subtask is a server_task)
+ void add_multitask(int id_multi, std::vector<int> & sub_ids) {
+ std::lock_guard lock(mutex_tasks);
+ server_task_multi multi;
+ multi.id = id_multi;
+ std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
+ queue_multitasks.push_back(multi);
+ }
+
+ // update the remaining subtasks, while appending results to multitask
+ void update_multitask(int id_multi, int id_sub, server_task_result & result) {
+ std::lock_guard lock(mutex_tasks);
+ for (auto & multitask : queue_multitasks) {
+ if (multitask.id == id_multi) {
+ multitask.subtasks_remaining.erase(id_sub);
+ multitask.results.push_back(result);
+ }
+ }
+ }
+};
+
+struct server_response {
+ typedef std::function<void(int, int, server_task_result &)> callback_multitask_t;
+ callback_multitask_t callback_update_multitask;
+
+ // for keeping track of all tasks waiting for the result
+ std::set<int> waiting_task_ids;
+
+ // the main result queue
+ std::vector<server_task_result> queue_results;
+
+ std::mutex mutex_results;
+ std::condition_variable condition_results;
+
+ // add the id_task to the list of tasks waiting for response
+ void add_waiting_task_id(int id_task) {
+ LOG_VERBOSE("waiting for task id", {{"id_task", id_task}});
+
+ std::unique_lock lock(mutex_results);
+ waiting_task_ids.insert(id_task);
+ }
+
+ // when the request is finished, we can remove the task associated with it
+ void remove_waiting_task_id(int id_task) {
+ LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}});
+
+ std::unique_lock lock(mutex_results);
+ waiting_task_ids.erase(id_task);
+ }
+
+ // This function blocks the thread until there is a response for this id_task
+ server_task_result recv(int id_task) {
+ while (true) {
+ std::unique_lock lock(mutex_results);
+ condition_results.wait(lock, [&]{
+ return !queue_results.empty();
+ });
+
+ for (int i = 0; i < (int) queue_results.size(); i++) {
+ if (queue_results[i].id == id_task) {
+ assert(queue_results[i].id_multi == -1);
+ server_task_result res = queue_results[i];
+ queue_results.erase(queue_results.begin() + i);
+ return res;
+ }
+ }
+ }
+
+ // should never reach here
+ }
+
+ // Register the function to update multitask
+ void on_multitask_update(callback_multitask_t callback) {
+ callback_update_multitask = std::move(callback);
+ }
+
+ // Send a new result to a waiting id_task
+ void send(server_task_result result) {
+ LOG_VERBOSE("send new result", {{"id_task", result.id}});
+
+ std::unique_lock lock(mutex_results);
+ for (const auto & id_task : waiting_task_ids) {
+ // LOG_TEE("waiting task id %i \n", id_task);
+ // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
+ if (result.id_multi == id_task) {
+ LOG_VERBOSE("callback_update_multitask", {{"id_task", id_task}});
+ callback_update_multitask(id_task, result.id, result);
+ continue;
+ }
+
+ if (result.id == id_task) {
+ LOG_VERBOSE("queue_results.push_back", {{"id_task", id_task}});
+ queue_results.push_back(result);
+ condition_results.notify_all();
+ return;
+ }
+ }
+ }
+};
+
+struct server_context {
+ llama_model * model = nullptr;
+ llama_context * ctx = nullptr;
gpt_params params;
llama_batch batch;
- bool multimodal = false;
- bool clean_kv_cache = true;
- bool all_slots_are_idle = false;
- bool add_bos_token = true;
+ bool clean_kv_cache = true;
+ bool add_bos_token = true;
- int32_t n_ctx; // total context for all clients / slots
+ int32_t n_ctx; // total context for all clients / slots
// system prompt
bool system_need_update = false;
@@ -405,63 +637,35 @@ struct llama_server_context
std::string name_assistant;
// slots / clients
- std::vector<llama_client_slot> slots;
+ std::vector<server_slot> slots;
json default_generation_settings_for_props;
- llama_server_queue queue_tasks;
- llama_server_response queue_results;
+ server_queue queue_tasks;
+ server_response queue_results;
- llama_metrics metrics;
+ server_metrics metrics;
- ~llama_server_context()
- {
- if (ctx)
- {
+ ~server_context() {
+ if (ctx) {
llama_free(ctx);
ctx = nullptr;
}
- if (model)
- {
+
+ if (model) {
llama_free_model(model);
model = nullptr;
}
}
- bool load_model(const gpt_params ¶ms_)
- {
+ bool load_model(const gpt_params & params_) {
params = params_;
- if (!params.mmproj.empty()) {
- multimodal = true;
- LOG_INFO("Multi Modal Mode Enabled", {});
- clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
- if(clp_ctx == nullptr) {
- LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
- return false;
- }
-
- if (params.n_ctx < 2048) { // request larger context for the image embedding
- params.n_ctx = 2048;
- }
- }
std::tie(model, ctx) = llama_init_from_gpt_params(params);
- if (model == nullptr)
- {
+ if (model == nullptr) {
LOG_ERROR("unable to load model", {{"model", params.model}});
return false;
}
- if (multimodal) {
- const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
- const int n_embd_llm = llama_n_embd(model);
- if (n_embd_clip != n_embd_llm) {
- LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
- llama_free(ctx);
- llama_free_model(model);
- return false;
- }
- }
-
n_ctx = llama_n_ctx(ctx);
add_bos_token = llama_should_add_bos_token(model);
@@ -469,33 +673,27 @@ struct llama_server_context
return true;
}
- void validate_model_chat_template(server_params & sparams) {
+ bool validate_model_chat_template() const {
llama_chat_message chat[] = {{"user", "test"}};
- std::vector<char> buf(1);
- int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
- if (res < 0) {
- LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
- sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
- }
+
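+ // probe the built-in template with a dummy message; a negative return value means it is not supported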
+ const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
+
+ return res > 0;
}
void initialize() {
- // create slots
- all_slots_are_idle = true;
-
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
- for (int i = 0; i < params.n_parallel; i++)
- {
- llama_client_slot slot;
+ for (int i = 0; i < params.n_parallel; i++) {
+ server_slot slot;
slot.id = i;
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;
LOG_INFO("new slot", {
- {"slot_id", slot.id},
+ {"id_slot", slot.id},
{"n_ctx_slot", slot.n_ctx}
});
@@ -503,15 +701,15 @@ struct llama_server_context
const int ga_w = params.grp_attn_w;
if (ga_n != 1) {
- GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
- GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
+ GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
+ GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
LOG_INFO("slot self-extend", {
- {"slot_id", slot.id},
- {"ga_n", ga_n},
- {"ga_w", ga_w}
+ {"id_slot", slot.id},
+ {"ga_n", ga_n},
+ {"ga_w", ga_w}
});
}
@@ -530,8 +728,7 @@ struct llama_server_context
batch = llama_batch_init(n_ctx, 0, params.n_parallel);
}
- std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
- {
+ std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
// TODO: currently, we tokenize using special tokens by default
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
// but it's better compared to completely ignoring ChatML and other chat templates
@@ -541,38 +738,30 @@ struct llama_server_context
// or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens;
- if (json_prompt.is_array())
- {
+ if (json_prompt.is_array()) {
bool first = true;
- for (const auto& p : json_prompt)
- {
- if (p.is_string())
- {
+ for (const auto & p : json_prompt) {
+ if (p.is_string()) {
auto s = p.template get<std::string>();
+
std::vector<llama_token> p;
- if (first)
- {
+ if (first) {
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
first = false;
- }
- else
- {
+ } else {
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
}
+
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
- }
- else
- {
- if (first)
- {
+ } else {
+ if (first) {
first = false;
}
+
prompt_tokens.push_back(p.template get<llama_token>());
}
}
- }
- else
- {
+ } else {
auto s = json_prompt.template get<std::string>();
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
}
@@ -580,19 +769,18 @@ struct llama_server_context
return prompt_tokens;
}
- llama_client_slot* get_slot(int id) {
+ server_slot * get_slot(int id) {
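+ // prefer the slot with the requested id; otherwise fall back to the least recently used available slot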
int64_t t_last = ggml_time_us();
- llama_client_slot *last_used = nullptr;
- for (llama_client_slot & slot : slots)
- {
- if (slot.id == id && slot.available())
- {
+ server_slot * last_used = nullptr;
+
+ for (server_slot & slot : slots) {
+ if (slot.id == id && slot.available()) {
return &slot;
}
- if (slot.available() && slot.t_last_used < t_last)
- {
+ // among all available slots, find the one that has been least recently used
+ if (slot.available() && slot.t_last_used < t_last) {
last_used = &slot;
t_last = slot.t_last_used;
}
@@ -601,304 +789,204 @@ struct llama_server_context
return last_used;
}
- bool launch_slot_with_data(llama_client_slot* &slot, json data) {
+ bool launch_slot_with_data(server_slot & slot, json data) const {
slot_params default_params;
llama_sampling_params default_sparams;
if (data.count("__oaicompat") != 0) {
- slot->oaicompat = true;
- slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+ slot.oaicompat = true;
+ slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
} else {
- slot->oaicompat = false;
- slot->oaicompat_model = "";
+ slot.oaicompat = false;
+ slot.oaicompat_model = "";
}
- slot->params.stream = json_value(data, "stream", false);
- slot->params.cache_prompt = json_value(data, "cache_prompt", false);
- slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
- slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
- slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
- slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
- slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
- slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
- slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
- slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
- slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
- slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
- slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
- slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
- slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
- slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
- slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
- slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
- slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
- slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
- slot->params.seed = json_value(data, "seed", default_params.seed);
- slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
- slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
- slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+ slot.params.stream = json_value(data, "stream", false);
+ slot.params.cache_prompt = json_value(data, "cache_prompt", false);
+ slot.params.n_predict = json_value(data, "n_predict", default_params.n_predict);
+ slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
+ slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
+ slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
+ slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
+ slot.sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
+ slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
+ slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
+ slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
+ slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
+ slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
+ slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
+ slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
+ slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
+ slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
+ slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
+ slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
+ slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
+ slot.params.seed = json_value(data, "seed", default_params.seed);
+ slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
+ slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
+ slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
- if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+ if (slot.params.cache_prompt && slot.ga_n != 1) {
+ LOG_WARNING("cache_prompt is not supported with group-attention", {});
+ slot.params.cache_prompt = false;
+ }
+
+ if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
// Might be better to reject the request with a 400 ?
LOG_WARNING("Max tokens to predict exceeds server configuration", {
- {"params.n_predict", slot->params.n_predict},
- {"slot.n_predict", slot->n_predict},
+ {"params.n_predict", slot.params.n_predict},
+ {"slot.n_predict", slot.n_predict},
});
- slot->params.n_predict = slot->n_predict;
+ slot.params.n_predict = slot.n_predict;
}
// infill
- if (data.count("input_prefix") != 0)
- {
- slot->params.input_prefix = data["input_prefix"];
- }
- else
- {
- slot->params.input_prefix = "";
- }
+ slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix);
+ slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
+ slot.prompt = json_value(data, "prompt", std::string(""));
- if (data.count("input_suffix") != 0)
+ // penalize user-provided tokens
{
- slot->params.input_suffix = data["input_suffix"];
- }
- else
- {
- slot->params.input_suffix = "";
- }
+ slot.sparams.penalty_prompt_tokens.clear();
+ slot.sparams.use_penalty_prompt_tokens = false;
- if (data.count("prompt") != 0)
- {
- slot->prompt = data["prompt"];
- }
- else
- {
- slot->prompt = "";
- }
+ const auto & penalty_prompt = data.find("penalty_prompt");
- if (
- (slot->prompt.is_string() && slot->prompt.get<std::string>().empty())
- ||
- (slot->prompt.is_array() && slot->prompt.empty())
- )
- {
- throw llama_error("prompt.empty", "The prompt must not be empty");
- }
+ if (penalty_prompt != data.end()) {
+ if (penalty_prompt->is_string()) {
+ const auto penalty_prompt_string = penalty_prompt->get<std::string>();
+ slot.sparams.penalty_prompt_tokens = llama_tokenize(model, penalty_prompt_string, false);
- slot->sparams.penalty_prompt_tokens.clear();
- slot->sparams.use_penalty_prompt_tokens = false;
- const auto &penalty_prompt = data.find("penalty_prompt");
- if (penalty_prompt != data.end())
- {
- if (penalty_prompt->is_string())
- {
- const auto penalty_prompt_string = penalty_prompt->get<std::string>();
- auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
- slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
- if (slot->params.n_predict > 0)
- {
- slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
- }
- slot->sparams.use_penalty_prompt_tokens = true;
- }
- else if (penalty_prompt->is_array())
- {
- const auto n_tokens = penalty_prompt->size();
- slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
- const int n_vocab = llama_n_vocab(model);
- for (const auto &penalty_token : *penalty_prompt)
- {
- if (penalty_token.is_number_integer())
- {
- const auto tok = penalty_token.get<llama_token>();
- if (tok >= 0 && tok < n_vocab)
- {
- slot->sparams.penalty_prompt_tokens.push_back(tok);
- }
+ if (slot.params.n_predict > 0) {
+ slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict);
}
- }
- slot->sparams.use_penalty_prompt_tokens = true;
- }
- }
+ slot.sparams.use_penalty_prompt_tokens = true;
- slot->sparams.logit_bias.clear();
-
- if (json_value(data, "ignore_eos", false))
- {
- slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
- }
-
- const auto &logit_bias = data.find("logit_bias");
- if (logit_bias != data.end() && logit_bias->is_array())
- {
- const int n_vocab = llama_n_vocab(model);
- for (const auto &el : *logit_bias)
- {
- if (el.is_array() && el.size() == 2)
- {
- float bias;
- if (el[1].is_number())
- {
- bias = el[1].get<float>();
- }
- else if (el[1].is_boolean() && !el[1].get<bool>())
- {
- bias = -INFINITY;
- }
- else
- {
- continue;
- }
-
- if (el[0].is_number_integer())
- {
- llama_token tok = el[0].get<llama_token>();
- if (tok >= 0 && tok < n_vocab)
- {
- slot->sparams.logit_bias[tok] = bias;
- }
- }
- else if (el[0].is_string())
- {
- auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
- for (auto tok : toks)
- {
- slot->sparams.logit_bias[tok] = bias;
- }
- }
- }
- }
- }
-
- slot->params.antiprompt.clear();
-
- const auto &stop = data.find("stop");
- if (stop != data.end() && stop->is_array())
- {
- for (const auto &word : *stop)
- {
- if (!word.empty())
- {
- slot->params.antiprompt.push_back(word);
- }
- }
- }
-
- const auto &samplers_sequence = data.find("samplers");
- if (samplers_sequence != data.end() && samplers_sequence->is_array())
- {
- std::vector<std::string> sampler_names;
- for (const auto &sampler_name : *samplers_sequence)
- {
- if (sampler_name.is_string())
- {
- sampler_names.emplace_back(sampler_name);
- }
- }
- slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
- }
- else
- {
- slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
- }
-
- if (multimodal)
- {
- const auto &images_data = data.find("image_data");
- if (images_data != data.end() && images_data->is_array())
- {
- for (const auto &img : *images_data)
- {
- const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
-
- slot_image img_sl;
- img_sl.id = img.count("id") != 0 ? img["id"].get() : slot->images.size();
- img_sl.img_data = clip_image_u8_init();
- if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
- {
- LOG_ERROR("failed to load image", {
- {"slot_id", slot->id},
- {"img_sl_id", img_sl.id}
- });
- return false;
- }
- LOG_VERBOSE("image loaded", {
- {"slot_id", slot->id},
- {"img_sl_id", img_sl.id}
+ LOG_VERBOSE("penalty_prompt_tokens", {
+ {"id_slot", slot.id},
+ {"tokens", slot.sparams.penalty_prompt_tokens},
});
- img_sl.request_encode_image = true;
- slot->images.push_back(img_sl);
}
- // process prompt
- // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
- if (slot->images.size() > 0 && !slot->prompt.is_array())
- {
- std::string prompt = slot->prompt.get<std::string>();
- size_t pos = 0, begin_prefix = 0;
- std::string pattern = "[img-";
- while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
- size_t end_prefix = pos;
- pos += pattern.length();
- size_t end_pos = prompt.find(']', pos);
- if (end_pos != std::string::npos)
- {
- std::string image_id = prompt.substr(pos, end_pos - pos);
- try
- {
- int img_id = std::stoi(image_id);
- bool found = false;
- for (slot_image &img : slot->images)
- {
- if (img.id == img_id) {
- found = true;
- img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
- begin_prefix = end_pos + 1;
- break;
- }
- }
- if (!found) {
- LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
- slot->images.clear();
- return false;
- }
- } catch (const std::invalid_argument& e) {
- LOG_TEE("Invalid image number id in prompt\n");
- slot->images.clear();
- return false;
+ else if (penalty_prompt->is_array()) {
+ const auto n_tokens = penalty_prompt->size();
+ slot.sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot.params.n_predict));
+
+ const int n_vocab = llama_n_vocab(model);
+ for (const auto & penalty_token : *penalty_prompt) {
+ if (penalty_token.is_number_integer()) {
+ const auto tok = penalty_token.get<llama_token>();
+ if (tok >= 0 && tok < n_vocab) {
+ slot.sparams.penalty_prompt_tokens.push_back(tok);
}
}
}
- slot->prompt = "";
- slot->params.input_suffix = prompt.substr(begin_prefix);
- slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
+ slot.sparams.use_penalty_prompt_tokens = true;
+
+ LOG_VERBOSE("penalty_prompt_tokens", {
+ {"id_slot", slot.id},
+ {"tokens", slot.sparams.penalty_prompt_tokens},
+ });
}
}
}
- if (slot->ctx_sampling != nullptr)
{
- llama_sampling_free(slot->ctx_sampling);
- }
- slot->ctx_sampling = llama_sampling_init(slot->sparams);
- llama_set_rng_seed(ctx, slot->params.seed);
- slot->command = LOAD_PROMPT;
+ slot.sparams.logit_bias.clear();
- all_slots_are_idle = false;
+ if (json_value(data, "ignore_eos", false)) {
+ slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+ }
+
+ const auto & logit_bias = data.find("logit_bias");
+ if (logit_bias != data.end() && logit_bias->is_array()) {
+ const int n_vocab = llama_n_vocab(model);
+ for (const auto & el : *logit_bias) {
+ if (el.is_array() && el.size() == 2) {
+ float bias;
+ if (el[1].is_number()) {
+ bias = el[1].get<float>();
+ } else if (el[1].is_boolean() && !el[1].get<bool>()) {
+ bias = -INFINITY;
+ } else {
+ continue;
+ }
+
+ if (el[0].is_number_integer()) {
+ llama_token tok = el[0].get<llama_token>();
+ if (tok >= 0 && tok < n_vocab) {
+ slot.sparams.logit_bias[tok] = bias;
+ }
+ } else if (el[0].is_string()) {
+ auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+ for (auto tok : toks) {
+ slot.sparams.logit_bias[tok] = bias;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ {
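+ // collect the stop words (antiprompt) requested for this completion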
+ slot.params.antiprompt.clear();
+
+ const auto & stop = data.find("stop");
+ if (stop != data.end() && stop->is_array()) {
+ for (const auto & word : *stop) {
+ if (!word.empty()) {
+ slot.params.antiprompt.push_back(word);
+ }
+ }
+ }
+ }
+
+ {
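+ // parse the requested sampler sequence by name, falling back to the default order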
+ const auto & samplers_sequence = data.find("samplers");
+ if (samplers_sequence != data.end() && samplers_sequence->is_array()) {
+ std::vector<std::string> sampler_names;
+ for (const auto & sampler_name : *samplers_sequence) {
+ if (sampler_name.is_string()) {
+ sampler_names.emplace_back(sampler_name);
+ }
+ }
+ slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+ } else {
+ slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
+ }
+ }
+
+ {
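+ // (re)create the sampling context with the per-request parameters and seed the RNG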
+ if (slot.ctx_sampling != nullptr) {
+ llama_sampling_free(slot.ctx_sampling);
+ }
+ slot.ctx_sampling = llama_sampling_init(slot.sparams);
+ llama_set_rng_seed(ctx, slot.params.seed);
+ }
+
+ slot.command = SLOT_COMMAND_LOAD_PROMPT;
+ slot.prompt_tokens.clear();
LOG_INFO("slot is processing task", {
- {"slot_id", slot->id},
- {"task_id", slot->task_id},
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
});
return true;
}
void kv_cache_clear() {
+ LOG_VERBOSE("clearing KV cache", {});
+
// clear the entire KV cache
llama_kv_cache_clear(ctx);
clean_kv_cache = false;
}
- void update_system_prompt() {
+ void system_prompt_update() {
+ LOG_VERBOSE("system prompt update", {
+ {"system_prompt", system_prompt},
+ });
+
kv_cache_clear();
system_tokens.clear();
@@ -907,13 +995,11 @@ struct llama_server_context
llama_batch_clear(batch);
- for (int i = 0; i < (int)system_tokens.size(); ++i)
- {
+ for (int i = 0; i < (int)system_tokens.size(); ++i) {
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
}
- for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
- {
+ for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch) {
const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
@@ -925,78 +1011,42 @@ struct llama_server_context
batch.logits + i,
0, 0, 0, // unused
};
- if (llama_decode(ctx, batch_view) != 0)
- {
+
+ if (llama_decode(ctx, batch_view) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return;
}
}
// assign the system KV cache to all parallel sequences
- for (int32_t i = 1; i < params.n_parallel; ++i)
- {
+ for (int32_t i = 1; i < params.n_parallel; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
}
}
- LOG_TEE("system prompt updated\n");
system_need_update = false;
}
- void notify_system_prompt_changed() {
+ void system_prompt_set(const json & sys_props) {
+ system_prompt = sys_props.value("prompt", "");
+ name_user = sys_props.value("anti_prompt", "");
+ name_assistant = sys_props.value("assistant_name", "");
+
+ LOG_VERBOSE("system prompt process", {
+ {"system_prompt", system_prompt},
+ {"name_user", name_user},
+ {"name_assistant", name_assistant},
+ });
+
// release all slots
- for (llama_client_slot &slot : slots)
- {
+ for (server_slot & slot : slots) {
slot.release();
}
system_need_update = true;
}
- void process_system_prompt_data(const json &sys_props) {
- system_prompt = sys_props.value("prompt", "");
- name_user = sys_props.value("anti_prompt", "");
- name_assistant = sys_props.value("assistant_name", "");
-
-
- notify_system_prompt_changed();
- }
-
- static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
- const stop_type type, llama_client_slot &slot)
- {
- size_t stop_pos = std::string::npos;
-
- for (const std::string &word : slot.params.antiprompt)
- {
- size_t pos;
- if (type == STOP_FULL)
- {
- const size_t tmp = word.size() + last_token_size;
- const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
- pos = text.find(word, from_pos);
- }
- else
- {
- pos = find_partial_stop_string(word, text);
- }
- if (pos != std::string::npos &&
- (stop_pos == std::string::npos || pos < stop_pos))
- {
- if (type == STOP_FULL)
- {
- slot.stopped_word = true;
- slot.stopping_word = word;
- slot.has_next_token = false;
- }
- stop_pos = pos;
- }
- }
-
- return stop_pos;
- }
-
- bool process_token(completion_token_output &result, llama_client_slot &slot) {
+ bool process_token(completion_token_output & result, server_slot & slot) {
// remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = llama_token_to_piece(ctx, result.tok);
slot.sampled = result.tok;
@@ -1005,34 +1055,26 @@ struct llama_server_context
slot.generated_text += token_str;
slot.has_next_token = true;
- if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
- {
+ if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) {
// we can change penalty_prompt_tokens because it is always created from scratch each request
slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
}
// check if there is incomplete UTF-8 character at the end
bool incomplete = false;
- for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
- {
+ for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) {
unsigned char c = slot.generated_text[slot.generated_text.size() - i];
- if ((c & 0xC0) == 0x80)
- {
+ if ((c & 0xC0) == 0x80) {
// continuation byte: 10xxxxxx
continue;
}
- if ((c & 0xE0) == 0xC0)
- {
+ if ((c & 0xE0) == 0xC0) {
// 2-byte character: 110xxxxx ...
incomplete = i < 2;
- }
- else if ((c & 0xF0) == 0xE0)
- {
+ } else if ((c & 0xF0) == 0xE0) {
// 3-byte character: 1110xxxx ...
incomplete = i < 3;
- }
- else if ((c & 0xF8) == 0xF0)
- {
+ } else if ((c & 0xF8) == 0xF0) {
// 4-byte character: 11110xxx ...
incomplete = i < 4;
}
@@ -1040,232 +1082,185 @@ struct llama_server_context
break;
}
- if (!incomplete)
- {
- size_t pos = std::min(slot.sent_count, slot.generated_text.size());
+ if (!incomplete) {
+ size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
+
const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false;
- size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
- if (stop_pos != std::string::npos)
- {
+
+ size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
+ if (stop_pos != std::string::npos) {
is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
- pos = std::min(slot.sent_count, slot.generated_text.size());
- }
- else
- {
+ pos = std::min(slot.n_sent_text, slot.generated_text.size());
+ } else {
is_stop_full = false;
- stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+ stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
}
// check if there is any token to predict
- if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
- {
+ if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
// do not send the stop word in the response
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
- slot.sent_count += result.text_to_send.size();
+ slot.n_sent_text += result.text_to_send.size();
// add the token to slot queue and cache
}
+
slot.add_token_string(result);
- if (slot.params.stream)
- {
+ if (slot.params.stream) {
send_partial_response(slot, result);
}
}
- if (incomplete)
- {
+ if (incomplete) {
slot.has_next_token = true;
}
// check the limits
- if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params))
- {
- slot.stopped_limit = true;
+ if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) {
+ slot.stopped_limit = true;
slot.has_next_token = false;
+
+ LOG_VERBOSE("stopped by limit", {
+ {"id_slot", slot.id},
+ {"n_decoded", slot.n_decoded},
+ {"n_predict", slot.params.n_predict},
+ });
}
- if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
- {
- slot.stopped_eos = true;
+ if (result.tok == llama_token_eos(model)) {
+ slot.stopped_eos = true;
slot.has_next_token = false;
+
LOG_VERBOSE("eos token found", {});
}
LOG_VERBOSE("next token", {
- {"token", result.tok},
- {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
- {"has_next_token", slot.has_next_token},
- {"n_remain", slot.n_remaining},
- {"num_tokens_predicted", slot.n_decoded},
- {"stopped_eos", slot.stopped_eos},
- {"stopped_word", slot.stopped_word},
- {"stopped_limit", slot.stopped_limit},
- {"stopping_word", slot.stopping_word},
- });
+ {"token", result.tok},
+ {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
+ {"has_next_token", slot.has_next_token},
+ {"n_remain", slot.n_remaining},
+ {"n_decoded", slot.n_decoded},
+ {"stopped_eos", slot.stopped_eos},
+ {"stopped_word", slot.stopped_word},
+ {"stopped_limit", slot.stopped_limit},
+ {"stopping_word", slot.stopping_word},
+ });
return slot.has_next_token; // continue
}
- bool process_images(llama_client_slot &slot) const
- {
- for (slot_image &img : slot.images)
- {
- if (!img.request_encode_image)
- {
- continue;
- }
-
- if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
- LOG_TEE("Error processing the given image");
- return false;
- }
-
-
- img.request_encode_image = false;
- }
-
- return slot.images.size() > 0;
- }
-
- void send_error(task_server& task, const std::string &error)
- {
- LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
- task_result res;
- res.id = task.id;
- res.multitask_id = task.multitask_id;
- res.stop = false;
- res.error = true;
- res.result_json = { { "content", error } };
- queue_results.send(res);
- }
-
- static json error_to_json(const llama_error& error)
- {
- return {
- { "error", {
- { "id", error.id() },
- { "description", error.description() }
- } }
- };
- }
-
- void send_error(task_server& task, const llama_error& error)
- {
- LOG_TEE("task %i - error: %s - %s\n", task.id, error.id().c_str(), error.description().c_str());
- task_result res;
- res.id = task.id;
- res.multitask_id = task.multitask_id;
- res.stop = false;
- res.error = true;
- res.result_json = { { "content", error_to_json(error).dump() } };
- queue_results.send(res);
- }
-
- json get_formated_generation(llama_client_slot &slot)
- {
+ json get_formated_generation(const server_slot & slot) const {
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
- const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
- eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+ const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+
std::vector<std::string> samplers_sequence;
- for (const auto &sampler_type : slot.sparams.samplers_sequence)
- {
+ samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
+ for (const auto & sampler_type : slot.sparams.samplers_sequence) {
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
}
return json {
- {"n_ctx", slot.n_ctx},
- {"n_predict", slot.n_predict},
- {"model", params.model_alias},
- {"seed", slot.params.seed},
- {"temperature", slot.sparams.temp},
- {"dynatemp_range", slot.sparams.dynatemp_range},
- {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
- {"top_k", slot.sparams.top_k},
- {"top_p", slot.sparams.top_p},
- {"min_p", slot.sparams.min_p},
- {"tfs_z", slot.sparams.tfs_z},
- {"typical_p", slot.sparams.typical_p},
- {"repeat_last_n", slot.sparams.penalty_last_n},
- {"repeat_penalty", slot.sparams.penalty_repeat},
- {"presence_penalty", slot.sparams.penalty_present},
- {"frequency_penalty", slot.sparams.penalty_freq},
- {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
+ {"n_ctx", slot.n_ctx},
+ {"n_predict", slot.n_predict},
+ {"model", params.model_alias},
+ {"seed", slot.params.seed},
+ {"temperature", slot.sparams.temp},
+ {"dynatemp_range", slot.sparams.dynatemp_range},
+ {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
+ {"top_k", slot.sparams.top_k},
+ {"top_p", slot.sparams.top_p},
+ {"min_p", slot.sparams.min_p},
+ {"tfs_z", slot.sparams.tfs_z},
+ {"typical_p", slot.sparams.typical_p},
+ {"repeat_last_n", slot.sparams.penalty_last_n},
+ {"repeat_penalty", slot.sparams.penalty_repeat},
+ {"presence_penalty", slot.sparams.penalty_present},
+ {"frequency_penalty", slot.sparams.penalty_freq},
+ {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
{"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
- {"mirostat", slot.sparams.mirostat},
- {"mirostat_tau", slot.sparams.mirostat_tau},
- {"mirostat_eta", slot.sparams.mirostat_eta},
- {"penalize_nl", slot.sparams.penalize_nl},
- {"stop", slot.params.antiprompt},
- {"n_predict", slot.params.n_predict},
- {"n_keep", params.n_keep},
- {"ignore_eos", ignore_eos},
- {"stream", slot.params.stream},
- {"logit_bias", slot.sparams.logit_bias},
- {"n_probs", slot.sparams.n_probs},
- {"min_keep", slot.sparams.min_keep},
- {"grammar", slot.sparams.grammar},
- {"samplers", samplers_sequence}
+ {"mirostat", slot.sparams.mirostat},
+ {"mirostat_tau", slot.sparams.mirostat_tau},
+ {"mirostat_eta", slot.sparams.mirostat_eta},
+ {"penalize_nl", slot.sparams.penalize_nl},
+ {"stop", slot.params.antiprompt},
+ {"n_predict", slot.params.n_predict},
+ {"n_keep", params.n_keep},
+ {"ignore_eos", ignore_eos},
+ {"stream", slot.params.stream},
+ {"logit_bias", slot.sparams.logit_bias},
+ {"n_probs", slot.sparams.n_probs},
+ {"min_keep", slot.sparams.min_keep},
+ {"grammar", slot.sparams.grammar},
+ {"samplers", samplers_sequence}
};
}
- void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
- {
- task_result res;
- res.id = slot.task_id;
- res.multitask_id = slot.multitask_id;
- res.error = false;
- res.stop = false;
+ void send_error(const server_task & task, const std::string & error) {
+ LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
- res.result_json = json
- {
+ server_task_result res;
+ res.id = task.id;
+ res.id_multi = task.id_multi;
+ res.stop = false;
+ res.error = true;
+ res.data = { { "content", error } };
+
+ queue_results.send(res);
+ }
+
+ void send_partial_response(server_slot & slot, completion_token_output tkn) {
+ server_task_result res;
+ res.id = slot.id_task;
+ res.id_multi = slot.id_multi;
+ res.error = false;
+ res.stop = false;
+ res.data = json {
{"content", tkn.text_to_send},
{"stop", false},
- {"slot_id", slot.id},
- {"multimodal", multimodal}
+ {"id_slot", slot.id},
+ {"multimodal", false}
};
- if (slot.sparams.n_probs > 0)
- {
- std::vector<completion_token_output> probs_output = {};
+ if (slot.sparams.n_probs > 0) {
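+ // when requested, attach the token probabilities for the text sent in this chunk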
const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
- size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
- size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
- if (probs_pos < probs_stop_pos)
- {
- probs_output = std::vector<completion_token_output>(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos);
+ const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
+ const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
+
+ std::vector<completion_token_output> probs_output;
+ if (probs_pos < probs_stop_pos) {
+ probs_output = std::vector<completion_token_output>(
+ slot.generated_token_probs.begin() + probs_pos,
+ slot.generated_token_probs.begin() + probs_stop_pos);
}
- slot.sent_token_probs_index = probs_stop_pos;
- res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
+ slot.n_sent_token_probs = probs_stop_pos;
+
+ res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
}
- if (slot.oaicompat)
- {
- res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
- res.result_json["model"] = slot.oaicompat_model;
+ if (slot.oaicompat) {
+ res.data["oaicompat_token_ctr"] = slot.n_decoded;
+ res.data["model"] = slot.oaicompat_model;
}
queue_results.send(res);
}
- void send_final_response(llama_client_slot &slot)
- {
- task_result res;
- res.id = slot.task_id;
- res.multitask_id = slot.multitask_id;
- res.error = false;
- res.stop = true;
-
- res.result_json = json
- {
+ void send_final_response(const server_slot & slot) {
+ server_task_result res;
+ res.id = slot.id_task;
+ res.id_multi = slot.id_multi;
+ res.error = false;
+ res.stop = true;
+ res.data = json {
{"content", !slot.params.stream ? slot.generated_text : ""},
- {"slot_id", slot.id},
+ {"id_slot", slot.id},
{"stop", true},
{"model", params.model_alias},
{"tokens_predicted", slot.n_decoded},
- {"tokens_evaluated", slot.num_prompt_tokens},
+ {"tokens_evaluated", slot.n_prompt_tokens},
{"generation_settings", get_formated_generation(slot)},
{"prompt", slot.prompt},
{"truncated", slot.truncated},
@@ -1277,80 +1272,87 @@ struct llama_server_context
{"timings", slot.get_formated_timings()}
};
- if (slot.sparams.n_probs > 0)
- {
- std::vector<completion_token_output> probs = {};
- if (!slot.params.stream && slot.stopped_word)
- {
+ if (slot.sparams.n_probs > 0) {
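+ // attach probabilities for the generated tokens, trimming any trailing stop-word tokens in non-streaming mode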
+ std::vector<completion_token_output> probs;
+ if (!slot.params.stream && slot.stopped_word) {
const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
- probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
- }
- else
- {
+
probs = std::vector<completion_token_output>(
- slot.generated_token_probs.begin(),
- slot.generated_token_probs.end());
+ slot.generated_token_probs.begin(),
+ slot.generated_token_probs.end() - stop_word_toks.size());
+ } else {
+ probs = std::vector<completion_token_output>(
+ slot.generated_token_probs.begin(),
+ slot.generated_token_probs.end());
}
- res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
+
+ res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs);
}
- if (slot.oaicompat)
- {
- res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
- res.result_json["model"] = slot.oaicompat_model;
+ if (slot.oaicompat) {
+ res.data["oaicompat_token_ctr"] = slot.n_decoded;
+ res.data["model"] = slot.oaicompat_model;
}
queue_results.send(res);
}
- void send_embedding(llama_client_slot &slot)
- {
- task_result res;
- res.id = slot.task_id;
- res.multitask_id = slot.multitask_id;
- res.error = false;
- res.stop = true;
+ void send_embedding(const server_slot & slot, const llama_batch & batch) {
+ server_task_result res;
+ res.id = slot.id_task;
+ res.id_multi = slot.id_multi;
+ res.error = false;
+ res.stop = true;
const int n_embd = llama_n_embd(model);
- if (!params.embedding)
- {
- LOG_WARNING("embedding disabled", {
- {"params.embedding", params.embedding},
- });
- res.result_json = json
- {
- {"embedding", std::vector(n_embd, 0.0f)},
- };
- }
- else
- {
- const float *data = llama_get_embeddings(ctx);
- std::vector<float> embedding(data, data + n_embd);
- res.result_json = json
- {
- {"embedding", embedding },
+
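+ // locate the batch outputs that belong to this slot and extract their embeddings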
+ for (int i = 0; i < batch.n_tokens; ++i) {
+ if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
+ continue;
+ }
+
+ const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+ if (embd == NULL) {
+ embd = llama_get_embeddings_ith(ctx, i);
+ }
+
+ if (embd == NULL) {
+ LOG_ERROR("failed to get embeddings", {
+ {"token", batch.token [i]},
+ {"seq_id", batch.seq_id[i][0]}
+ });
+
+ res.data = json {
+ {"embedding", std::vector(n_embd, 0.0f)},
+ };
+
+ continue;
+ }
+
+ res.data = json {
+ {"embedding", std::vector(embd, embd + n_embd)},
};
}
+
queue_results.send(res);
}
- void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id)
- {
- task_server task;
- task.id = task_id;
- task.target_id = 0;
- task.data = std::move(data);
- task.infill_mode = infill;
- task.embedding_mode = embedding;
- task.type = TASK_TYPE_COMPLETION;
- task.multitask_id = multitask_id;
+ void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding) {
+ server_task task;
+ task.id = id_task;
+ task.id_multi = id_multi;
+ task.id_target = 0;
+ task.data = std::move(data);
+ task.infill = infill;
+ task.embedding = embedding;
+ task.type = SERVER_TASK_TYPE_COMPLETION;
// when a completion task's prompt array is not a singleton, we split it into multiple requests
// otherwise, it's a single-prompt task, we actually queue it
// if there are numbers in the prompt array it will be treated as an array of tokens
if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
bool numbers = false;
- for (const auto& e : task.data.at("prompt")) {
+ for (const auto & e : task.data.at("prompt")) {
if (e.is_number()) {
numbers = true;
break;
@@ -1365,92 +1367,23 @@ struct llama_server_context
if (numbers) {
queue_tasks.post(task);
} else {
- split_multiprompt_task(task_id, task);
+ split_multiprompt_task(id_task, task);
}
} else {
queue_tasks.post(task);
}
}
- // for multiple images processing
- bool ingest_images(llama_client_slot &slot, int n_batch)
- {
- int image_idx = 0;
+ void request_cancel(int id_task) {
+ server_task task;
+ task.type = SERVER_TASK_TYPE_CANCEL;
+ task.id_target = id_task;
- while (image_idx < (int) slot.images.size())
- {
- slot_image &img = slot.images[image_idx];
-
- // process prefix prompt
- for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
- {
- const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
- llama_batch batch_view = {
- n_tokens,
- batch.token + i,
- nullptr,
- batch.pos + i,
- batch.n_seq_id + i,
- batch.seq_id + i,
- batch.logits + i,
- 0, 0, 0, // unused
- };
- if (llama_decode(ctx, batch_view))
- {
- LOG_TEE("%s : failed to eval\n", __func__);
- return false;
- }
- }
-
- // process image with llm
- for (int i = 0; i < img.image_tokens; i += n_batch)
- {
- int n_eval = img.image_tokens - i;
- if (n_eval > n_batch)
- {
- n_eval = n_batch;
- }
-
- const int n_embd = llama_n_embd(model);
- llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
- if (llama_decode(ctx, batch_img))
- {
- LOG_TEE("%s : failed to eval image\n", __func__);
- return false;
- }
- slot.n_past += n_eval;
- }
- image_idx++;
-
- llama_batch_clear(batch);
-
- // append prefix of next image
- const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
- slot.params.input_suffix : // no more images, then process suffix prompt
- (json)(slot.images[image_idx].prefix_prompt);
-
- std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
- for (int i = 0; i < (int) append_tokens.size(); ++i)
- {
- llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
- slot.n_past += 1;
- }
- }
-
- return true;
- }
-
- void request_cancel(int task_id)
- {
- task_server task;
- task.type = TASK_TYPE_CANCEL;
- task.target_id = task_id;
queue_tasks.post(task);
}
- void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
- {
- int prompt_count = multiprompt_task.data.at("prompt").size();
+ void split_multiprompt_task(int id_multi, const server_task & multiprompt_task) {
+ const int prompt_count = multiprompt_task.data.at("prompt").size();
if (prompt_count <= 1) {
send_error(multiprompt_task, "error while handling multiple prompts");
return;
@@ -1458,212 +1391,240 @@ struct llama_server_context
// generate all the ID for subtask
std::vector<int> subtask_ids(prompt_count);
- for (int i = 0; i < prompt_count; i++)
- {
+ for (int i = 0; i < prompt_count; i++) {
subtask_ids[i] = queue_tasks.get_new_id();
}
// queue up the multitask so we can track its subtask progression
- queue_tasks.add_multitask(multitask_id, subtask_ids);
+ queue_tasks.add_multitask(id_multi, subtask_ids);
// add subtasks
- for (int i = 0; i < prompt_count; i++)
- {
+ for (int i = 0; i < prompt_count; i++) {
json subtask_data = multiprompt_task.data;
subtask_data["prompt"] = subtask_data["prompt"][i];
// subtasks inherit everything else (infill mode, embedding mode, etc.)
- request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
+ request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
}
}
- void process_single_task(task_server& task)
- {
- switch (task.type)
- {
- case TASK_TYPE_COMPLETION: {
- llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
- if (slot == nullptr)
+ void process_single_task(const server_task & task) {
+ switch (task.type) {
+ case SERVER_TASK_TYPE_COMPLETION:
{
- // if no slot is available, we defer this task for processing later
- LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
- queue_tasks.defer(task);
- break;
- }
-
- if (task.data.contains("system_prompt"))
- {
- if (!all_slots_are_idle) {
- send_error(task, "system prompt can only be updated when all slots are idle");
+ server_slot * slot = get_slot(json_value(task.data, "id_slot", -1));
+ if (slot == nullptr) {
+ // if no slot is available, we defer this task for processing later
+ LOG_VERBOSE("no slot is available", {{"id_task", task.id}});
+ queue_tasks.defer(task);
break;
}
- process_system_prompt_data(task.data["system_prompt"]);
- // reset cache_tokens for all slots
- for (llama_client_slot &slot : slots)
- {
- slot.cache_tokens.clear();
- slot.n_past = 0;
- slot.n_past_se = 0;
+ if (task.data.contains("system_prompt")) {
+ system_prompt_set(task.data["system_prompt"]);
+
+ for (server_slot & slot : slots) {
+ slot.n_past = 0;
+ slot.n_past_se = 0;
+ }
}
- }
- slot->reset();
+ slot->reset();
- slot->infill = task.infill_mode;
- slot->embedding = task.embedding_mode;
- slot->task_id = task.id;
- slot->multitask_id = task.multitask_id;
+ slot->id_task = task.id;
+ slot->id_multi = task.id_multi;
+ slot->infill = task.infill;
+ slot->embedding = task.embedding;
- try {
- if (!launch_slot_with_data(slot, task.data))
- {
+ if (!launch_slot_with_data(*slot, task.data)) {
// send error result
send_error(task, "internal_error");
break;
}
- } catch (const llama_error & err) {
- send_error(task, err);
- }
- } break;
- case TASK_TYPE_CANCEL: { // release slot linked with the task id
- for (auto & slot : slots)
+ } break;
+ case SERVER_TASK_TYPE_CANCEL:
{
- if (slot.task_id == task.target_id)
- {
- slot.release();
- break;
+ // release slot linked with the task id
+ for (auto & slot : slots) {
+ if (slot.id_task == task.id_target) {
+ slot.release();
+ break;
+ }
}
- }
- } break;
- case TASK_TYPE_NEXT_RESPONSE: {
- // do nothing
- } break;
- case TASK_TYPE_METRICS: {
- json slots_data = json::array();
- int n_idle_slots = 0;
- int n_processing_slots = 0;
+ } break;
+ case SERVER_TASK_TYPE_NEXT_RESPONSE:
+ {
+ // do nothing
+ } break;
+ case SERVER_TASK_TYPE_METRICS:
+ {
+ json slots_data = json::array();
- for (llama_client_slot &slot: slots) {
- json slot_data = get_formated_generation(slot);
- slot_data["id"] = slot.id;
- slot_data["task_id"] = slot.task_id;
- slot_data["state"] = slot.state;
- slot_data["prompt"] = slot.prompt;
- slot_data["next_token"] = {
+ int n_idle_slots = 0;
+ int n_processing_slots = 0;
+
+ for (server_slot & slot : slots) {
+ json slot_data = get_formated_generation(slot);
+ slot_data["id"] = slot.id;
+ slot_data["id_task"] = slot.id_task;
+ slot_data["state"] = slot.state;
+ slot_data["prompt"] = slot.prompt;
+ slot_data["next_token"] = {
{"has_next_token", slot.has_next_token},
- {"n_remain", slot.n_remaining},
- {"num_tokens_predicted", slot.n_decoded},
- {"stopped_eos", slot.stopped_eos},
- {"stopped_word", slot.stopped_word},
- {"stopped_limit", slot.stopped_limit},
- {"stopping_word", slot.stopping_word},
- };
- if (slot_data["state"] == IDLE) {
- n_idle_slots++;
- } else {
- n_processing_slots++;
+ {"n_remain", slot.n_remaining},
+ {"n_decoded", slot.n_decoded},
+ {"stopped_eos", slot.stopped_eos},
+ {"stopped_word", slot.stopped_word},
+ {"stopped_limit", slot.stopped_limit},
+ {"stopping_word", slot.stopping_word},
+ };
+
+ if (slot_data["state"] == SLOT_STATE_IDLE) {
+ n_idle_slots++;
+ } else {
+ n_processing_slots++;
+ }
+
+ slots_data.push_back(slot_data);
}
- slots_data.push_back(slot_data);
- }
- LOG_INFO("slot data", {
- {"task_id", task.id},
- {"n_idle_slots", n_idle_slots},
- {"n_processing_slots", n_processing_slots}
- });
- LOG_VERBOSE("slot data", {
- {"task_id", task.id},
- {"n_idle_slots", n_idle_slots},
- {"n_processing_slots", n_processing_slots},
- {"slots", slots_data}
- });
- task_result res;
- res.id = task.id;
- res.multitask_id = task.multitask_id;
- res.stop = true;
- res.error = false;
- res.result_json = {
+ LOG_INFO("slot data", {
+ {"id_task", task.id},
+ {"n_idle_slots", n_idle_slots},
+ {"n_processing_slots", n_processing_slots}
+ });
+
+ LOG_VERBOSE("slot data", {
+ {"id_task", task.id},
+ {"n_idle_slots", n_idle_slots},
+ {"n_processing_slots", n_processing_slots},
+ {"slots", slots_data}
+ });
+
+ server_task_result res;
+ res.id = task.id;
+ res.id_multi = task.id_multi;
+ res.stop = true;
+ res.error = false;
+ res.data = {
{ "idle", n_idle_slots },
{ "processing", n_processing_slots },
{ "deferred", queue_tasks.queue_tasks_deferred.size() },
+ { "t_start", metrics.t_start},
{ "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
+ { "t_tokens_generation_total", metrics.t_tokens_generation_total},
{ "n_tokens_predicted_total", metrics.n_tokens_predicted_total},
+ { "t_prompt_processing_total", metrics.t_prompt_processing_total},
{ "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed},
{ "t_prompt_processing", metrics.t_prompt_processing},
{ "n_tokens_predicted", metrics.n_tokens_predicted},
{ "t_tokens_generation", metrics.t_tokens_generation},
- { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
- { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
+ { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
+ { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
- { "slots", slots_data },
- };
- metrics.reset_bucket();
- queue_results.send(res);
- } break;
+ { "slots", slots_data },
+ };
+
+ if (json_value(task.data, "reset_bucket", false)) {
+ metrics.reset_bucket();
+ }
+ queue_results.send(res);
+ } break;
}
}
- void on_finish_multitask(task_multi& multitask)
- {
+ void on_finish_multitask(const server_task_multi & multitask) {
// all subtasks done == multitask is done
- task_result result;
- result.id = multitask.id;
- result.stop = true;
+ server_task_result result;
+ result.id = multitask.id;
+ result.stop = true;
result.error = false;
// collect json results into one json result
std::vector<json> result_jsons;
- for (auto& subres : multitask.results)
- {
- result_jsons.push_back(subres.result_json);
+ for (const auto & subres : multitask.results) {
+ result_jsons.push_back(subres.data);
result.error = result.error && subres.error;
}
- result.result_json = json{ { "results", result_jsons } };
+ result.data = json {
+ { "results", result_jsons }
+ };
+
queue_results.send(result);
}
bool update_slots() {
- if (system_need_update)
- {
- LOG_INFO("updating system prompt", {});
- update_system_prompt();
+ if (system_need_update) {
+ system_prompt_update();
}
- llama_batch_clear(batch);
+ // release slots
+ for (auto & slot : slots) {
+ if (slot.command == SLOT_COMMAND_RELEASE) {
+ slot.state = SLOT_STATE_IDLE;
+ slot.command = SLOT_COMMAND_NONE;
+ slot.t_last_used = ggml_time_us();
- if (all_slots_are_idle)
- {
- if (system_prompt.empty() && clean_kv_cache)
- {
- LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
- kv_cache_clear();
+ LOG_INFO("slot released", {
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
+ {"n_ctx", n_ctx},
+ {"n_past", slot.n_past},
+ {"n_system_tokens", system_tokens.size()},
+ {"n_cache_tokens", slot.cache_tokens.size()},
+ {"truncated", slot.truncated}
+ });
+
+ queue_tasks.notify_slot_changed();
}
- return true;
}
- LOG_VERBOSE("posting NEXT_RESPONSE", {});
- task_server task;
- task.type = TASK_TYPE_NEXT_RESPONSE;
- task.target_id = -1;
- queue_tasks.post(task);
-
- for (llama_client_slot &slot : slots)
+ // check if all slots are idle
{
- if (slot.ga_n == 1)
- {
- if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
- {
+ bool all_idle = true;
+
+ for (auto & slot : slots) {
+ if (slot.state != SLOT_STATE_IDLE || slot.command != SLOT_COMMAND_NONE) {
+ all_idle = false;
+ break;
+ }
+ }
+
+ if (all_idle) {
+ LOG_INFO("all slots are idle", {});
+ if (system_prompt.empty() && clean_kv_cache) {
+ kv_cache_clear();
+ }
+
+ return true;
+ }
+ }
+
+ {
+ LOG_VERBOSE("posting NEXT_RESPONSE", {});
+
+ server_task task;
+ task.type = SERVER_TASK_TYPE_NEXT_RESPONSE;
+ task.id_target = -1;
+
+ queue_tasks.post(task);
+ }
+
+ // apply context-shift if needed
+ // TODO: simplify and improve
+ for (server_slot & slot : slots) {
+ if (slot.ga_n == 1) {
+ if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
// Shift context
const int n_keep = slot.params.n_keep + add_bos_token;
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
const int n_discard = n_left / 2;
LOG_INFO("slot context shift", {
- {"slot_id", slot.id},
- {"task_id", slot.task_id},
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
{"n_keep", n_keep},
{"n_left", n_left},
{"n_discard", n_discard},
@@ -1672,15 +1633,17 @@ struct llama_server_context
{"n_system_tokens", system_tokens.size()},
{"n_cache_tokens", slot.cache_tokens.size()}
});
+
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
- for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
- {
- slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
- }
+ if (slot.params.cache_prompt) {
+ for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
+ slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+ }
- slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+ slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+ }
slot.n_past -= n_discard;
@@ -1689,33 +1652,12 @@ struct llama_server_context
}
}
- // decode any currently ongoing sequences
- LOG_VERBOSE("decoding ongoing sequences", {});
- for (auto & slot : slots)
- {
- // release the slot
- if (slot.command == RELEASE)
- {
- slot.state = IDLE;
- slot.command = NONE;
- slot.t_last_used = ggml_time_us();
+ // start populating the batch for this iteration
+ llama_batch_clear(batch);
- LOG_INFO("slot released", {
- {"slot_id", slot.id},
- {"task_id", slot.task_id},
- {"n_ctx", n_ctx},
- {"n_past", slot.n_past},
- {"n_system_tokens", system_tokens.size()},
- {"n_cache_tokens", slot.cache_tokens.size()},
- {"truncated", slot.truncated}
- });
- queue_tasks.notify_slot_changed();
-
- continue;
- }
-
- if (slot.state == IDLE)
- {
+ // first, add sampled tokens from any ongoing sequences
+ for (auto & slot : slots) {
+ if (slot.state == SLOT_STATE_IDLE) {
continue;
}
@@ -1726,186 +1668,184 @@ struct llama_server_context
// TODO: we always have to take into account the "system_tokens"
// this is not great and needs to be improved somehow
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+
slot.n_past += 1;
+
+ if (slot.params.cache_prompt) {
+ slot.cache_tokens.push_back(slot.sampled);
+ }
+
+ LOG_VERBOSE("slot decode token", {
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
+ {"n_ctx", n_ctx},
+ {"n_past", slot.n_past},
+ {"n_system_tokens", system_tokens.size()},
+ {"n_cache_tokens", slot.cache_tokens.size()},
+ {"truncated", slot.truncated}
+ });
}
// process in chunks of params.n_batch
int32_t n_batch = params.n_batch;
- // assign workload to the slots
- if (params.cont_batching || batch.n_tokens == 0)
- {
- for (auto & slot : slots)
- {
- const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
+ // next, batch any pending prompts without exceeding n_batch
+ if (params.cont_batching || batch.n_tokens == 0) {
+ for (auto & slot : slots) {
+ const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
// empty prompt passed -> release the slot and send empty response
// note: infill mode allows empty prompt
- if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill)
- {
+ if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT && !has_prompt && !slot.infill) {
+ slot.state = SLOT_STATE_PROCESSING;
+ slot.command = SLOT_COMMAND_NONE;
slot.release();
slot.print_timings();
send_final_response(slot);
continue;
}
- // need process the prompt
- if (slot.state == IDLE && slot.command == LOAD_PROMPT)
- {
- slot.state = PROCESSING;
- slot.command = NONE;
- std::vector<llama_token> prompt_tokens;
- slot.t_start_process_prompt = ggml_time_us();
- slot.t_start_genereration = 0;
+ // this slot still has a prompt to be processed
+ if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) {
+ auto & prompt_tokens = slot.prompt_tokens;
- if (slot.infill)
- {
- bool suff_rm_leading_spc = true;
- if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1)
- {
- params.input_suffix.erase(0, 1);
- suff_rm_leading_spc = false;
- }
- auto prefix_tokens = tokenize(slot.params.input_prefix, false);
- auto suffix_tokens = tokenize(slot.params.input_suffix, false);
-
- const int space_token = 29871; // TODO: this should not be hardcoded
- if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
- suffix_tokens.erase(suffix_tokens.begin());
- }
-
- prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
- prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
- prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
- prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
- prefix_tokens.push_back(llama_token_middle(model));
- prompt_tokens = prefix_tokens;
- }
- else
- {
- prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
- }
-
- slot.num_prompt_tokens = prompt_tokens.size();
-
- if (slot.params.n_keep < 0)
- {
- slot.params.n_keep = slot.num_prompt_tokens;
- }
- slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
-
- // if input prompt is too big, truncate it
- if (slot.num_prompt_tokens >= slot.n_ctx)
- {
- const int n_left = slot.n_ctx - slot.params.n_keep;
- const int n_block_size = n_left / 2;
- const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
-
- std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
- new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
-
- LOG_VERBOSE("input truncated", {
- {"n_ctx", slot.n_ctx},
- {"n_keep", slot.params.n_keep},
- {"n_left", n_left},
- {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
+ // we haven't tokenized the prompt yet - do it now:
+ if (prompt_tokens.empty()) {
+ LOG_VERBOSE("tokenizing prompt", {
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task}
});
- slot.truncated = true;
- prompt_tokens = new_tokens;
- slot.num_prompt_tokens = prompt_tokens.size();
- GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
- }
+ slot.t_start_process_prompt = ggml_time_us();
+ slot.t_start_generation = 0;
- if (!slot.params.cache_prompt)
- {
- llama_sampling_reset(slot.ctx_sampling);
+ if (slot.infill) {
+ bool suff_rm_leading_spc = true;
+ if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
+ params.input_suffix.erase(0, 1);
+ suff_rm_leading_spc = false;
+ }
+
+ auto prefix_tokens = tokenize(slot.params.input_prefix, false);
+ auto suffix_tokens = tokenize(slot.params.input_suffix, false);
+
+ const int space_token = 29871; // TODO: this should not be hardcoded
+ if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
+ suffix_tokens.erase(suffix_tokens.begin());
+ }
+
+ prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
+ prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
+ prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
+ prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
+ prefix_tokens.push_back(llama_token_middle(model));
+ prompt_tokens = prefix_tokens;
+ } else {
+ prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there is no system prompt
+ }
slot.n_past = 0;
- slot.n_past_se = 0;
- slot.ga_i = 0;
- slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
- }
- else
- {
- // push the prompt into the sampling context (do not apply grammar)
- for (auto &token : prompt_tokens)
- {
- llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
- }
+ slot.n_prompt_tokens = prompt_tokens.size();
- slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
-
- // the last token of the cache is not in the KV cache until the next call to llama_decode
- // (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
- if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
- {
- slot.n_past -= 1;
- }
-
- slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
-
- if (slot.ga_n != 1)
- {
- int ga_i = 0;
- int32_t ga_n = slot.ga_n;
- int32_t ga_w = slot.ga_w;
- int32_t slot_npast = 0;
- for (int k = 0; k < slot.n_past; ++k)
- {
- while (slot_npast >= ga_i + ga_w) {
- const int bd = (ga_w/ga_n)*(ga_n - 1);
- slot_npast -= bd;
- ga_i += ga_w/ga_n;
- }
- slot_npast++;
+ if (slot.embedding) {
+ // this prompt is too large to process - discard it
+ if (slot.n_prompt_tokens > n_batch) {
+ slot.state = SLOT_STATE_PROCESSING;
+ slot.command = SLOT_COMMAND_NONE;
+ slot.release();
+ slot.print_timings();
+ send_final_response(slot);
+ continue;
+ }
+ } else {
+ if (slot.params.n_keep < 0) {
+ slot.params.n_keep = slot.n_prompt_tokens;
+ }
+ slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
+
+ // if input prompt is too big, truncate it (if group attention self-extend is disabled)
+ if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) {
+ const int n_left = slot.n_ctx - slot.params.n_keep;
+
+ const int n_block_size = n_left / 2;
+ const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
+
+ std::vector<llama_token> new_tokens(
+ prompt_tokens.begin(),
+ prompt_tokens.begin() + slot.params.n_keep);
+
+ new_tokens.insert(
+ new_tokens.end(),
+ prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+ prompt_tokens.end());
+
+ prompt_tokens = std::move(new_tokens);
+
+ slot.truncated = true;
+ slot.n_prompt_tokens = prompt_tokens.size();
+
+ LOG_VERBOSE("input truncated", {
+ {"n_ctx", slot.n_ctx},
+ {"n_keep", slot.params.n_keep},
+ {"n_left", n_left},
+ {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
+ });
+
+ GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
+ }
+
+ llama_sampling_reset(slot.ctx_sampling);
+
+ if (!slot.params.cache_prompt) {
+ slot.n_past_se = 0;
+ slot.ga_i = 0;
+ } else {
+ GGML_ASSERT(slot.ga_n == 1);
+
+ // reuse any previously computed tokens that are common with the new prompt
+ slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
+
+ // remove the non-common part from the cache
+ slot.cache_tokens.resize(slot.n_past);
+
+ // push the prompt into the sampling context (do not apply grammar)
+ for (int i = 0; i < slot.n_past; ++i) {
+ llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
+ }
}
- slot.n_past_se = slot_npast;
- slot.ga_i = ga_i;
}
- LOG_INFO("slot progression", {
- { "slot_id", slot.id },
- { "task_id", slot.task_id },
- { "n_past", slot.n_past },
- { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
- });
+ if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
+ // we have to evaluate at least 1 token to generate logits.
+ LOG_INFO("we have to evaluate at least 1 token to generate logits", {
+ { "id_slot", slot.id },
+ { "id_task", slot.id_task }
+ });
+
+ slot.n_past--;
+ if (slot.ga_i > 0) {
+ slot.n_past_se--;
+ }
+ }
+
+ slot.n_prompt_tokens_processed = 0;
}
- slot.cache_tokens = prompt_tokens;
-
- if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
- {
- // we have to evaluate at least 1 token to generate logits.
- LOG_INFO("we have to evaluate at least 1 token to generate logits", {
- { "slot_id", slot.id },
- { "task_id", slot.task_id }
- });
- slot.n_past--;
- if (slot.ga_i > 0)
- {
- slot.n_past_se--;
+ if (slot.embedding) {
+ // cannot fit the prompt in the current batch - will try next iter
+ if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
+ continue;
}
}
- int p0 = (int) system_tokens.size() + slot.n_past;
- LOG_INFO("kv cache rm [p0, end)", {
- { "slot_id", slot.id },
- { "task_id", slot.task_id },
- { "p0", p0 }
- });
+ const int p0 = (int) system_tokens.size() + slot.n_past;
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
- LOG_VERBOSE("prompt ingested", {
- {"n_past", slot.n_past},
- {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
- {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
- });
-
- const bool has_images = process_images(slot);
-
- // process the prefix of first image
- std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
+ LOG_INFO("kv cache rm [p0, end)", {
+ { "id_slot", slot.id },
+ { "id_task", slot.id_task },
+ { "p0", p0 }
+ });
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
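The prompt-cache reuse above counts, via common_part(), how many leading tokens of the new prompt match the tokens already evaluated for this slot, so only the non-matching suffix has to be decoded again. A hedged, self-contained sketch of what such a common-prefix helper computes; the helper name and token values below are illustrative, not the server's actual implementation:

    // Illustrative common-prefix helper, similar in spirit to common_part() used above.
    // Token values are made up; real tokens come from the model's tokenizer.
    #include <cstdio>
    #include <vector>

    static size_t common_prefix_len(const std::vector<int> & a, const std::vector<int> & b) {
        size_t i = 0;
        while (i < a.size() && i < b.size() && a[i] == b[i]) {
            i++;
        }
        return i;
    }

    int main() {
        const std::vector<int> cache_tokens  = {11, 42, 7, 99, 3};     // previously evaluated
        const std::vector<int> prompt_tokens = {11, 42, 7, 15, 8, 22}; // new request

        const size_t n_past = common_prefix_len(cache_tokens, prompt_tokens);

        // only prompt_tokens[n_past..] needs to be evaluated again
        printf("n_past = %zu, tokens left to evaluate = %zu\n", n_past, prompt_tokens.size() - n_past);
        return 0;
    }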
@@ -1913,61 +1853,82 @@ struct llama_server_context
int32_t ga_n = slot.ga_n;
int32_t ga_w = slot.ga_w;
- for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
- {
- if (slot.ga_n != 1)
- {
+ // add prompt tokens for processing in the current batch
+ // TODO: the self-extend stuff here is a mess - simplify and/or abstract it somehow
+ for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) {
+ if (slot.ga_n != 1) {
while (slot_npast >= ga_i + ga_w) {
const int bd = (ga_w/ga_n)*(ga_n - 1);
slot_npast -= bd;
ga_i += ga_w/ga_n;
}
}
- llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
+
+ llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
+
+ if (slot.params.cache_prompt) {
+ slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
+ }
+
+ slot.n_prompt_tokens_processed++;
slot_npast++;
}
- if (has_images && !ingest_images(slot, n_batch))
- {
- LOG_ERROR("failed processing images", {
- "slot_id", slot.id,
- "task_id", slot.task_id,
- });
- // FIXME @phymbert: to be properly tested
- // early returning without changing the slot state will block the slot for ever
- // no one at the moment is checking the return value
- return false;
- }
+ LOG_VERBOSE("prompt processing progress", {
+ {"id_slot", slot.id},
+ {"n_past", slot.n_past},
+ {"n_ctx", n_ctx},
+ {"n_tokens", batch.n_tokens},
+ {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens},
+ });
- // extract the logits only for the last token
- if (batch.n_tokens > 0)
- {
+ // entire prompt has been processed - start decoding new tokens
+ if (slot.n_past == slot.n_prompt_tokens) {
+ slot.state = SLOT_STATE_PROCESSING;
+ slot.command = SLOT_COMMAND_NONE;
+
+ GGML_ASSERT(batch.n_tokens > 0);
+
+ // extract the logits only for the last token
batch.logits[batch.n_tokens - 1] = true;
- }
- slot.n_decoded = 0;
- slot.i_batch = batch.n_tokens - 1;
+ slot.n_decoded = 0;
+ slot.i_batch = batch.n_tokens - 1;
+
+ LOG_VERBOSE("prompt done", {
+ {"id_slot", slot.id},
+ {"n_past", slot.n_past},
+ {"n_ctx", n_ctx},
+ {"n_tokens", batch.n_tokens},
+ });
+ }
+ }
+
+ if (batch.n_tokens >= n_batch) {
+ break;
}
}
}
- if (batch.n_tokens == 0)
- {
- all_slots_are_idle = true;
+ if (batch.n_tokens == 0) {
+ LOG_VERBOSE("no tokens to decode", {});
+
return true;
}
- for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
- {
- const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+ LOG_VERBOSE("decoding batch", {
+ {"n_tokens", batch.n_tokens},
+ });
- for (auto & slot : slots)
- {
- if (slot.ga_n != 1)
- {
+ // process the created batch of tokens
+ for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+ const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
+
+ for (auto & slot : slots) {
+ if (slot.ga_n != 1) {
// context extension via Self-Extend
- while (slot.n_past_se >= slot.ga_i + slot.ga_w)
- {
+ // TODO: simplify and/or abstract this
+ while (slot.n_past_se >= slot.ga_i + slot.ga_w) {
const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
@@ -1978,8 +1939,8 @@ struct llama_server_context
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
- llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
- llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
+ llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
+ llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
slot.n_past_se -= bd;
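The ib/bd/dd index arithmetic used by the Self-Extend shift above is easier to follow with concrete numbers. The sketch below only evaluates those expressions for hypothetical ga_n/ga_w/ga_i/n_past_se values and prints the resulting shifts; it does not touch any KV cache:

    // Evaluates the Self-Extend shift quantities from the hunks above for sample values.
    // ga_n, ga_w, ga_i and n_past_se are hypothetical; the formulas mirror the code.
    #include <cstdio>

    int main() {
        const int ga_n      = 4;    // group attention factor
        const int ga_w      = 512;  // group attention width
        int       ga_i      = 0;    // current group attention index
        int       n_past_se = 600;  // self-extend position counter

        while (n_past_se >= ga_i + ga_w) {
            const int ib = (ga_n * ga_i) / ga_w;
            const int bd = (ga_w / ga_n) * (ga_n - 1);
            const int dd = (ga_w / ga_n) - ib * bd - ga_w;

            printf("ib = %d, bd = %d, dd = %d\n", ib, bd, dd);

            n_past_se -= bd;
            ga_i      += ga_w / ga_n;

            printf("n_past_se = %d, ga_i = %d\n", n_past_se, ga_i);
        }
        return 0;
    }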
@@ -1987,12 +1948,12 @@ struct llama_server_context
LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
}
+
slot.n_past_se += n_tokens;
}
}
- llama_batch batch_view =
- {
+ llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
@@ -2005,10 +1966,8 @@ struct llama_server_context
const int ret = llama_decode(ctx, batch_view);
- if (ret != 0)
- {
- if (n_batch == 1 || ret < 0)
- {
+ if (ret != 0) {
+ if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
return false;
@@ -2019,20 +1978,18 @@ struct llama_server_context
// retry with half the batch size to try to find a free slot in the KV cache
n_batch /= 2;
i -= n_batch;
+
continue;
}
- for (auto & slot : slots)
- {
- if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
- {
+ for (auto & slot : slots) {
+ if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
continue;
}
// prompt evaluated for embedding
- if (slot.embedding)
- {
- send_embedding(slot);
+ if (slot.embedding) {
+ send_embedding(slot, batch_view);
slot.release();
slot.i_batch = -1;
continue;
@@ -2044,10 +2001,9 @@ struct llama_server_context
llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
slot.n_decoded += 1;
- if (slot.n_decoded == 1)
- {
- slot.t_start_genereration = ggml_time_us();
- slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
+ if (slot.n_decoded == 1) {
+ slot.t_start_generation = ggml_time_us();
+ slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
metrics.on_prompt_eval(slot);
}
@@ -2055,19 +2011,19 @@ struct llama_server_context
result.tok = id;
const int32_t n_probs = slot.sparams.n_probs;
- if (slot.sparams.temp <= 0 && n_probs > 0)
- {
+ if (slot.sparams.temp <= 0 && n_probs > 0) {
// for llama_sample_token_greedy we need to sort candidates
llama_sample_softmax(ctx, &cur_p);
}
- for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
- {
- result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
+ for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
+ result.probs.push_back({
+ cur_p.data[i].id,
+ cur_p.data[i].p
+ });
}
- if (!process_token(result, slot))
- {
+ if (!process_token(result, slot)) {
slot.release();
slot.print_timings();
send_final_response(slot);
@@ -2079,17 +2035,23 @@ struct llama_server_context
}
LOG_VERBOSE("slots updated", {});
+
return true;
}
- void run_on_all_tasks_finished() {
- update_slots();
+ json model_meta() const {
+ return json {
+ {"vocab_type", llama_vocab_type (model)},
+ {"n_vocab", llama_n_vocab (model)},
+ {"n_ctx_train", llama_n_ctx_train (model)},
+ {"n_embd", llama_n_embd (model)},
+ {"n_params", llama_model_n_params(model)},
+ {"size", llama_model_size (model)},
+ };
}
};
-static void server_print_usage(const char *argv0, const gpt_params ¶ms,
- const server_params &sparams)
-{
+static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
printf("usage: %s [options]\n", argv0);
printf("\n");
printf("options:\n");
@@ -2097,6 +2059,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
+ printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
printf(" --rope-scaling {none,linear,yarn}\n");
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
@@ -2106,15 +2069,14 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
+ printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
- if (llama_supports_mlock())
- {
+ if (llama_supports_mlock()) {
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
- if (llama_supports_mmap())
- {
+ if (llama_supports_mmap()) {
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
@@ -2146,7 +2108,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
- printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
+ printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" -spf FNAME, --system-prompt-file FNAME\n");
@@ -2155,7 +2117,6 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
printf(" KV cache data type for K (default: f16)\n");
printf(" -ctv TYPE, --cache-type-v TYPE\n");
printf(" KV cache data type for V (default: f16)\n");
- printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-format log output format: json or text (default: json)\n");
printf(" --log-disable disables logging to a file.\n");
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
@@ -2165,65 +2126,49 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
- printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
- printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
+ printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend (default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
+ printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend (default: 512), used together with group attention factor `--grp-attn-n`\n");
printf(" --chat-template JINJA_TEMPLATE\n");
printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n");
printf("\n");
}
-static void server_params_parse(int argc, char **argv, server_params &sparams,
- gpt_params ¶ms, llama_server_context& llama)
-{
- gpt_params default_params;
+static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) {
+ gpt_params default_params;
server_params default_sparams;
+
std::string arg;
bool invalid_param = false;
- for (int i = 1; i < argc; i++)
- {
+ for (int i = 1; i < argc; i++) {
arg = argv[i];
- if (arg == "--port")
- {
- if (++i >= argc)
- {
+ if (arg == "--port") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
sparams.port = std::stoi(argv[i]);
- }
- else if (arg == "--host")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--host") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
sparams.hostname = argv[i];
- }
- else if (arg == "--path")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--path") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
sparams.public_path = argv[i];
- }
- else if (arg == "--api-key")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--api-key") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
sparams.api_keys.emplace_back(argv[i]);
- }
- else if (arg == "--api-key-file")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--api-key-file") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
@@ -2240,53 +2185,36 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
}
key_file.close();
- }
- else if (arg == "--timeout" || arg == "-to")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--timeout" || arg == "-to") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
sparams.read_timeout = std::stoi(argv[i]);
sparams.write_timeout = std::stoi(argv[i]);
- }
- else if (arg == "-m" || arg == "--model")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-m" || arg == "--model") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.model = argv[i];
- }
- else if (arg == "-a" || arg == "--alias")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-a" || arg == "--alias") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.model_alias = argv[i];
- }
- else if (arg == "-h" || arg == "--help")
- {
+ } else if (arg == "-h" || arg == "--help") {
server_print_usage(argv[0], default_params, default_sparams);
exit(0);
- }
- else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
- }
- else if (arg == "--rope-scaling")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--rope-scaling") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
@@ -2295,150 +2223,126 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
else { invalid_param = true; break; }
- }
- else if (arg == "--rope-freq-base")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--rope-freq-base") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_base = std::stof(argv[i]);
- }
- else if (arg == "--rope-freq-scale")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--rope-freq-scale") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_scale = std::stof(argv[i]);
- }
- else if (arg == "--yarn-ext-factor")
- {
+ } else if (arg == "--yarn-ext-factor") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_ext_factor = std::stof(argv[i]);
}
- else if (arg == "--yarn-attn-factor")
- {
+ else if (arg == "--yarn-attn-factor") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_attn_factor = std::stof(argv[i]);
- }
- else if (arg == "--yarn-beta-fast")
- {
+ } else if (arg == "--yarn-beta-fast") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_fast = std::stof(argv[i]);
- }
- else if (arg == "--yarn-beta-slow")
- {
+ } else if (arg == "--yarn-beta-slow") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_slow = std::stof(argv[i]);
- }
- else if (arg == "--threads" || arg == "-t")
- {
+ } else if (arg == "--pooling") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ std::string value(argv[i]);
+ /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+ else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+ else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+ else { invalid_param = true; break; }
+ } else if (arg == "--threads" || arg == "-t") {
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_threads = std::stoi(argv[i]);
- }
- else if (arg == "--grp-attn-n" || arg == "-gan")
- {
+ } else if (arg == "--grp-attn-n" || arg == "-gan") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.grp_attn_n = std::stoi(argv[i]);
- }
- else if (arg == "--grp-attn-w" || arg == "-gaw")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--grp-attn-w" || arg == "-gaw") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.grp_attn_w = std::stoi(argv[i]);
- }
- else if (arg == "--threads-batch" || arg == "-tb")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--threads-batch" || arg == "-tb") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads_batch = std::stoi(argv[i]);
- }
- else if (arg == "-b" || arg == "--batch-size")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--threads-http") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ sparams.n_threads_http = std::stoi(argv[i]);
+ } else if (arg == "-b" || arg == "--batch-size") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.n_batch = std::stoi(argv[i]);
- params.n_batch = std::min(512, params.n_batch);
- }
- else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
if (llama_supports_gpu_offload()) {
params.n_gpu_layers = std::stoi(argv[i]);
} else {
- LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
- "See main README.md for information on enabling GPU BLAS support",
- {{"n_gpu_layers", params.n_gpu_layers}});
+ LOG_WARNING(
+ "Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
+ "See main README.md for information on enabling GPU BLAS support",
+ {{"n_gpu_layers", params.n_gpu_layers}});
}
- }
- else if (arg == "--split-mode" || arg == "-sm")
- {
+ } else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::string arg_next = argv[i];
- if (arg_next == "none")
- {
+ if (arg_next == "none") {
params.split_mode = LLAMA_SPLIT_MODE_NONE;
- }
- else if (arg_next == "layer")
- {
+ } else if (arg_next == "layer") {
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
- }
- else if (arg_next == "row")
- {
+ } else if (arg_next == "row") {
params.split_mode = LLAMA_SPLIT_MODE_ROW;
- }
- else {
+ } else {
invalid_param = true;
break;
}
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS
- }
- else if (arg == "--tensor-split" || arg == "-ts")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--tensor-split" || arg == "-ts") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
@@ -2451,33 +2355,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= llama_max_devices());
- for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device)
- {
- if (i_device < split_arg.size())
- {
+ for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
+ if (i_device < split_arg.size()) {
params.tensor_split[i_device] = std::stof(split_arg[i_device]);
- }
- else
- {
+ } else {
params.tensor_split[i_device] = 0.0f;
}
}
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
#endif // GGML_USE_CUBLAS
- }
- else if (arg == "--no-mul-mat-q" || arg == "-nommq")
- {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
- params.mul_mat_q = false;
-#else
- LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
-#endif // GGML_USE_CUBLAS
- }
- else if (arg == "--main-gpu" || arg == "-mg")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--main-gpu" || arg == "-mg") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
@@ -2486,98 +2375,70 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
#endif
- }
- else if (arg == "--lora")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--lora") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter.emplace_back(argv[i], 1.0f);
params.use_mmap = false;
- }
- else if (arg == "--lora-scaled")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--lora-scaled") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
const char * lora_adapter = argv[i];
- if (++i >= argc)
- {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
params.use_mmap = false;
- }
- else if (arg == "--lora-base")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--lora-base") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_base = argv[i];
- }
- else if (arg == "-v" || arg == "--verbose")
- {
+ } else if (arg == "-v" || arg == "--verbose") {
#if SERVER_VERBOSE != 1
LOG_WARNING("server.cpp is not built with verbose logging.", {});
#else
server_verbose = true;
#endif
- }
- else if (arg == "--mlock")
- {
+ } else if (arg == "--mlock") {
params.use_mlock = true;
- }
- else if (arg == "--no-mmap")
- {
+ } else if (arg == "--no-mmap") {
params.use_mmap = false;
- }
- else if (arg == "--numa") {
+ } else if (arg == "--numa") {
if (++i >= argc) {
invalid_param = true;
break;
} else {
std::string value(argv[i]);
/**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
- else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
- else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+ else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+ else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
else { invalid_param = true; break; }
}
- }
- else if (arg == "--embedding")
- {
+ } else if (arg == "--embedding" || arg == "--embeddings") {
params.embedding = true;
- }
- else if (arg == "-cb" || arg == "--cont-batching")
- {
+ } else if (arg == "-cb" || arg == "--cont-batching") {
params.cont_batching = true;
- }
- else if (arg == "-np" || arg == "--parallel")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-np" || arg == "--parallel") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.n_parallel = std::stoi(argv[i]);
- } else if (arg == "-n" || arg == "--n-predict")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-n" || arg == "--n-predict") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.n_predict = std::stoi(argv[i]);
- } else if (arg == "-spf" || arg == "--system-prompt-file")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-spf" || arg == "--system-prompt-file") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
@@ -2587,67 +2448,39 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
- std::string systm_content;
+ std::string system_prompt;
std::copy(
                std::istreambuf_iterator<char>(file),
                std::istreambuf_iterator<char>(),
- std::back_inserter(systm_content)
+ std::back_inserter(system_prompt)
);
- llama.process_system_prompt_data(json::parse(systm_content));
- }
- else if (arg == "-ctk" || arg == "--cache-type-k") {
+ sparams.system_prompt = system_prompt;
+ } else if (arg == "-ctk" || arg == "--cache-type-k") {
params.cache_type_k = argv[++i];
- }
- else if (arg == "-ctv" || arg == "--cache-type-v") {
+ } else if (arg == "-ctv" || arg == "--cache-type-v") {
params.cache_type_v = argv[++i];
- }
- else if(arg == "--mmproj")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--log-format") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
- params.mmproj = argv[i];
- }
- else if (arg == "--log-format")
- {
- if (++i >= argc)
- {
- invalid_param = true;
- break;
- }
- if (std::strcmp(argv[i], "json") == 0)
- {
+ if (std::strcmp(argv[i], "json") == 0) {
server_log_json = true;
- }
- else if (std::strcmp(argv[i], "text") == 0)
- {
+ } else if (std::strcmp(argv[i], "text") == 0) {
server_log_json = false;
- }
- else
- {
+ } else {
invalid_param = true;
break;
}
- }
- else if (arg == "--log-disable")
- {
+ } else if (arg == "--log-disable") {
log_set_target(stdout);
LOG_INFO("logging to file is disabled.", {});
- }
- else if (arg == "--slots-endpoint-disable")
- {
+ } else if (arg == "--slots-endpoint-disable") {
sparams.slots_endpoint = false;
- }
- else if (arg == "--metrics")
- {
+ } else if (arg == "--metrics") {
sparams.metrics_endpoint = true;
- }
- else if (arg == "--chat-template")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--chat-template") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
@@ -2658,9 +2491,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break;
}
sparams.chat_template = argv[i];
- }
- else if (arg == "--override-kv")
- {
+ } else if (arg == "--override-kv") {
if (++i >= argc) {
invalid_param = true;
break;
@@ -2671,6 +2502,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
+
struct llama_model_kv_override kvo;
std::strncpy(kvo.key, argv[i], sep - argv[i]);
kvo.key[sep - argv[i]] = 0;
@@ -2701,67 +2533,28 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break;
}
params.kv_overrides.push_back(kvo);
- }
- else
- {
+ } else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
server_print_usage(argv[0], default_params, default_sparams);
exit(1);
}
}
+
if (!params.kv_overrides.empty()) {
params.kv_overrides.emplace_back();
params.kv_overrides.back().key[0] = 0;
}
- if (invalid_param)
- {
+ if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
server_print_usage(argv[0], default_params, default_sparams);
exit(1);
}
}
-/* llama.cpp completion api semantics */
-static json format_partial_response(
- llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
-) {
- json res = json
- {
- {"content", content },
- {"stop", false},
- {"slot_id", slot->id },
- {"multimodal", llama.multimodal }
- };
-
- if (slot->sparams.n_probs > 0)
- {
- res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
- }
-
- return res;
-}
-
-static json format_tokenizer_response(const std::vector<llama_token> &tokens)
-{
- return json {
- {"tokens", tokens}
- };
-}
-
-static json format_detokenized_response(std::string content)
-{
- return json {
- {"content", content}
- };
-}
-
-
-static void log_server_request(const httplib::Request &req, const httplib::Response &res)
-{
+static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
// skip GH copilot requests when using default port
- if (req.path == "/v1/health" || req.path == "/v1/completions")
- {
+ if (req.path == "/v1/health" || req.path == "/v1/completions") {
return;
}
@@ -2780,31 +2573,9 @@ static void log_server_request(const httplib::Request &req, const httplib::Respo
});
}
-struct token_translator
-{
- llama_context * ctx;
- std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
- std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
-};
-
-static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot)
-{
- auto & gtps = slot->generated_token_probs;
- auto translator = token_translator{llama.ctx};
- auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
- const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
- if (slot->generated_text.capacity() < slot->generated_text.size() + len)
- {
- slot->generated_text.reserve(slot->generated_text.size() + len);
- }
- for (const completion_token_output & cto : gtps)
- {
- slot->generated_text += translator(cto);
- }
-}
-
std::function<void(int)> shutdown_handler;
std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
+
inline void signal_handler(int signal) {
if (is_terminating.test_and_set()) {
// in case it hangs, we can force terminate the server by hitting Ctrl+C twice
@@ -2812,40 +2583,45 @@ inline void signal_handler(int signal) {
fprintf(stderr, "Received second interrupt, terminating immediately.\n");
exit(1);
}
+
shutdown_handler(signal);
}
-int main(int argc, char **argv)
-{
+int main(int argc, char ** argv) {
#if SERVER_VERBOSE != 1
log_disable();
#endif
// own arguments required by this example
- gpt_params params;
+ gpt_params params;
server_params sparams;
// struct that contains llama context and inference
- llama_server_context llama;
+ server_context ctx_server;
- server_params_parse(argc, argv, sparams, params, llama);
+ server_params_parse(argc, argv, sparams, params);
- if (params.model_alias == "unknown")
- {
+ if (!sparams.system_prompt.empty()) {
+ ctx_server.system_prompt_set(json::parse(sparams.system_prompt));
+ }
+
+ if (params.model_alias == "unknown") {
params.model_alias = params.model;
}
llama_backend_init();
llama_numa_init(params.numa);
- LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
- {"commit", LLAMA_COMMIT}});
+ LOG_INFO("build info", {
+ {"build", LLAMA_BUILD_NUMBER},
+ {"commit", LLAMA_COMMIT}
+ });
LOG_INFO("system info", {
- {"n_threads", params.n_threads},
- {"n_threads_batch", params.n_threads_batch},
- {"total_threads", std::thread::hardware_concurrency()},
- {"system_info", llama_print_system_info()},
- });
+ {"n_threads", params.n_threads},
+ {"n_threads_batch", params.n_threads_batch},
+ {"total_threads", std::thread::hardware_concurrency()},
+ {"system_info", llama_print_system_info()},
+ });
httplib::Server svr;
@@ -2854,158 +2630,182 @@ int main(int argc, char **argv)
svr.set_default_headers({{"Server", "llama.cpp"}});
// CORS preflight
- svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res) {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ svr.Options(R"(.*)", [](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
res.set_header("Access-Control-Allow-Credentials", "true");
- res.set_header("Access-Control-Allow-Methods", "POST");
- res.set_header("Access-Control-Allow-Headers", "*");
+ res.set_header("Access-Control-Allow-Methods", "POST");
+ res.set_header("Access-Control-Allow-Headers", "*");
});
- svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
+ svr.Get("/health", [&](const httplib::Request & req, httplib::Response & res) {
server_state current_state = state.load();
- switch(current_state) {
- case SERVER_STATE_READY: {
- // request slots data using task queue
- task_server task;
- task.id = llama.queue_tasks.get_new_id();
- task.type = TASK_TYPE_METRICS;
- task.target_id = -1;
+ switch (current_state) {
+ case SERVER_STATE_READY:
+ {
+ // request slots data using task queue
+ server_task task;
+ task.id = ctx_server.queue_tasks.get_new_id();
+ task.type = SERVER_TASK_TYPE_METRICS;
+ task.id_target = -1;
- llama.queue_results.add_waiting_task_id(task.id);
- llama.queue_tasks.post(task);
+ ctx_server.queue_results.add_waiting_task_id(task.id);
+ ctx_server.queue_tasks.post(task);
- // get the result
- task_result result = llama.queue_results.recv(task.id);
- llama.queue_results.remove_waiting_task_id(task.id);
+ // get the result
+ server_task_result result = ctx_server.queue_results.recv(task.id);
+ ctx_server.queue_results.remove_waiting_task_id(task.id);
- int n_idle_slots = result.result_json["idle"];
- int n_processing_slots = result.result_json["processing"];
+ const int n_idle_slots = result.data["idle"];
+ const int n_processing_slots = result.data["processing"];
- json health = {
+ json health = {
{"status", "ok"},
{"slots_idle", n_idle_slots},
- {"slots_processing", n_processing_slots}};
- res.status = 200; // HTTP OK
- if (sparams.slots_endpoint && req.has_param("include_slots")) {
- health["slots"] = result.result_json["slots"];
- }
+ {"slots_processing", n_processing_slots}
+ };
- if (n_idle_slots == 0) {
- health["status"] = "no slot available";
- if (req.has_param("fail_on_no_slot")) {
- res.status = 503; // HTTP Service Unavailable
+ res.status = 200; // HTTP OK
+ if (sparams.slots_endpoint && req.has_param("include_slots")) {
+ health["slots"] = result.data["slots"];
}
+
+ if (n_idle_slots == 0) {
+ health["status"] = "no slot available";
+ if (req.has_param("fail_on_no_slot")) {
+ res.status = 503; // HTTP Service Unavailable
+ }
+ }
+
+ res.set_content(health.dump(), "application/json");
+ break;
}
- res.set_content(health.dump(), "application/json");
- break;
- }
case SERVER_STATE_LOADING_MODEL:
- res.set_content(R"({"status": "loading model"})", "application/json");
- res.status = 503; // HTTP Service Unavailable
- break;
+ {
+ res.set_content(R"({"status": "loading model"})", "application/json");
+ res.status = 503; // HTTP Service Unavailable
+ } break;
case SERVER_STATE_ERROR:
- res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json");
- res.status = 500; // HTTP Internal Server Error
- break;
+ {
+ res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json");
+ res.status = 500; // HTTP Internal Server Error
+ } break;
}
});
if (sparams.slots_endpoint) {
- svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
+ svr.Get("/slots", [&](const httplib::Request &, httplib::Response & res) {
// request slots data using task queue
- task_server task;
- task.id = llama.queue_tasks.get_new_id();
- task.type = TASK_TYPE_METRICS;
- task.target_id = -1;
+ server_task task;
+ task.id = ctx_server.queue_tasks.get_new_id();
+ task.id_multi = -1;
+ task.id_target = -1;
+ task.type = SERVER_TASK_TYPE_METRICS;
- llama.queue_results.add_waiting_task_id(task.id);
- llama.queue_tasks.post(task);
+ ctx_server.queue_results.add_waiting_task_id(task.id);
+ ctx_server.queue_tasks.post(task);
// get the result
- task_result result = llama.queue_results.recv(task.id);
- llama.queue_results.remove_waiting_task_id(task.id);
+ server_task_result result = ctx_server.queue_results.recv(task.id);
+ ctx_server.queue_results.remove_waiting_task_id(task.id);
- res.set_content(result.result_json["slots"].dump(), "application/json");
+ res.set_content(result.data["slots"].dump(), "application/json");
res.status = 200; // HTTP OK
});
}
if (sparams.metrics_endpoint) {
- svr.Get("/metrics", [&](const httplib::Request&, httplib::Response& res) {
+ svr.Get("/metrics", [&](const httplib::Request &, httplib::Response & res) {
// request slots data using task queue
- task_server task;
- task.id = llama.queue_tasks.get_new_id();
- task.type = TASK_TYPE_METRICS;
- task.target_id = -1;
+ server_task task;
+ task.id = ctx_server.queue_tasks.get_new_id();
+ task.id_multi = -1;
+ task.id_target = -1;
+ task.type = SERVER_TASK_TYPE_METRICS;
+ task.data.push_back({{"reset_bucket", true}});
- llama.queue_results.add_waiting_task_id(task.id);
- llama.queue_tasks.post(task);
+ ctx_server.queue_results.add_waiting_task_id(task.id);
+ ctx_server.queue_tasks.post(task);
// get the result
- task_result result = llama.queue_results.recv(task.id);
- llama.queue_results.remove_waiting_task_id(task.id);
+ server_task_result result = ctx_server.queue_results.recv(task.id);
+ ctx_server.queue_results.remove_waiting_task_id(task.id);
- json data = result.result_json;
+ json data = result.data;
- uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
- uint64_t t_prompt_processing = data["t_prompt_processing"];
+ const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
+ const uint64_t t_prompt_processing = data["t_prompt_processing"];
- uint64_t n_tokens_predicted = data["n_tokens_predicted"];
- uint64_t t_tokens_generation = data["t_tokens_generation"];
+ const uint64_t n_tokens_predicted = data["n_tokens_predicted"];
+ const uint64_t t_tokens_generation = data["t_tokens_generation"];
- int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
+ const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
// metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
json all_metrics_def = json {
- {"counter", {{
- {"name", "prompt_tokens_total"},
- {"help", "Number of prompt tokens processed."},
- {"value", data["n_prompt_tokens_processed_total"]}
- }, {
- {"name", "tokens_predicted_total"},
- {"help", "Number of generation tokens processed."},
- {"value", data["n_tokens_predicted_total"]}
- }}},
- {"gauge", {{
- {"name", "prompt_tokens_seconds"},
- {"help", "Average prompt throughput in tokens/s."},
- {"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0}
- },{
- {"name", "predicted_tokens_seconds"},
- {"help", "Average generation throughput in tokens/s."},
- {"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0}
- },{
- {"name", "kv_cache_usage_ratio"},
- {"help", "KV-cache usage. 1 means 100 percent usage."},
- {"value", 1. * kv_cache_used_cells / params.n_ctx}
- },{
- {"name", "kv_cache_tokens"},
- {"help", "KV-cache tokens."},
- {"value", data["kv_cache_tokens_count"]}
- },{
- {"name", "requests_processing"},
- {"help", "Number of request processing."},
- {"value", data["processing"]}
- },{
- {"name", "requests_deferred"},
- {"help", "Number of request deferred."},
- {"value", data["deferred"]}
- }}}
+ {"counter", {{
+ {"name", "prompt_tokens_total"},
+ {"help", "Number of prompt tokens processed."},
+ {"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
+ }, {
+ {"name", "prompt_seconds_total"},
+ {"help", "Prompt process time"},
+ {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
+ }, {
+ {"name", "tokens_predicted_total"},
+ {"help", "Number of generation tokens processed."},
+ {"value", (uint64_t) data["n_tokens_predicted_total"]}
+ }, {
+ {"name", "tokens_predicted_seconds_total"},
+ {"help", "Predict process time"},
+ {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
+ }}},
+ {"gauge", {{
+ {"name", "prompt_tokens_seconds"},
+ {"help", "Average prompt throughput in tokens/s."},
+ {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.}
+ },{
+ {"name", "predicted_tokens_seconds"},
+ {"help", "Average generation throughput in tokens/s."},
+ {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.}
+ },{
+ {"name", "kv_cache_usage_ratio"},
+ {"help", "KV-cache usage. 1 means 100 percent usage."},
+ {"value", 1. * kv_cache_used_cells / params.n_ctx}
+ },{
+ {"name", "kv_cache_tokens"},
+ {"help", "KV-cache tokens."},
+ {"value", (uint64_t) data["kv_cache_tokens_count"]}
+ },{
+ {"name", "requests_processing"},
+ {"help", "Number of requests processing."},
+ {"value", (uint64_t) data["processing"]}
+ },{
+ {"name", "requests_deferred"},
+ {"help", "Number of requests deferred."},
+ {"value", (uint64_t) data["deferred"]}
+ }}}
};
std::stringstream prometheus;
- for (const auto& el : all_metrics_def.items()) {
- const auto& type = el.key();
- const auto& metrics_def = el.value();
- for (const auto& metric_def : metrics_def) {
- std::string name = metric_def["name"];
- std::string help = metric_def["help"];
- prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
- << "# TYPE llamacpp:" << name << " " << type << "\n"
- << "llamacpp:" << name << " " << metric_def["value"] << "\n";
+
+ for (const auto & el : all_metrics_def.items()) {
+ const auto & type = el.key();
+ const auto & metrics_def = el.value();
+
+ for (const auto & metric_def : metrics_def) {
+ const std::string name = metric_def["name"];
+ const std::string help = metric_def["help"];
+
+ auto value = json_value(metric_def, "value", 0.);
+ prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
+ << "# TYPE llamacpp:" << name << " " << type << "\n"
+ << "llamacpp:" << name << " " << value << "\n";
}
}
+ const int64_t t_start = data["t_start"];
+ res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
+
res.set_content(prometheus.str(), "text/plain; version=0.0.4");
res.status = 200; // HTTP OK
});
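For reference, the loop above serializes each metric into the Prometheus text exposition format under the llamacpp: prefix. A minimal standalone sketch that produces the same three lines for a single gauge; the metric value here is made up:

    // Builds one metric in the same text format as the /metrics handler above.
    // The value is hypothetical; the real handler derives it from slot metrics.
    #include <iostream>
    #include <sstream>
    #include <string>

    int main() {
        const std::string name  = "prompt_tokens_seconds";
        const std::string help  = "Average prompt throughput in tokens/s.";
        const std::string type  = "gauge";
        const double      value = 123.4;

        std::stringstream prometheus;
        prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
                   << "# TYPE llamacpp:" << name << " " << type << "\n"
                   << "llamacpp:" << name << " " << value << "\n";

        std::cout << prometheus.str();
        return 0;
    }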
@@ -3013,49 +2813,39 @@ int main(int argc, char **argv)
svr.set_logger(log_server_request);
- svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
- {
- const char fmt[] = "500 Internal Server Error\n%s";
- char buf[BUFSIZ];
- try
- {
- std::rethrow_exception(std::move(ep));
- }
- catch (std::exception &e)
- {
- snprintf(buf, sizeof(buf), fmt, e.what());
- }
- catch (...)
- {
- snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
- }
- res.set_content(buf, "text/plain; charset=utf-8");
- res.status = 500;
- });
+ svr.set_exception_handler([](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) {
+ const char fmt[] = "500 Internal Server Error\n%s";
- svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
- {
- if (res.status == 401)
- {
- res.set_content("Unauthorized", "text/plain; charset=utf-8");
- }
- if (res.status == 400)
- {
- res.set_content("Invalid request", "text/plain; charset=utf-8");
- }
- else if (res.status == 404)
- {
- res.set_content("File Not Found", "text/plain; charset=utf-8");
- res.status = 404;
- }
- });
+ char buf[BUFSIZ];
+ try {
+ std::rethrow_exception(std::move(ep));
+ } catch (std::exception &e) {
+ snprintf(buf, sizeof(buf), fmt, e.what());
+ } catch (...) {
+ snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
+ }
+
+ res.set_content(buf, "text/plain; charset=utf-8");
+ res.status = 500;
+ });
+
+ svr.set_error_handler([](const httplib::Request &, httplib::Response & res) {
+ if (res.status == 401) {
+ res.set_content("Unauthorized", "text/plain; charset=utf-8");
+ }
+ if (res.status == 400) {
+ res.set_content("Invalid request", "text/plain; charset=utf-8");
+ }
+ if (res.status == 404) {
+ res.set_content("File Not Found", "text/plain; charset=utf-8");
+ }
+ });
// set timeouts and change hostname and port
svr.set_read_timeout (sparams.read_timeout);
svr.set_write_timeout(sparams.write_timeout);
- if (!svr.bind_to_port(sparams.hostname, sparams.port))
- {
+ if (!svr.bind_to_port(sparams.hostname, sparams.port)) {
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
return 1;
}
@@ -3064,8 +2854,9 @@ int main(int argc, char **argv)
svr.set_base_dir(sparams.public_path);
    std::unordered_map<std::string, std::string> log_data;
+
log_data["hostname"] = sparams.hostname;
- log_data["port"] = std::to_string(sparams.port);
+ log_data["port"] = std::to_string(sparams.port);
if (sparams.api_keys.size() == 1) {
log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4);
@@ -3074,19 +2865,23 @@ int main(int argc, char **argv)
}
// load the model
- if (!llama.load_model(params))
- {
+ if (!ctx_server.load_model(params)) {
state.store(SERVER_STATE_ERROR);
return 1;
} else {
- llama.initialize();
+ ctx_server.initialize();
state.store(SERVER_STATE_READY);
- LOG_INFO("model loaded", {});
}
+ LOG_INFO("model loaded", {});
+
+ const auto model_meta = ctx_server.model_meta();
+
if (sparams.chat_template.empty()) { // custom chat template is not supplied
- // check if the template comes with the model is supported by us
- llama.validate_model_chat_template(sparams);
+ if (!ctx_server.validate_model_chat_template()) {
+ LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+ sparams.chat_template = "chatml";
+ }
}
// Middleware for API key validation
@@ -3098,6 +2893,7 @@ int main(int argc, char **argv)
// Check for API key in the header
auto auth_header = req.get_header_value("Authorization");
+
std::string prefix = "Bearer ";
if (auth_header.substr(0, prefix.size()) == prefix) {
std::string received_api_key = auth_header.substr(prefix.size());
@@ -3116,185 +2912,177 @@ int main(int argc, char **argv)
};
// this is only called if no index.html is found in the public --path
- svr.Get("/", [](const httplib::Request &, httplib::Response &res)
- {
- res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html; charset=utf-8");
- return false;
- });
+ svr.Get("/", [](const httplib::Request &, httplib::Response & res) {
+ res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html; charset=utf-8");
+ return false;
+ });
// this is only called if no index.js is found in the public --path
- svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res)
- {
- res.set_content(reinterpret_cast<const char*>(&index_js), index_js_len, "text/javascript; charset=utf-8");
- return false;
- });
+ svr.Get("/index.js", [](const httplib::Request &, httplib::Response & res) {
+ res.set_content(reinterpret_cast<const char*>(&index_js), index_js_len, "text/javascript; charset=utf-8");
+ return false;
+ });
// this is only called if no index.html is found in the public --path
- svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res)
- {
- res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript; charset=utf-8");
- return false;
- });
+ svr.Get("/completion.js", [](const httplib::Request &, httplib::Response & res) {
+ res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript; charset=utf-8");
+ return false;
+ });
// this is only called if no index.html is found in the public --path
- svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res)
- {
- res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8");
- return false;
- });
+ svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response & res) {
+ res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8");
+ return false;
+ });
- svr.Get("/props", [&llama](const httplib::Request & req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- json data = {
- { "user_name", llama.name_user.c_str() },
- { "assistant_name", llama.name_assistant.c_str() },
- { "default_generation_settings", llama.default_generation_settings_for_props },
- { "total_slots", llama.params.n_parallel }
- };
- res.set_content(data.dump(), "application/json; charset=utf-8");
- });
+ svr.Get("/props", [&ctx_server](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ json data = {
+ { "user_name", ctx_server.name_user.c_str() },
+ { "assistant_name", ctx_server.name_assistant.c_str() },
+ { "default_generation_settings", ctx_server.default_generation_settings_for_props },
+ { "total_slots", ctx_server.params.n_parallel }
+ };
- svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- if (!validate_api_key(req, res)) {
- return;
- }
- json data;
- try {
- data = json::parse(req.body);
- } catch(const json::exception & json_err) {
- const auto err = llama_error("request.invalid_json", std::string("Invalid JSON: ") + json_err.what());
- const auto err_json = llama_server_context::error_to_json(err).dump();
- res.set_content(err_json, "text/plain; charset=utf-8");
- return;
- }
- const int task_id = llama.queue_tasks.get_new_id();
- llama.queue_results.add_waiting_task_id(task_id);
- llama.request_completion(task_id, data, false, false, -1);
- if (!json_value(data, "stream", false)) {
- std::string completion_text;
- task_result result = llama.queue_results.recv(task_id);
- if (!result.error && result.stop) {
- res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
- }
- else
- {
- res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
- }
- llama.queue_results.remove_waiting_task_id(task_id);
- } else {
- const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink)
- {
- while (true)
- {
- task_result result = llama.queue_results.recv(task_id);
- if (!result.error) {
- const std::string str =
- "data: " +
- result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
- "\n\n";
- LOG_VERBOSE("data stream", {
- { "to_send", str }
- });
- if (!sink.write(str.c_str(), str.size()))
- {
- llama.queue_results.remove_waiting_task_id(task_id);
- return false;
- }
- if (result.stop) {
- break;
- }
- } else {
- const std::string str =
- "error: " +
- result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
- "\n\n";
- LOG_VERBOSE("data stream", {
- { "to_send", str }
- });
- if (!sink.write(str.c_str(), str.size()))
- {
- llama.queue_results.remove_waiting_task_id(task_id);
- return false;
- }
- break;
- }
- }
+ res.set_content(data.dump(), "application/json; charset=utf-8");
+ });
- llama.queue_results.remove_waiting_task_id(task_id);
- sink.done();
- return true;
- };
-
- auto on_complete = [task_id, &llama] (bool)
- {
- // cancel
- llama.request_cancel(task_id);
- llama.queue_results.remove_waiting_task_id(task_id);
- };
-
- res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
- }
- });
-
- svr.Get("/v1/models", [¶ms](const httplib::Request& req, httplib::Response& res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- std::time_t t = std::time(0);
-
- json models = {
- {"object", "list"},
- {"data", {
- {
- {"id", params.model_alias},
- {"object", "model"},
- {"created", t},
- {"owned_by", "llamacpp"}
- },
- }}
- };
-
- res.set_content(models.dump(), "application/json; charset=utf-8");
- });
-
- const auto chat_completions = [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
- {
+ const auto completions = [&ctx_server, &validate_api_key](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
if (!validate_api_key(req, res)) {
return;
}
- json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template);
- const int task_id = llama.queue_tasks.get_new_id();
- llama.queue_results.add_waiting_task_id(task_id);
- llama.request_completion(task_id, data, false, false, -1);
+ json data = json::parse(req.body);
+
+ const int id_task = ctx_server.queue_tasks.get_new_id();
+
+ ctx_server.queue_results.add_waiting_task_id(id_task);
+ ctx_server.request_completion(id_task, -1, data, false, false);
if (!json_value(data, "stream", false)) {
- std::string completion_text;
- task_result result = llama.queue_results.recv(task_id);
-
+ server_task_result result = ctx_server.queue_results.recv(id_task);
if (!result.error && result.stop) {
- json oaicompat_result = format_final_response_oaicompat(data, result);
-
- res.set_content(oaicompat_result.dump(-1, ' ', false,
- json::error_handler_t::replace),
- "application/json; charset=utf-8");
+ res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
} else {
res.status = 500;
- res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
+ res.set_content(result.data["content"], "text/plain; charset=utf-8");
}
- llama.queue_results.remove_waiting_task_id(task_id);
- } else {
- const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
- while (true) {
- task_result llama_result = llama.queue_results.recv(task_id);
- if (!llama_result.error) {
- std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
- for (auto it = result_array.begin(); it != result_array.end(); ++it)
- {
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ } else {
+ const auto chunked_content_provider = [id_task, &ctx_server](size_t, httplib::DataSink & sink) {
+ while (true) {
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ if (!result.error) {
+ const std::string str =
+ "data: " +
+ result.data.dump(-1, ' ', false, json::error_handler_t::replace) +
+ "\n\n";
+
+ LOG_VERBOSE("data stream", {
+ { "to_send", str }
+ });
+
+ if (!sink.write(str.c_str(), str.size())) {
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ return false;
+ }
+
+ if (result.stop) {
+ break;
+ }
+ } else {
+ const std::string str =
+ "error: " +
+ result.data.dump(-1, ' ', false, json::error_handler_t::replace) +
+ "\n\n";
+
+ LOG_VERBOSE("data stream", {
+ { "to_send", str }
+ });
+
+ if (!sink.write(str.c_str(), str.size())) {
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ return false;
+ }
+
+ break;
+ }
+ }
+
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ sink.done();
+
+ return true;
+ };
+
+ auto on_complete = [id_task, &ctx_server] (bool) {
+ // cancel
+ ctx_server.request_cancel(id_task);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ };
+
+ res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
+ }
+ };
+
+ svr.Post("/completion", completions); // legacy
+ svr.Post("/completions", completions);
+ svr.Post("/v1/completions", completions);
+
+ svr.Get("/v1/models", [¶ms, &model_meta](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+
+ json models = {
+ {"object", "list"},
+ {"data", {
+ {
+ {"id", params.model_alias},
+ {"object", "model"},
+ {"created", std::time(0)},
+ {"owned_by", "llamacpp"},
+ {"meta", model_meta}
+ },
+ }}
+ };
+
+ res.set_content(models.dump(), "application/json; charset=utf-8");
+ });
+
+ const auto chat_completions = [&ctx_server, &validate_api_key, &sparams](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ if (!validate_api_key(req, res)) {
+ return;
+ }
+
+ json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), sparams.chat_template);
+
+ const int id_task = ctx_server.queue_tasks.get_new_id();
+
+ ctx_server.queue_results.add_waiting_task_id(id_task);
+ ctx_server.request_completion(id_task, -1, data, false, false);
+
+ if (!json_value(data, "stream", false)) {
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+
+ if (!result.error && result.stop) {
+ json result_oai = format_final_response_oaicompat(data, result.data);
+
+ res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
+ } else {
+ res.status = 500;
+ res.set_content(result.data["content"], "text/plain; charset=utf-8");
+ }
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ } else {
+ const auto chunked_content_provider = [id_task, &ctx_server](size_t, httplib::DataSink & sink) {
+ while (true) {
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ if (!result.error) {
+ std::vector<json> result_array = format_partial_response_oaicompat(result.data);
+
+ for (auto it = result_array.begin(); it != result_array.end(); ++it) {
if (!it->empty()) {
const std::string str =
"data: " +
@@ -3302,281 +3090,271 @@ int main(int argc, char **argv)
"\n\n";
LOG_VERBOSE("data stream", {{"to_send", str}});
if (!sink.write(str.c_str(), str.size())) {
- llama.queue_results.remove_waiting_task_id(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
return false;
}
}
}
- if (llama_result.stop) {
+ if (result.stop) {
break;
}
} else {
const std::string str =
"error: " +
- llama_result.result_json.dump(-1, ' ', false,
- json::error_handler_t::replace) +
+ result.data.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n";
LOG_VERBOSE("data stream", {{"to_send", str}});
if (!sink.write(str.c_str(), str.size())) {
- llama.queue_results.remove_waiting_task_id(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
return false;
}
break;
}
}
sink.done();
- llama.queue_results.remove_waiting_task_id(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
return true;
};
- auto on_complete = [task_id, &llama](bool) {
+ auto on_complete = [id_task, &ctx_server](bool) {
// cancel request
- llama.request_cancel(task_id);
- llama.queue_results.remove_waiting_task_id(task_id);
+ ctx_server.request_cancel(id_task);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
};
res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
}
};
- svr.Post("/chat/completions", chat_completions);
+ svr.Post("/chat/completions", chat_completions);
svr.Post("/v1/chat/completions", chat_completions);
- svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- if (!validate_api_key(req, res)) {
- return;
- }
- json data = json::parse(req.body);
- const int task_id = llama.queue_tasks.get_new_id();
- llama.queue_results.add_waiting_task_id(task_id);
- llama.request_completion(task_id, data, true, false, -1);
- if (!json_value(data, "stream", false)) {
- std::string completion_text;
- task_result result = llama.queue_results.recv(task_id);
- if (!result.error && result.stop)
- {
- res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
- }
- else
- {
- res.status = 404;
- res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
- }
- llama.queue_results.remove_waiting_task_id(task_id);
- } else {
- const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) {
- while (true)
- {
- task_result result = llama.queue_results.recv(task_id);
- if (!result.error) {
- const std::string str =
- "data: " +
- result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
- "\n\n";
- LOG_VERBOSE("data stream", {
- { "to_send", str }
- });
- if (!sink.write(str.c_str(), str.size()))
- {
- llama.queue_results.remove_waiting_task_id(task_id);
- return false;
- }
- if (result.stop)
- {
- break;
- }
- }
- else
- {
- break;
- }
- }
-
- llama.queue_results.remove_waiting_task_id(task_id);
- sink.done();
- return true;
- };
-
- auto on_complete = [task_id, &llama] (bool)
- {
- // cancel
- llama.request_cancel(task_id);
- };
-
- res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
- }
- });
-
- svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res)
- { return res.set_content("", "application/json; charset=utf-8"); });
-
- svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- const json body = json::parse(req.body);
- std::vector<llama_token> tokens;
- if (body.count("content") != 0)
- {
- tokens = llama.tokenize(body["content"], false);
- }
- const json data = format_tokenizer_response(tokens);
- return res.set_content(data.dump(), "application/json; charset=utf-8");
- });
-
- svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- const json body = json::parse(req.body);
- std::string content;
- if (body.count("tokens") != 0)
- {
- const std::vector<llama_token> tokens = body["tokens"];
- content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
- }
-
- const json data = format_detokenized_response(content);
- return res.set_content(data.dump(), "application/json; charset=utf-8");
- });
-
- svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- const json body = json::parse(req.body);
- json prompt;
- if (body.count("content") != 0)
- {
- prompt = body["content"];
- }
- else
- {
- prompt = "";
- }
-
- json image_data;
- if (body.count("image_data") != 0) {
- image_data = body["image_data"];
- }
- else
- {
- image_data = "";
- }
-
- // create and queue the task
- const int task_id = llama.queue_tasks.get_new_id();
- llama.queue_results.add_waiting_task_id(task_id);
- llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
-
- // get the result
- task_result result = llama.queue_results.recv(task_id);
- llama.queue_results.remove_waiting_task_id(task_id);
-
- // send the result
- return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
- });
-
- svr.Post("/v1/embeddings", [&llama](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- const json body = json::parse(req.body);
-
- json prompt;
- if (body.count("input") != 0)
- {
- prompt = body["input"];
- // batch
- if(prompt.is_array()) {
- json data = json::array();
- int i = 0;
- for (const json &elem : prompt) {
- const int task_id = llama.queue_tasks.get_new_id();
- llama.queue_results.add_waiting_task_id(task_id);
- llama.request_completion(task_id, { {"prompt", elem}, { "n_predict", 0} }, false, true, -1);
-
- // get the result
- task_result result = llama.queue_results.recv(task_id);
- llama.queue_results.remove_waiting_task_id(task_id);
-
- json embedding = json{
- {"embedding", json_value(result.result_json, "embedding", json::array())},
- {"index", i++},
- {"object", "embedding"}
- };
- data.push_back(embedding);
- }
- json result = format_embeddings_response_oaicompat(body, data);
- return res.set_content(result.dump(), "application/json; charset=utf-8");
- }
- }
- else
- {
- prompt = "";
- }
-
- // create and queue the task
- const int task_id = llama.queue_tasks.get_new_id();
- llama.queue_results.add_waiting_task_id(task_id);
- llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}}, false, true, -1);
-
- // get the result
- task_result result = llama.queue_results.recv(task_id);
- llama.queue_results.remove_waiting_task_id(task_id);
-
- json data = json::array({json{
- {"embedding", json_value(result.result_json, "embedding", json::array())},
- {"index", 0},
- {"object", "embedding"}
- }}
- );
-
- json root = format_embeddings_response_oaicompat(body, data);
-
- // send the result
- return res.set_content(root.dump(), "application/json; charset=utf-8");
- });
-
- // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
- // "Bus error: 10" - this is on macOS, it does not crash on Linux
- //std::thread t2([&]()
- /*{
- bool running = true;
- while (running)
- {
- running = llama.update_slots();
+ svr.Post("/infill", [&ctx_server, &validate_api_key](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ if (!validate_api_key(req, res)) {
+ return;
}
- }*/
- //);
+
+ json data = json::parse(req.body);
+
+ const int id_task = ctx_server.queue_tasks.get_new_id();
+
+ ctx_server.queue_results.add_waiting_task_id(id_task);
+ ctx_server.request_completion(id_task, -1, data, true, false);
+
+ if (!json_value(data, "stream", false)) {
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ if (!result.error && result.stop) {
+ res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
+ } else {
+ res.status = 404;
+ res.set_content(result.data["content"], "text/plain; charset=utf-8");
+ }
+
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ } else {
+ const auto chunked_content_provider = [id_task, &ctx_server](size_t, httplib::DataSink & sink) {
+ while (true) {
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ if (!result.error) {
+ const std::string str =
+ "data: " +
+ result.data.dump(-1, ' ', false, json::error_handler_t::replace) +
+ "\n\n";
+
+ LOG_VERBOSE("data stream", {
+ { "to_send", str }
+ });
+
+ if (!sink.write(str.c_str(), str.size())) {
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ return false;
+ }
+
+ if (result.stop) {
+ break;
+ }
+ } else {
+ break;
+ }
+ }
+
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ sink.done();
+
+ return true;
+ };
+
+ auto on_complete = [id_task, &ctx_server] (bool) {
+ ctx_server.request_cancel(id_task);
+ };
+
+ res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
+ }
+ });
+
+ svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response & res) {
+ return res.set_content("", "application/json; charset=utf-8");
+ });
+
+ svr.Post("/tokenize", [&ctx_server](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ const json body = json::parse(req.body);
+
+ std::vector<llama_token> tokens;
+ if (body.count("content") != 0) {
+ tokens = ctx_server.tokenize(body["content"], false);
+ }
+ const json data = format_tokenizer_response(tokens);
+ return res.set_content(data.dump(), "application/json; charset=utf-8");
+ });
+
+ svr.Post("/detokenize", [&ctx_server](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ const json body = json::parse(req.body);
+
+ std::string content;
+ if (body.count("tokens") != 0) {
+ const std::vector<llama_token> tokens = body["tokens"];
+ content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
+ }
+
+ const json data = format_detokenized_response(content);
+ return res.set_content(data.dump(), "application/json; charset=utf-8");
+ });
+
+ svr.Post("/embedding", [¶ms, &ctx_server](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ if (!params.embedding) {
+ res.status = 501;
+ res.set_content("This server does not support embeddings. Start it with `--embeddings`", "text/plain; charset=utf-8");
+ return;
+ }
+
+ const json body = json::parse(req.body);
+
+ json prompt;
+ if (body.count("content") != 0) {
+ prompt = body["content"];
+ } else {
+ prompt = "";
+ }
+
+ // create and queue the task
+ const int id_task = ctx_server.queue_tasks.get_new_id();
+
+ ctx_server.queue_results.add_waiting_task_id(id_task);
+ ctx_server.request_completion(id_task, -1, { {"prompt", prompt}, { "n_predict", 0} }, false, true);
+
+ // get the result
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+ // send the result
+ return res.set_content(result.data.dump(), "application/json; charset=utf-8");
+ });
+
+ svr.Post("/v1/embeddings", [¶ms, &ctx_server](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ if (!params.embedding) {
+ res.status = 501;
+ res.set_content("This server does not support embeddings. Start it with `--embeddings`", "text/plain; charset=utf-8");
+ return;
+ }
+
+ const json body = json::parse(req.body);
+
+ json prompt;
+ if (body.count("input") != 0) {
+ prompt = body["input"];
+ if (prompt.is_array()) {
+ json data = json::array();
+
+ int i = 0;
+ for (const json & elem : prompt) {
+ const int id_task = ctx_server.queue_tasks.get_new_id();
+
+ ctx_server.queue_results.add_waiting_task_id(id_task);
+ ctx_server.request_completion(id_task, -1, { {"prompt", elem}, { "n_predict", 0} }, false, true);
+
+ // get the result
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+ json embedding = json{
+ {"embedding", json_value(result.data, "embedding", json::array())},
+ {"index", i++},
+ {"object", "embedding"}
+ };
+
+ data.push_back(embedding);
+ }
+
+ json result = format_embeddings_response_oaicompat(body, data);
+
+ return res.set_content(result.dump(), "application/json; charset=utf-8");
+ }
+ } else {
+ prompt = "";
+ }
+
+ // create and queue the task
+ const int id_task = ctx_server.queue_tasks.get_new_id();
+
+ ctx_server.queue_results.add_waiting_task_id(id_task);
+ ctx_server.request_completion(id_task, -1, { {"prompt", prompt}, { "n_predict", 0}}, false, true);
+
+ // get the result
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+ json data = json::array({json{
+ {"embedding", json_value(result.data, "embedding", json::array())},
+ {"index", 0},
+ {"object", "embedding"}
+ }}
+ );
+
+ json root = format_embeddings_response_oaicompat(body, data);
+
+ return res.set_content(root.dump(), "application/json; charset=utf-8");
+ });
+
+ if (sparams.n_threads_http < 1) {
+ // +2 threads for monitoring endpoints
+ sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+ }
+ log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
+ svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
LOG_INFO("HTTP server listening", log_data);
+
// run the HTTP server in a thread - see comment below
- std::thread t([&]()
- {
- if (!svr.listen_after_bind())
- {
- state.store(SERVER_STATE_ERROR);
- return 1;
- }
+ std::thread t([&]() {
+ if (!svr.listen_after_bind()) {
+ state.store(SERVER_STATE_ERROR);
+ return 1;
+ }
- return 0;
- });
+ return 0;
+ });
- llama.queue_tasks.on_new_task(std::bind(
- &llama_server_context::process_single_task, &llama, std::placeholders::_1));
- llama.queue_tasks.on_finish_multitask(std::bind(
- &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
- llama.queue_tasks.on_all_tasks_finished(std::bind(
- &llama_server_context::run_on_all_tasks_finished, &llama));
- llama.queue_results.on_multitask_update(std::bind(
- &llama_server_queue::update_multitask,
- &llama.queue_tasks,
+ ctx_server.queue_tasks.on_new_task(std::bind(
+ &server_context::process_single_task, &ctx_server, std::placeholders::_1));
+ ctx_server.queue_tasks.on_finish_multitask(std::bind(
+ &server_context::on_finish_multitask, &ctx_server, std::placeholders::_1));
+ ctx_server.queue_tasks.on_run_slots(std::bind(
+ &server_context::update_slots, &ctx_server));
+ ctx_server.queue_results.on_multitask_update(std::bind(
+ &server_queue::update_multitask,
+ &ctx_server.queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3
));
shutdown_handler = [&](int) {
- llama.queue_tasks.terminate();
+ ctx_server.queue_tasks.terminate();
};
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
@@ -3591,10 +3369,13 @@ int main(int argc, char **argv)
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
- llama.queue_tasks.start_loop();
+
+ ctx_server.queue_tasks.start_loop();
+
svr.stop();
t.join();
llama_backend_free();
+
return 0;
}
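For illustration only (not part of the patch): a minimal Python client for the routes registered above, assuming a server built from this branch is already listening on `localhost:8080` with a model loaded; the prompt text and `n_predict` value are arbitrary.

```python
import asyncio

import aiohttp  # same HTTP client the server tests use

BASE_URL = "http://localhost:8080"  # assumption: default host/port


async def main():
    async with aiohttp.ClientSession() as session:
        # /completion, /completions and /v1/completions now share one handler;
        # a non-streaming request returns a single JSON object with "content".
        payload = {
            "prompt": "Building a website can be done in 10 simple steps:",
            "n_predict": 16,
            "stream": False,
        }
        async with session.post(f"{BASE_URL}/completion", json=payload) as resp:
            result = await resp.json()
            print(result["content"])

        # /v1/models now also exposes model metadata under "meta".
        async with session.get(f"{BASE_URL}/v1/models") as resp:
            models = await resp.json()
            print(models["data"][0]["id"], models["data"][0]["meta"])


asyncio.run(main())
```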
diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md
index 0b9fdc4e7..95a0353b6 100644
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -1,22 +1,30 @@
# Server tests
-Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) and [behave](https://behave.readthedocs.io/en/latest/):
- * [issues.feature](./features/issues.feature) Pending issues scenario
- * [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
- * [security.feature](./features/security.feature) Security, CORS and API Key
- * [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
+Python-based server test scenarios using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development)
+and [behave](https://behave.readthedocs.io/en/latest/):
+
+* [issues.feature](./features/issues.feature) Pending issues scenario
+* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
+* [security.feature](./features/security.feature) Security, CORS and API Key
+* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
Tests target GitHub workflows job runners with 4 vCPU.
-Requests are using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html) based http client.
+Requests are made with an [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html),
+[asyncio](https://docs.python.org/fr/3/library/asyncio.html) based HTTP client.
-Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail. To mitigate it, you can increase values in `n_predict`, `kv_size`.
+Note: If inference on the host is faster than on the GitHub runners, the parallel scenario may randomly fail.
+To mitigate this, you can increase the `n_predict` and `kv_size` values.
### Install dependencies
+
`pip install -r requirements.txt`
### Run tests
+
1. Build the server
+
```shell
cd ../../..
mkdir build
@@ -24,24 +32,36 @@ cd build
cmake ../
cmake --build . --target server
```
-2. download required models:
- 1. `../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf`
-3. Start the test: `./tests.sh`
+
+2. Start the test: `./tests.sh`
It's possible to override some scenario steps values with environment variables:
- - `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080`
- - `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server`
- - `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose`
- - `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format
+
+| variable | description |
+|--------------------------|------------------------------------------------------------------------------------------------|
+| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
+| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/server` |
+| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
+| `SERVER_LOG_FORMAT_JSON` | if set, switches server logs to JSON format                                                     |
+| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
### Run @bug, @wip or @wrong_usage annotated scenario
Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
+
- `@bug` annotation aims to link a scenario with a GitHub issue.
- `@wrong_usage` are meant to show user issues that are actually expected behavior
- `@wip` to focus on a scenario working in progress
+- `@slow` heavy test, disabled by default
To run a scenario annotated with `@bug`, start:
-`DEBUG=ON ./tests.sh --no-skipped --tags bug`
+
+```shell
+DEBUG=ON ./tests.sh --no-skipped --tags bug
+```
After changing logic in `steps.py`, ensure that the `@bug` and `@wrong_usage` scenarios are updated.
+
+```shell
+./tests.sh --no-skipped --tags bug,wrong_usage || echo "expected to fail"
+```
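For illustration only (not part of the patch): how the overrides documented in the table above reach the test harness, mirroring `environment.py` and `steps.py` below; the fallback values are the defaults stated in the table, except the `0` for `N_GPU_LAYERS`, which is only a placeholder for this sketch.

```python
import os

# DEBUG=ON enables verbose steps and passes --verbose to the server
debug = os.environ.get("DEBUG") == "ON"

# PORT overrides context.server_port
port = int(os.environ.get("PORT", "8080"))

# LLAMA_SERVER_BIN_PATH overrides the server binary location
server_bin = os.environ.get("LLAMA_SERVER_BIN_PATH", "../../../build/bin/server")

# N_GPU_LAYERS overrides the <ngl> value used by the GPU-offload step
n_gpu_layers = int(os.environ.get("N_GPU_LAYERS", "0"))

print(f"server={server_bin} port={port} debug={debug} ngl={n_gpu_layers}")
```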
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature
new file mode 100644
index 000000000..b47661e94
--- /dev/null
+++ b/examples/server/tests/features/embeddings.feature
@@ -0,0 +1,94 @@
+@llama.cpp
+@embeddings
+Feature: llama.cpp server
+
+ Background: Server startup
+ Given a server listening on localhost:8080
+ And a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
+ And a model alias bert-bge-small
+ And 42 as server seed
+ And 2 slots
+ And 1024 as batch size
+ And 2048 KV cache size
+ And embeddings extraction
+ Then the server is starting
+ Then the server is healthy
+
+ Scenario: Embedding
+ When embeddings are computed for:
+ """
+ What is the capital of Bulgaria ?
+ """
+ Then embeddings are generated
+
+ Scenario: OAI Embeddings compatibility
+ Given a model bert-bge-small
+ When an OAI compatible embeddings computation request for:
+ """
+ What is the capital of Spain ?
+ """
+ Then embeddings are generated
+
+ Scenario: OAI Embeddings compatibility with multiple inputs
+ Given a model bert-bge-small
+ Given a prompt:
+ """
+ In which country Paris is located ?
+ """
+ And a prompt:
+ """
+ Is Madrid the capital of Spain ?
+ """
+ When an OAI compatible embeddings computation request for multiple inputs
+ Then embeddings are generated
+
+ Scenario: Multi users embeddings
+ Given a prompt:
+ """
+ Write a very long story about AI.
+ """
+ And a prompt:
+ """
+ Write another very long music lyrics.
+ """
+ And a prompt:
+ """
+ Write a very long poem.
+ """
+ And a prompt:
+ """
+ Write a very long joke.
+ """
+ Given concurrent embedding requests
+ Then the server is busy
+ Then the server is idle
+ Then all embeddings are generated
+
+ Scenario: Multi users OAI compatibility embeddings
+ Given a prompt:
+ """
+ In which country Paris is located ?
+ """
+ And a prompt:
+ """
+ Is Madrid the capital of Spain ?
+ """
+ And a prompt:
+ """
+ What is the biggest US city ?
+ """
+ And a prompt:
+ """
+ What is the capital of Bulgaria ?
+ """
+ And a model bert-bge-small
+ Given concurrent OAI embedding requests
+ Then the server is busy
+ Then the server is idle
+ Then all embeddings are generated
+
+ Scenario: All embeddings should be the same
+ Given 10 fixed prompts
+ And a model bert-bge-small
+ Given concurrent OAI embedding requests
+ Then all embeddings are the same
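For illustration only (not part of the patch): what the "OAI Embeddings compatibility with multiple inputs" scenario above exercises, assuming the server was started with embeddings enabled, the alias `bert-bge-small`, and the default `localhost:8080` address.

```python
import asyncio

import aiohttp


async def main():
    async with aiohttp.ClientSession() as session:
        payload = {
            "input": [
                "In which country Paris is located ?",
                "Is Madrid the capital of Spain ?",
            ],
            "model": "bert-bge-small",
        }
        # OpenAI-compatible endpoint registered by the server changes above
        async with session.post("http://localhost:8080/v1/embeddings", json=payload) as resp:
            assert resp.status == 200
            data = (await resp.json())["data"]
            # one embedding vector per input, in order
            for item in data:
                print(item["index"], len(item["embedding"]))


asyncio.run(main())
```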
diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py
index 09e826747..9fd330db6 100644
--- a/examples/server/tests/features/environment.py
+++ b/examples/server/tests/features/environment.py
@@ -7,7 +7,10 @@ from signal import SIGKILL
def before_scenario(context, scenario):
- print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
+ context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
+ if context.debug:
+ print("DEBUG=ON\n")
+ print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m\n")
port = 8080
if 'PORT' in os.environ:
port = int(os.environ['PORT'])
diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature
index bf5a175a3..7b13e44ca 100644
--- a/examples/server/tests/features/issues.feature
+++ b/examples/server/tests/features/issues.feature
@@ -1,4 +1,5 @@
# List of ongoing issues
+# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
@bug
Feature: Issues
# No confirmed issue at the moment
diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature
index 5f895cf90..066698c8e 100644
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -1,14 +1,14 @@
@llama.cpp
+@parallel
Feature: Parallel
Background: Server startup
Given a server listening on localhost:8080
- And a model file stories260K.gguf
- And a model alias tinyllama-2
+ And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And 42 as server seed
+ And 512 as batch size
And 64 KV cache size
And 2 slots
- And embeddings extraction
And continuous batching
Then the server is starting
Then the server is healthy
@@ -98,48 +98,3 @@ Feature: Parallel
Then the server is busy
Then the server is idle
Then all prompts are predicted
-
- Scenario: Multi users embeddings
- Given a prompt:
- """
- Write a very long story about AI.
- """
- And a prompt:
- """
- Write another very long music lyrics.
- """
- And a prompt:
- """
- Write a very long poem.
- """
- And a prompt:
- """
- Write a very long joke.
- """
- Given concurrent embedding requests
- Then the server is busy
- Then the server is idle
- Then all embeddings are generated
-
- Scenario: Multi users OAI compatibility embeddings
- Given a prompt:
- """
- In which country Paris is located ?
- """
- And a prompt:
- """
- Is Madrid the capital of Spain ?
- """
- And a prompt:
- """
- What is the biggest US city ?
- """
- And a prompt:
- """
- What is the capital of Bulgaria ?
- """
- And a model tinyllama-2
- Given concurrent OAI embedding requests
- Then the server is busy
- Then the server is idle
- Then all embeddings are generated
diff --git a/examples/server/tests/features/passkey.feature b/examples/server/tests/features/passkey.feature
new file mode 100644
index 000000000..1bde7aab8
--- /dev/null
+++ b/examples/server/tests/features/passkey.feature
@@ -0,0 +1,55 @@
+# run with: ./tests.sh --no-skipped --tags passkey
+@passkey
+@slow
+Feature: Passkey / Self-extend with context shift
+
+ Background: Server startup
+ Given a server listening on localhost:8080
+
+ # Generates a long text of junk and inserts a secret passkey number inside it.
+ # Then we query the LLM for the secret passkey.
+ # see #3856 and #4810
+ Scenario Outline: Passkey
+ Given a model file <hf_file> from HF repo <hf_repo>
+ And <n_batch> as batch size
+ And <n_junk> as number of junk
+ And <n_predicted> server max tokens to predict
+ And 42 as seed
+ And <n_ctx> KV cache size
+ And 1 slots
+ And <n_ga> group attention factor to extend context size through self-extend
+ And <n_ga_w> group attention width to extend context size through self-extend
+ # Can be overridden with N_GPU_LAYERS
+ And <ngl> GPU offloaded layers
+ Then the server is starting
+ Then the server is healthy
+ Given available models
+ Then model 0 is trained on <n_ctx_train> tokens context
+ Given a prefix prompt:
+ """
+ here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.
+ """
+ And a passkey prompt template:
+ """
+ The pass key is <passkey> Remember it. <passkey> is the pass key.
+ """
+ And a junk suffix prompt:
+ """
+ The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.
+ """
+ And a suffix prompt:
+ """
+ What is the pass key? The pass key is
+ """
+ Given a "" passkey challenge prompt with the passkey inserted every junk
+ And a completion request with no api error
+ Then tokens are predicted matching
+
+ Examples:
+ | hf_repo | hf_file | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content |
+ | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 8192 | 512 | 4 | 512 | 250 | 50 | 42 | 1 | 42 |
+ | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 8192 | 512 | 2 | 512 | 250 | 50 | 42 | 1 | \b((?!42)\w)+\b |
+ #| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 |
+ #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0987 |
+
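For illustration only (not part of the patch): a sketch of how the passkey challenge prompt is assembled from the prefix, passkey template, junk and suffix defined above, mirroring `step_prompt_passkey` in `steps.py` further down in this diff; the concrete numbers follow the first Examples row.

```python
# Prompt pieces quoted from the scenario docstrings above.
prefix = ("here is an important info hidden inside a lot of irrelevant text. "
          "Find it and memorize them. I will quiz you about the important information there.")
passkey_template = "The pass key is {passkey} Remember it. {passkey} is the pass key."
junk = ("The grass is green. The sky is blue. The sun is yellow. "
        "Here we go. There and back again.")
suffix = "What is the pass key? The pass key is"

# Values from the first Examples row.
passkey, n_junk, i_pos = "42", 250, 50

body = ""
for i in range(n_junk):
    if i == i_pos:  # insert the (already substituted) passkey at position i_pos
        body += passkey_template.format(passkey=passkey)
    body += junk

challenge = prefix + body + suffix  # sent to the server as one completion request
print(f"{len(challenge)} characters, passkey hidden after {i_pos} junk blocks")
```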
diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature
index db06d3977..42a6709a5 100644
--- a/examples/server/tests/features/security.feature
+++ b/examples/server/tests/features/security.feature
@@ -1,9 +1,10 @@
@llama.cpp
+@security
Feature: Security
Background: Server startup with an api key defined
Given a server listening on localhost:8080
- And a model file stories260K.gguf
+ And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a server api key llama.cpp
Then the server is starting
Then the server is healthy
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index b571582a7..878ac1363 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -1,15 +1,17 @@
@llama.cpp
+@server
Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
- And a model file stories260K.gguf
+ And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model alias tinyllama-2
And 42 as server seed
# KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130
# see --ctx-size and #5568
And 32 KV cache size
+ And 512 as batch size
And 1 slots
And embeddings extraction
And 32 server max tokens to predict
@@ -27,11 +29,12 @@ Feature: llama.cpp server
And a completion request with no api error
Then tokens are predicted matching
And prometheus metrics are exposed
+ And metric llamacpp:tokens_predicted is <n_predicted>
Examples: Prompts
- | prompt | n_predict | re_content | n_predicted |
- | I believe the meaning of life is | 8 | (read\|going)+ | 8 |
- | Write a joke about AI | 64 | (park\|friends\|scared\|always)+ | 32 |
+ | prompt | n_predict | re_content | n_predicted |
+ | I believe the meaning of life is | 8 | (read\|going)+ | 8 |
+ | Write a joke about AI | 64 | (park\|friends\|scared\|always)+ | 32 |
Scenario Outline: OAI Compatibility
Given a model
@@ -43,38 +46,9 @@ Feature: llama.cpp server
Then tokens are predicted matching
Examples: Prompts
- | model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
- | llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled |
- | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled |
-
- Scenario: Embedding
- When embeddings are computed for:
- """
- What is the capital of Bulgaria ?
- """
- Then embeddings are generated
-
- Scenario: OAI Embeddings compatibility
- Given a model tinyllama-2
- When an OAI compatible embeddings computation request for:
- """
- What is the capital of Spain ?
- """
- Then embeddings are generated
-
- Scenario: OAI Embeddings compatibility with multiple inputs
- Given a model tinyllama-2
- Given a prompt:
- """
- In which country Paris is located ?
- """
- And a prompt:
- """
- Is Madrid the capital of Spain ?
- """
- When an OAI compatible embeddings computation request for multiple inputs
- Then embeddings are generated
-
+ | model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
+ | llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled |
+ | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled |
Scenario: Tokenize / Detokenize
When tokenizing:
@@ -82,3 +56,9 @@ Feature: llama.cpp server
What is the capital of France ?
"""
Then tokens can be detokenize
+
+ Scenario: Models available
+ Given available models
+ Then 1 models are supported
+ Then model 0 is identified by tinyllama-2
+ Then model 0 is trained on 128 tokens context
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 381da105e..d7f005836 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -10,9 +10,11 @@ from contextlib import closing
from re import RegexFlag
import aiohttp
+import numpy as np
import openai
from behave import step
from behave.api.async_step import async_run_until_complete
+from huggingface_hub import hf_hub_download
from prometheus_client import parser
@@ -23,20 +25,30 @@ def step_server_config(context, server_fqdn, server_port):
if 'PORT' in os.environ:
context.server_port = int(os.environ['PORT'])
print(f"$PORT set, overriding server port with to {context.server_port}")
+ if 'FQDN' in os.environ:
+ context.server_fqdn = os.environ['FQDN']
+ print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}")
context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
- context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
context.model_alias = None
+ context.n_batch = None
context.n_ctx = None
+ context.n_ga = None
+ context.n_ga_w = None
+ context.n_gpu_layer = None
context.n_predict = None
+ context.n_prompts = 0
context.n_server_predict = None
context.n_slots = None
+ context.prompt_prefix = None
+ context.prompt_suffix = None
context.server_api_key = None
context.server_continuous_batching = False
context.server_embeddings = False
context.server_metrics = False
context.server_process = None
+ context.seed = None
context.server_seed = None
context.user_api_key = None
@@ -45,9 +57,11 @@ def step_server_config(context, server_fqdn, server_port):
context.prompts = []
-@step(u'a model file {model_file}')
-def step_model_file(context, model_file):
- context.model_file = model_file
+@step(u'a model file {hf_file} from HF repo {hf_repo}')
+def step_download_hf_model(context, hf_file, hf_repo):
+ context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
+ if context.debug:
+ print(f"model file: {context.model_file}\n")
@step(u'a model alias {model_alias}')
@@ -55,24 +69,34 @@ def step_model_alias(context, model_alias):
context.model_alias = model_alias
-@step(u'{seed} as server seed')
+@step(u'{seed:d} as server seed')
def step_seed(context, seed):
- context.server_seed = int(seed)
+ context.server_seed = seed
-@step(u'{n_ctx} KV cache size')
+@step(u'{ngl:d} GPU offloaded layers')
+def step_n_gpu_layer(context, ngl):
+ if 'N_GPU_LAYERS' in os.environ:
+ new_ngl = int(os.environ['N_GPU_LAYERS'])
+ if context.debug:
+ print(f"-ngl upgraded from {ngl} to {new_ngl}")
+ ngl = new_ngl
+ context.n_gpu_layer = ngl
+
+
+@step(u'{n_ctx:d} KV cache size')
def step_n_ctx(context, n_ctx):
- context.n_ctx = int(n_ctx)
+ context.n_ctx = n_ctx
-@step(u'{n_slots} slots')
+@step(u'{n_slots:d} slots')
def step_n_slots(context, n_slots):
- context.n_slots = int(n_slots)
+ context.n_slots = n_slots
-@step(u'{n_predict} server max tokens to predict')
+@step(u'{n_predict:d} server max tokens to predict')
def step_server_n_predict(context, n_predict):
- context.n_server_predict = int(n_predict)
+ context.n_server_predict = n_predict
@step(u'continuous batching')
@@ -116,11 +140,13 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
case 'ready' | 'idle':
await wait_for_health_status(context, context.base_url, 200, 'ok',
+ timeout=10,
params={'fail_on_no_slot': 0, 'include_slots': 0},
slots_idle=context.n_slots,
slots_processing=0,
expected_slots=[{'id': slot_id, 'state': 0}
- for slot_id in range(context.n_slots)])
+ for slot_id in
+ range(context.n_slots if context.n_slots else 1)])
case 'busy':
await wait_for_health_status(context, context.base_url, 503,
'no slot available',
@@ -128,7 +154,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
slots_idle=0,
slots_processing=context.n_slots,
expected_slots=[{'id': slot_id, 'state': 1}
- for slot_id in range(context.n_slots)])
+ for slot_id in
+ range(context.n_slots if context.n_slots else 1)])
case _:
assert False, "unknown status"
@@ -157,29 +184,30 @@ async def step_request_completion(context, api_error):
context.base_url,
debug=context.debug,
n_predict=context.n_predict,
- server_seed=context.server_seed,
+ seed=await completions_seed(context),
expect_api_error=expect_api_error,
user_api_key=context.user_api_key)
context.tasks_result.append(completion)
if context.debug:
- print(f"Completion response: {completion}")
+ print(f"Completion response: {completion}\n")
if expect_api_error:
assert completion == 401, f"completion must be an 401 status code: {completion}"
-@step(u'{predicted_n} tokens are predicted matching {re_content}')
+@step(u'{predicted_n:d} tokens are predicted matching {re_content}')
def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
- assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n), re_content)
+ assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n, re_content)
-@step(u'{predicted_n} tokens are predicted')
+@step(u'{predicted_n:d} tokens are predicted')
def step_n_tokens_predicted(context, predicted_n):
- assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n))
+ assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n)
@step(u'a user prompt {user_prompt}')
def step_user_prompt(context, user_prompt):
context.prompts.append(user_prompt)
+ context.n_prompts = len(context.prompts)
@step(u'a system prompt {system_prompt}')
@@ -192,9 +220,9 @@ def step_model(context, model):
context.model = model
-@step(u'{max_tokens} max tokens to predict')
+@step(u'{max_tokens:d} max tokens to predict')
def step_max_tokens(context, max_tokens):
- context.n_predict = int(max_tokens)
+ context.n_predict = max_tokens
@step(u'streaming is {enable_streaming}')
@@ -222,11 +250,77 @@ def step_server_api_key(context, server_api_key):
context.server_api_key = server_api_key
+@step(u'{n_junk:d} as number of junk')
+def step_n_junk(context, n_junk):
+ context.n_junk = n_junk
+
+
+@step(u'{n_batch:d} as batch size')
+def step_n_batch(context, n_batch):
+ context.n_batch = n_batch
+
+
+@step(u'{seed:d} as seed')
+def step_seed(context, seed):
+ context.seed = seed
+
+
+@step(u'a prefix prompt')
+def step_prompt_prefix(context):
+ context.prompt_prefix = context.text
+
+
+@step(u'a junk suffix prompt')
+def step_prompt_junk_suffix(context):
+ context.prompt_junk_suffix = context.text
+
+
+@step(u'a suffix prompt')
+def step_prompt_suffix(context):
+ context.prompt_suffix = context.text
+
+
+@step(u'{n_ga:d} group attention factor'
+ u' to extend context size through self-extend')
+def step_impl(context, n_ga):
+ context.n_ga = n_ga
+
+
+@step(u'{n_ga_w:d} group attention width to extend context size through self-extend')
+def step_impl(context, n_ga_w):
+ context.n_ga_w = n_ga_w
+
+
+@step(u'a passkey prompt template')
+def step_prompt_passkey(context):
+ context.prompt_passkey = context.text
+
+
+@step(u'{n_prompts:d} fixed prompts')
+def step_fixed_prompts(context, n_prompts):
+ context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)])
+ context.n_prompts = n_prompts
+
+
+@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
+def step_prompt_passkey(context, passkey, i_pos):
+ prompt = ""
+ for i in range(context.n_junk):
+ if i % context.n_junk == i_pos:
+ prompt += context.prompt_passkey # the passkey is already substituted
+ prompt += context.prompt_junk_suffix
+ if context.debug:
+ passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
+ print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
+ context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
+ context.n_prompts = len(context.prompts)
+
+
@step(u'an OAI compatible chat completions request with {api_error} api error')
@async_run_until_complete
async def step_oai_chat_completions(context, api_error):
if context.debug:
- print(f"Submitting OAI compatible completions request...")
+ print(f"Submitting OAI compatible completions request...\n")
expect_api_error = api_error == 'raised'
completion = await oai_chat_completions(context.prompts.pop(),
context.system_prompt,
@@ -241,8 +335,7 @@ async def step_oai_chat_completions(context, api_error):
enable_streaming=context.enable_streaming
if hasattr(context, 'enable_streaming') else None,
- server_seed=context.server_seed
- if hasattr(context, 'server_seed') else None,
+ seed=await completions_seed(context),
user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None,
@@ -261,11 +354,13 @@ async def step_oai_chat_completions(context, api_error):
@step(u'a prompt')
def step_a_prompt(context):
context.prompts.append(context.text)
+ context.n_prompts = len(context.prompts)
@step(u'a prompt {prompt}')
def step_a_prompt_prompt(context, prompt):
context.prompts.append(prompt)
+ context.n_prompts = len(context.prompts)
@step(u'concurrent completion requests')
@@ -276,8 +371,10 @@ async def step_concurrent_completion_requests(context):
# prompt is inserted automatically
context.base_url,
debug=context.debug,
+ prompt_prefix=context.prompt_prefix,
+ prompt_suffix=context.prompt_suffix,
n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
- server_seed=context.server_seed if hasattr(context, 'server_seed') else None,
+ seed=await completions_seed(context),
user_api_key=context.user_api_key if hasattr(context,
'user_api_key') else None)
@@ -297,8 +394,7 @@ async def step_oai_chat_completions(context):
if hasattr(context, 'n_predict') else None,
enable_streaming=context.enable_streaming
if hasattr(context, 'enable_streaming') else None,
- server_seed=context.server_seed
- if hasattr(context, 'server_seed') else None,
+ seed=await completions_seed(context),
user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None)
@@ -318,7 +414,9 @@ async def step_oai_chat_completions(context):
if hasattr(context, 'n_predict') else None,
enable_streaming=context.enable_streaming
if hasattr(context, 'enable_streaming') else None,
- server_seed=context.server_seed
+ seed=context.seed
+ if hasattr(context, 'seed') else
+ context.server_seed
if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None)
@@ -330,11 +428,10 @@ async def step_all_prompts_are_predicted(context):
await all_prompts_are_predicted(context)
-@step(u'all prompts are predicted with {n_predict} tokens')
+@step(u'all prompts are predicted with {n_expected_predicted:d} tokens')
@async_run_until_complete
-async def step_all_prompts_are_predicted_with_n_tokens(context, n_predict):
- expected_predicted_n = int(n_predict)
- await all_prompts_are_predicted(context, expected_predicted_n)
+async def step_all_prompts_are_predicted_with_n_tokens(context, n_expected_predicted):
+ await all_prompts_are_predicted(context, n_expected_predicted)
async def all_prompts_are_predicted(context, expected_predicted_n=None):
@@ -348,25 +445,47 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
@step(u'embeddings are computed for')
@async_run_until_complete
async def step_compute_embedding(context):
+ context.n_prompts = 1
context.embeddings = await request_embedding(context.text, base_url=context.base_url)
+@step(u'all embeddings are the same')
+@async_run_until_complete
+async def step_all_embeddings_are_the_same(context):
+ n_embedding_requests = await gather_tasks_results(context)
+ assert n_embedding_requests > 0
+ embeddings = []
+ for i in range(n_embedding_requests):
+ embedding = context.tasks_result.pop().pop()
+ embeddings.append(embedding)
+ assert_embeddings(embedding)
+ n = len(embeddings)
+ for i in range(n-1):
+ for j in range(i+1, n):
+ embedding1 = np.array(embeddings[i])
+ embedding2 = np.array(embeddings[j])
+ if context.debug:
+ print(f"embedding1: {embedding1[-8:]}\n")
+ print(f"embedding2: {embedding2[-8:]}\n")
+ similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
+ msg = f"Similarity between {i} and {j}: {similarity:.10f}"
+ if context.debug:
+ print(f"{msg}\n")
+ assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg
+
@step(u'embeddings are generated')
def step_assert_embeddings(context):
- if len(context.prompts) == 0:
- assert_embeddings(context.embeddings)
- else:
- assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
- f"context.prompts={context.prompts}\n"
- f"context.embeddings={context.embeddings}")
- for embedding in context.embeddings:
- context.prompts.pop()
- assert_embeddings(embedding)
+ assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n"
+ f"context.n_prompts={context.n_prompts}\n"
+ f"context.embeddings={context.embeddings}")
+ for embedding in context.embeddings:
+ assert_embeddings(embedding)
@step(u'an OAI compatible embeddings computation request for')
@async_run_until_complete
async def step_oai_compute_embeddings(context):
+ context.n_prompts = 1
context.embeddings = await request_oai_embeddings(context.text,
base_url=context.base_url,
user_api_key=context.user_api_key,
@@ -380,6 +499,7 @@ async def step_oai_compute_embeddings_multiple_inputs(context):
base_url=context.base_url,
user_api_key=context.user_api_key,
model=context.model)
+ context.prompts.clear()
@step(u'concurrent embedding requests')
@@ -406,9 +526,9 @@ async def step_concurrent_oai_embedding_requests(context):
@async_run_until_complete()
async def all_embeddings_are_generated(context):
n_embedding_requests = await gather_tasks_results(context)
- assert n_embedding_requests > 0
+ assert n_embedding_requests == context.n_prompts
for i in range(n_embedding_requests):
- assert_embeddings(context.tasks_result.pop())
+ assert_embeddings(context.tasks_result.pop().pop())
@step(u'tokenizing')
@@ -464,20 +584,63 @@ async def step_prometheus_metrics_exported(context):
assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
metrics_raw = await metrics_response.text()
metric_exported = False
+ if context.debug:
+ print(f"/metrics answer:\n{metrics_raw}\n")
+ context.metrics = {}
for metric in parser.text_string_to_metric_families(metrics_raw):
match metric.name:
case "llamacpp:kv_cache_usage_ratio":
assert len(metric.samples) > 0
metric_exported = True
+ context.metrics[metric.name] = metric
+ assert int(metrics_response.headers["Process-Start-Time-Unix"]) > 0, "no header process start time"
assert metric_exported, "No metrics exported"
-async def concurrent_requests(context, f_completion, *args, **kwargs):
- n_prompts = len(context.prompts)
+@step(u'metric {metric_name} is {metric_value:d}')
+def step_assert_metric_value(context, metric_name, metric_value):
+ if metric_name not in context.metrics:
+ assert False, f"no metric {metric_name} in {context.metrics.keys()}"
+ assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}"
+
+
+@step(u'available models')
+def step_available_models(context):
+ # openai client always expects an api_key
+ openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope'
+ openai.api_base = f'{context.base_url}/v1'
+ context.models = openai.Model.list().data
+
+
+@step(u'{n_model:d} models are supported')
+def step_supported_models(context, n_model):
if context.debug:
- print(f"starting {n_prompts} concurrent completion requests...")
- assert n_prompts > 0
- for prompt_no in range(n_prompts):
+ print("server models available:", context.models)
+ assert len(context.models) == n_model
+
+
+@step(u'model {i_model:d} is {param} {preposition} {param_value}')
+def step_supported_models(context, i_model, param, preposition, param_value):
+ assert i_model < len(context.models)
+ model = context.models[i_model]
+
+ param_value = param_value.split(' ', 1)[0]
+ match param:
+ case 'identified':
+ value = model.id
+ case 'trained':
+ value = str(model.meta.n_ctx_train)
+ case _:
+ assert False, "param {param} not supported"
+ assert param_value == value, f"model param {param} {value} != {param_value}"
+
+
+async def concurrent_requests(context, f_completion, *args, **kwargs):
+ context.n_prompts = len(context.prompts)
+ if context.debug:
+ print(f"starting {context.n_prompts} concurrent completion requests...")
+ assert context.n_prompts > 0
+ for prompt_no in range(context.n_prompts):
shifted_args = [context.prompts.pop(), *args]
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
await asyncio.sleep(0.1)
@@ -486,8 +649,10 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
async def request_completion(prompt,
base_url,
debug=False,
+ prompt_prefix=None,
+ prompt_suffix=None,
n_predict=None,
- server_seed=None,
+ seed=None,
expect_api_error=None,
user_api_key=None):
if debug:
@@ -504,11 +669,14 @@ async def request_completion(prompt,
async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/completion',
json={
+ "input_prefix": prompt_prefix,
"prompt": prompt,
- "n_predict": int(n_predict) if n_predict is not None else -1,
- "seed": server_seed if server_seed is not None else 42
+ "input_suffix": prompt_suffix,
+ "n_predict": n_predict if n_predict is not None else -1,
+ "seed": seed if seed is not None else 42
},
- headers=headers) as response:
+ headers=headers,
+ timeout=3600) as response:
if expect_api_error is None or not expect_api_error:
assert response.status == 200
assert response.headers['Access-Control-Allow-Origin'] == origin
@@ -526,14 +694,14 @@ async def oai_chat_completions(user_prompt,
model=None,
n_predict=None,
enable_streaming=None,
- server_seed=None,
+ seed=None,
user_api_key=None,
expect_api_error=None):
if debug:
print(f"Sending OAI Chat completions request: {user_prompt}")
# openai client always expects an api key
user_api_key = user_api_key if user_api_key is not None else 'nope'
- seed = server_seed if server_seed is not None else 42
+ seed = seed if seed is not None else 42
enable_streaming = enable_streaming if enable_streaming is not None else False
payload = {
"messages": [
@@ -645,7 +813,7 @@ async def request_embedding(content, base_url=None):
}) as response:
assert response.status == 200
response_json = await response.json()
- return response_json['embedding']
+ return [response_json['embedding']]
async def request_oai_embeddings(input,
@@ -655,6 +823,7 @@ async def request_oai_embeddings(input,
user_api_key = user_api_key if user_api_key is not None else 'nope'
if async_client:
origin = 'llama.cpp'
+ headers=[]
if user_api_key is not None:
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
async with aiohttp.ClientSession() as session:
@@ -663,14 +832,21 @@ async def request_oai_embeddings(input,
"input": input,
"model": model,
},
- headers=headers) as response:
+ headers=headers,
+ timeout=3600) as response:
assert response.status == 200, f"received status code not expected: {response.status}"
assert response.headers['Access-Control-Allow-Origin'] == origin
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
response_json = await response.json()
assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
assert response_json['object'] == 'list'
- return response_json['data']
+ if isinstance(input, collections.abc.Sequence):
+ embeddings = []
+ for an_oai_embeddings in response_json['data']:
+ embeddings.append(an_oai_embeddings['embedding'])
+ else:
+ embeddings = [response_json['data']['embedding']]
+ return embeddings
else:
openai.api_key = user_api_key
openai.api_base = f'{base_url}/v1'
@@ -684,7 +860,7 @@ async def request_oai_embeddings(input,
for an_oai_embeddings in oai_embeddings.data:
embeddings.append(an_oai_embeddings.embedding)
else:
- embeddings = oai_embeddings.data.embedding
+ embeddings = [oai_embeddings.data.embedding]
return embeddings
@@ -692,20 +868,31 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
content = completion_response['content']
n_predicted = completion_response['timings']['predicted_n']
assert len(content) > 0, "no token predicted"
- if expected_predicted_n is not None:
+ if re_content is not None:
+ p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
+ matches = p.finditer(content)
+ last_match = 0
+ highlighted = ''
+ for match in matches:
+ start, end = match.span()
+ highlighted += content[last_match: start]
+ highlighted += '\x1b[33m'
+ highlighted += content[start: end]
+ highlighted += '\x1b[0m'
+ last_match = end
+ highlighted += content[last_match:]
+ if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+ print(f"Checking completion response: {highlighted}\n")
+ assert last_match > 0, f'/{re_content}/ must match ```{highlighted}```'
+ if expected_predicted_n and expected_predicted_n > 0:
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
f' {n_predicted} <> {expected_predicted_n}')
- if re_content is not None:
- re_content = '^.*' + re_content.replace('<or>', '|') + '.*$'
- assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), (
- f'invalid tokens predicted:'
- f' ```\n{content}\n``` do not match /{re_content}/')
async def gather_tasks_results(context):
n_tasks = len(context.concurrent_tasks)
if context.debug:
- print(f"Waiting for all {n_tasks} tasks results...")
+ print(f"Waiting for all {n_tasks} tasks results...\n")
for task_no in range(n_tasks):
context.tasks_result.append(await context.concurrent_tasks.pop())
n_completions = len(context.tasks_result)
@@ -716,15 +903,13 @@ async def wait_for_health_status(context,
base_url,
expected_http_status_code,
expected_health_status,
+ timeout=3,
params=None,
slots_idle=None,
slots_processing=None,
expected_slots=None):
if context.debug:
- print(f"Starting checking for health for expected_health_status={expected_health_status}")
- timeout = 3 # seconds
- if expected_health_status == 'ok':
- timeout = 10 # CI slow inference
+ print(f"Starting checking for health for expected_health_status={expected_health_status}\n")
interval = 0.5
counter = 0
async with aiohttp.ClientSession() as session:
@@ -734,7 +919,7 @@ async def wait_for_health_status(context,
health = await health_response.json()
if context.debug:
print(f"HEALTH - response for expected health status='{expected_health_status}' on "
- f"'{base_url}/health'?{params} is {health}")
+ f"'{base_url}/health'?{params} is {health}\n")
if (status_code == expected_http_status_code
and health['status'] == expected_health_status
and (slots_idle is None or health['slots_idle'] == slots_idle)
@@ -757,7 +942,7 @@ async def wait_for_health_status(context,
if expected_http_status_code == 503:
if len(context.tasks_result) == 0:
print("\x1b[5;37;43mWARNING: forcing concurrent tasks,"
- " busy health check missed, probably too fast inference\x1b[0m")
+ " busy health check missed, probably too fast inference\x1b[0m\n")
n_completions = await gather_tasks_results(context)
if n_completions > 0:
return
@@ -769,6 +954,8 @@ def assert_embeddings(embeddings):
assert len(embeddings) > 0
embeddings_computed = False
for emb in embeddings:
+ if not isinstance(emb, float):
+ assert False, f"Bad embeddings: {embeddings}"
if emb != 0:
embeddings_computed = True
assert embeddings_computed, f"Embeddings: {embeddings}"
@@ -791,6 +978,11 @@ def assert_slots_status(slots, expected_slots):
f" = {expected[key]} != {slot[key]}")
+async def completions_seed(context):
+ return context.seed if hasattr(context, 'seed') and context.seed is not None \
+ else context.server_seed if hasattr(context, 'server_seed') else None
+
+
def start_server_background(context):
context.server_path = '../../../build/bin/server'
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
@@ -800,27 +992,35 @@ def start_server_background(context):
'--port', context.server_port,
'--model', context.model_file
]
+ if context.n_batch:
+ server_args.extend(['--batch-size', context.n_batch])
+ if context.n_gpu_layer:
+ server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
if context.server_continuous_batching:
server_args.append('--cont-batching')
if context.server_embeddings:
server_args.append('--embedding')
if context.server_metrics:
server_args.append('--metrics')
- if context.model_alias is not None:
+ if context.model_alias:
server_args.extend(['--alias', context.model_alias])
- if context.n_ctx is not None:
+ if context.n_ctx:
server_args.extend(['--ctx-size', context.n_ctx])
- if context.n_slots is not None:
+ if context.n_slots:
server_args.extend(['--parallel', context.n_slots])
- if context.n_server_predict is not None:
+ if context.n_server_predict:
server_args.extend(['--n-predict', context.n_server_predict])
- if context.server_api_key is not None:
+ if context.server_api_key:
server_args.extend(['--api-key', context.server_api_key])
+ if context.n_ga:
+ server_args.extend(['--grp-attn-n', context.n_ga])
+ if context.n_ga_w:
+ server_args.extend(['--grp-attn-w', context.n_ga_w])
if context.debug:
server_args.append('--verbose')
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
server_args.extend(['--log-format', "text"])
- print(f"starting server with: {context.server_path}", *server_args)
+ print(f"starting server with: {context.server_path} {server_args}\n")
context.server_process = subprocess.Popen(
[str(arg) for arg in [context.server_path, *server_args]],
close_fds=True)
diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature
index e228b2371..cf14b3b44 100644
--- a/examples/server/tests/features/wrong_usages.feature
+++ b/examples/server/tests/features/wrong_usages.feature
@@ -1,4 +1,4 @@
-# run with ./test.sh --tags wrong_usage
+# run with: ./tests.sh --no-skipped --tags wrong_usage
@wrong_usage
Feature: Wrong usage of llama.cpp server
@@ -7,7 +7,7 @@ Feature: Wrong usage of llama.cpp server
# or pass n_predict/max_tokens in the request.
Scenario: Infinite loop
Given a server listening on localhost:8080
- And a model file stories260K.gguf
+ And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
# Uncomment below to fix the issue
#And 64 server max tokens to predict
Then the server is starting
@@ -18,4 +18,5 @@ Feature: Wrong usage of llama.cpp server
# Uncomment below to fix the issue
#And 128 max tokens to predict
Given concurrent completion requests
+ Then the server is idle
Then all prompts are predicted
diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt
index 334fa4a70..2e4f42ad2 100644
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -1,4 +1,6 @@
aiohttp~=3.9.3
behave~=1.2.6
+huggingface_hub~=0.20.3
+numpy~=1.24.4
openai~=0.25.0
prometheus-client~=0.20.0
diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index 17a4e6fc6..1c6c5695f 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -5,7 +5,7 @@ set -eu
if [ $# -lt 1 ]
then
# Start @llama.cpp scenario
- behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp
+ behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
else
behave "$@"
fi
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index d7abd7cbb..df0a27782 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -1,15 +1,16 @@
#pragma once
-#include <string>
-#include <vector>
-#include <set>
-#include <mutex>
-#include <condition_variable>
-#include <unordered_map>
+#include "llama.h"
+#include "common.h"
#include "json.hpp"
-#include "../llava/clip.h"
+#include <string>
+#include <vector>
+#include <sstream>
+#include <random>
+
+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
using json = nlohmann::json;
@@ -37,125 +38,35 @@ extern bool server_log_json;
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
-//
-// parallel
-//
+template <typename T>
+static T json_value(const json &body, const std::string &key, const T &default_value) {
+ // Fallback null to default value
+ return body.contains(key) && !body.at(key).is_null()
+ ? body.value(key, default_value)
+ : default_value;
+}
-enum server_state {
- SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
- SERVER_STATE_READY, // Server is ready and model is loaded
- SERVER_STATE_ERROR // An error occurred, load_model failed
-};
-
-enum task_type {
- TASK_TYPE_COMPLETION,
- TASK_TYPE_CANCEL,
- TASK_TYPE_NEXT_RESPONSE,
- TASK_TYPE_METRICS
-};
-
-struct task_server {
- int id = -1; // to be filled by llama_server_queue
- int target_id;
- task_type type;
- json data;
- bool infill_mode = false;
- bool embedding_mode = false;
- int multitask_id = -1;
-};
-
-struct task_result {
- int id;
- int multitask_id = -1;
- bool stop;
- bool error;
- json result_json;
-};
-
-struct task_multi {
- int id;
- std::set<int> subtasks_remaining{};
- std::vector<task_result> results{};
-};
-
-// TODO: can become bool if we can't find use of more states
-enum slot_state
-{
- IDLE,
- PROCESSING,
-};
-
-enum slot_command
-{
- NONE,
- LOAD_PROMPT,
- RELEASE,
-};
-
-struct slot_params
-{
- bool stream = true;
- bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
-
- uint32_t seed = -1; // RNG seed
- int32_t n_keep = 0; // number of tokens to keep from initial prompt
- int32_t n_predict = -1; // new tokens to predict
-
- std::vector<std::string> antiprompt;
-
- json input_prefix;
- json input_suffix;
-};
-
-struct slot_image
-{
- int32_t id;
-
- bool request_encode_image = false;
- float * image_embedding = nullptr;
- int32_t image_tokens = 0;
-
- clip_image_u8 * img_data;
-
- std::string prefix_prompt; // before of this image
-};
-
-// completion token output with probabilities
-struct completion_token_output
-{
- struct token_prob
- {
- llama_token tok;
- float prob;
- };
-
- std::vector<token_prob> probs;
- llama_token tok;
- std::string text_to_send;
-};
-
-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra)
-{
+static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
std::stringstream ss_tid;
ss_tid << std::this_thread::get_id();
json log = nlohmann::ordered_json{
- {"tid", ss_tid.str()},
+ {"tid", ss_tid.str()},
{"timestamp", time(nullptr)},
};
if (server_log_json) {
- log.merge_patch(
- {
- {"level", level},
- {"function", function},
- {"line", line},
- {"msg", message},
- });
+ log.merge_patch( {
+ {"level", level},
+ {"function", function},
+ {"line", line},
+ {"msg", message},
+ });
+
if (!extra.empty()) {
log.merge_patch(extra);
}
- std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
+ printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
} else {
char buf[1024];
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
@@ -168,8 +79,7 @@ static inline void server_log(const char *level, const char *function, int line,
for (const auto& el : log.items())
{
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
- snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str());
- ss << buf;
+ ss << " " << el.key() << "=" << value;
}
const std::string str = ss.str();
@@ -179,36 +89,25 @@ static inline void server_log(const char *level, const char *function, int line,
}
//
-// server utils
+// chat template utils
//
-template <typename T>
-static T json_value(const json &body, const std::string &key, const T &default_value)
-{
- // Fallback null to default value
- return body.contains(key) && !body.at(key).is_null()
- ? body.value(key, default_value)
- : default_value;
-}
-
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
inline bool verify_custom_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
- std::vector<char> buf(1);
- int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
+ int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
// Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
-{
+inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
size_t alloc_size = 0;
// vector holding all allocated string to be passed to llama_chat_apply_template
std::vector<std::string> str(messages.size() * 2);
std::vector<llama_chat_message> chat(messages.size());
for (size_t i = 0; i < messages.size(); ++i) {
- auto &curr_msg = messages[i];
+ const auto & curr_msg = messages[i];
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
alloc_size += str[i*2 + 1].length();
@@ -228,252 +127,13 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
}
- std::string formatted_chat(buf.data(), res);
+ const std::string formatted_chat(buf.data(), res);
+
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
return formatted_chat;
}
-//
-// work queue utils
-//
-
-struct llama_server_queue {
- int id = 0;
- std::mutex mutex_tasks;
- bool running;
- // queues
- std::vector<task_server> queue_tasks;
- std::vector<task_server> queue_tasks_deferred;
- std::vector<task_multi> queue_multitasks;
- std::condition_variable condition_tasks;
- // callback functions
- std::function<void(task_server&)> callback_new_task;
- std::function<void(task_multi&)> callback_finish_multitask;
- std::function<void(void)> callback_all_task_finished;
-
- // Add a new task to the end of the queue
- int post(task_server task) {
- std::unique_lock<std::mutex> lock(mutex_tasks);
- if (task.id == -1) {
- task.id = id++;
- LOG_VERBOSE("new task id", {{"new_id", task.id}});
- }
- queue_tasks.push_back(std::move(task));
- condition_tasks.notify_one();
- return task.id;
- }
-
- // Add a new task, but defer until one slot is available
- void defer(task_server task) {
- std::unique_lock<std::mutex> lock(mutex_tasks);
- queue_tasks_deferred.push_back(std::move(task));
- }
-
- // Get the next id for creating a new task
- int get_new_id() {
- std::unique_lock<std::mutex> lock(mutex_tasks);
- int new_id = id++;
- LOG_VERBOSE("new task id", {{"new_id", new_id}});
- return new_id;
- }
-
- // Register function to process a new task
- void on_new_task(std::function<void(task_server&)> callback) {
- callback_new_task = callback;
- }
-
- // Register function to process a multitask
- void on_finish_multitask(std::function<void(task_multi&)> callback) {
- callback_finish_multitask = callback;
- }
-
- // Register the function to be called when the batch of tasks is finished
- void on_all_tasks_finished(std::function<void(void)> callback) {
- callback_all_task_finished = callback;
- }
-
- // Call when the state of one slot is changed
- void notify_slot_changed() {
- // move deferred tasks back to main loop
- std::unique_lock<std::mutex> lock(mutex_tasks);
- for (auto & task : queue_tasks_deferred) {
- queue_tasks.push_back(std::move(task));
- }
- queue_tasks_deferred.clear();
- }
-
- // end the start_loop routine
- void terminate() {
- {
- std::unique_lock<std::mutex> lock(mutex_tasks);
- running = false;
- }
- condition_tasks.notify_all();
- }
-
- // Start the main loop.
- void start_loop() {
- running = true;
- while (true) {
- LOG_VERBOSE("new task may arrive", {});
- {
- while (true)
- {
- std::unique_lock<std::mutex> lock(mutex_tasks);
- if (queue_tasks.empty()) {
- lock.unlock();
- break;
- }
- task_server task = queue_tasks.front();
- queue_tasks.erase(queue_tasks.begin());
- lock.unlock();
- LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
- callback_new_task(task);
- }
- LOG_VERBOSE("callback_all_task_finished", {});
- // process and update all the multitasks
- auto queue_iterator = queue_multitasks.begin();
- while (queue_iterator != queue_multitasks.end())
- {
- if (queue_iterator->subtasks_remaining.empty())
- {
- // all subtasks done == multitask is done
- task_multi current_multitask = *queue_iterator;
- callback_finish_multitask(current_multitask);
- // remove this multitask
- queue_iterator = queue_multitasks.erase(queue_iterator);
- }
- else
- {
- ++queue_iterator;
- }
- }
- // all tasks in the current loop is finished
- callback_all_task_finished();
- }
- LOG_VERBOSE("wait for new task", {});
- // wait for new task
- {
- std::unique_lock<std::mutex> lock(mutex_tasks);
- if (queue_tasks.empty()) {
- if (!running) {
- LOG_VERBOSE("ending start_loop", {});
- return;
- }
- condition_tasks.wait(lock, [&]{
- return (!queue_tasks.empty() || !running);
- });
- }
- }
- }
- }
-
- //
- // functions to manage multitasks
- //
-
- // add a multitask by specifying the id of all subtask (subtask is a task_server)
- void add_multitask(int multitask_id, std::vector<int>& sub_ids)
- {
- std::lock_guard<std::mutex> lock(mutex_tasks);
- task_multi multi;
- multi.id = multitask_id;
- std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
- queue_multitasks.push_back(multi);
- }
-
- // update the remaining subtasks, while appending results to multitask
- void update_multitask(int multitask_id, int subtask_id, task_result& result)
- {
- std::lock_guard<std::mutex> lock(mutex_tasks);
- for (auto& multitask : queue_multitasks)
- {
- if (multitask.id == multitask_id)
- {
- multitask.subtasks_remaining.erase(subtask_id);
- multitask.results.push_back(result);
- }
- }
- }
-};
-
-struct llama_server_response {
- typedef std::function<void(int, int, task_result&)> callback_multitask_t;
- callback_multitask_t callback_update_multitask;
- // for keeping track of all tasks waiting for the result
- std::set<int> waiting_task_ids;
- // the main result queue
- std::vector