diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile
index c01006efe..f6073f662 100644
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
- apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+ apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
index 6d5943a2f..6f19afa9c 100644
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
- apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+ apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
index 23f428944..2aec4a85d 100644
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -23,10 +23,13 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
-RUN make -j$(nproc)
+RUN make -j$(nproc) main
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+RUN apt-get update && \
+ apt-get install -y libgomp1
+
COPY --from=build /app/main /main
ENTRYPOINT [ "/main" ]
diff --git a/.devops/main-rocm.Dockerfile b/.devops/main-rocm.Dockerfile
index 37576d68e..dcaeb3e72 100644
--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
-RUN make -j$(nproc)
+RUN make -j$(nproc) main
ENTRYPOINT [ "/app/main" ]
diff --git a/.devops/main-vulkan.Dockerfile b/.devops/main-vulkan.Dockerfile
index 6c2b2ed5b..1bdb52803 100644
--- a/.devops/main-vulkan.Dockerfile
+++ b/.devops/main-vulkan.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
+RUN apt update && apt install -y git build-essential cmake wget libgomp1
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile
index 763d75fce..d2514c4ba 100644
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -9,10 +9,13 @@ WORKDIR /app
COPY . .
-RUN make -j$(nproc)
+RUN make -j$(nproc) main
FROM ubuntu:$UBUNTU_VERSION as runtime
+RUN apt-get update && \
+ apt-get install -y libgomp1
+
COPY --from=build /app/main /main
ENV LC_ALL=C.utf8
diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile
index 7f5228185..4e9747b82 100644
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@@ -25,12 +25,12 @@ ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
-RUN make -j$(nproc)
+RUN make -j$(nproc) server
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
- apt-get install -y libcurl4-openssl-dev
+ apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/server /server
diff --git a/.devops/server.Dockerfile b/.devops/server.Dockerfile
index 0d09d3627..bee63b966 100644
--- a/.devops/server.Dockerfile
+++ b/.devops/server.Dockerfile
@@ -11,12 +11,12 @@ COPY . .
ENV LLAMA_CURL=1
-RUN make -j$(nproc)
+RUN make -j$(nproc) server
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \
- apt-get install -y libcurl4-openssl-dev
+ apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/server /server
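
Note: the runtime stages above now install `libgomp1` because the binaries are built with OpenMP enabled by default (see the `GGML_USE_OPENMP` / `-fopenmp` additions to the Makefile and CMakeLists.txt later in this diff). A quick sanity check that a freshly built image can resolve the OpenMP runtime might look like the following sketch; the image tag is only a placeholder and the commands are not part of the patch:

```bash
# hypothetical tag; build the server image from a repo checkout
docker build -t llamacpp-server -f .devops/server.Dockerfile .

# override the entrypoint and verify that libgomp is found inside the runtime image
docker run --rm --entrypoint /bin/sh llamacpp-server -c 'ldd /server | grep -i gomp'
```
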
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7b616281b..93669d531 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -294,12 +294,22 @@ jobs:
- name: Build
id: cmake_build
+ if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+ - name: Build (no OpenMP)
+ id: cmake_build_no_openmp
+ if: ${{ matrix.sanitizer == 'THREAD' }}
+ run: |
+ mkdir build
+ cd build
+ cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
+ cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+
- name: Test
id: cmake_test
run: |
@@ -678,8 +688,6 @@ jobs:
env:
OPENBLAS_VERSION: 0.3.23
- OPENCL_VERSION: 2023.04.17
- CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.3.261.1
@@ -696,8 +704,6 @@ jobs:
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'clblast-x64'
- defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
@@ -722,27 +728,6 @@ jobs:
run: |
git submodule update --init kompute
- - name: Download OpenCL SDK
- id: get_opencl
- if: ${{ matrix.build == 'clblast-x64' }}
- run: |
- curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
- mkdir $env:RUNNER_TEMP/opencl
- tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
-
- - name: Download CLBlast
- id: get_clblast
- if: ${{ matrix.build == 'clblast-x64' }}
- run: |
- curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
- curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
- 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
- rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
- foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
- $txt = Get-Content -Path $f -Raw
- $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
- }
-
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
@@ -776,13 +761,6 @@ jobs:
cmake -S . -B build ${{ matrix.defines }}
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- - name: Add clblast.dll
- id: add_clblast_dll
- if: ${{ matrix.build == 'clblast-x64' }}
- run: |
- cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
- cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
-
- name: Add libopenblas.dll
id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas-x64' }}
@@ -806,7 +784,7 @@ jobs:
- name: Test
id: cmake_test
# not all machines have native AVX-512
- if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+ if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
@@ -1061,7 +1039,7 @@ jobs:
# hypervisor: 'qemu'
# run: |
# sudo pkg update
-# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
+# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
release:
diff --git a/.gitignore b/.gitignore
index 50ae0973a..5223c6963 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,9 +34,11 @@ ggml-metal-embed.metal
lcov-report/
gcovr-report/
+tags
build*
!build.zig
cmake-build-*
+android-ndk-*
out/
tmp/
@@ -105,6 +107,7 @@ examples/jeopardy/results.txt
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp
+examples/server/*.css.hpp
poetry.lock
poetry.toml
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52b392a13..b1d6afbbc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -111,7 +111,6 @@ option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for Flas
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
-option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
@@ -126,6 +125,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_RPC "llama: use RPC" OFF)
+option(LLAMA_OPENMP "llama: use OpenMP" ON)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
@@ -296,6 +296,17 @@ if (LLAMA_METAL)
)
endif()
+if (LLAMA_OPENMP)
+ find_package(OpenMP)
+ if (OpenMP_FOUND)
+ message(STATUS "OpenMP found")
+ add_compile_definitions(GGML_USE_OPENMP)
+ set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+ else()
+ message(WARNING "OpenMP not found")
+ endif()
+endif()
+
if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
@@ -405,6 +416,8 @@ if (LLAMA_CUDA)
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
+ file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_USE_CUDA)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
@@ -490,22 +503,6 @@ if (LLAMA_RPC)
set(GGML_SOURCES_RPC ggml-rpc.cpp)
endif()
-if (LLAMA_CLBLAST)
- find_package(CLBlast)
- if (CLBlast_FOUND)
- message(STATUS "CLBlast found")
-
- set(GGML_HEADERS_OPENCL ggml-opencl.h)
- set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
-
- add_compile_definitions(GGML_USE_CLBLAST)
-
- set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
- else()
- message(WARNING "CLBlast not found")
- endif()
-endif()
-
if (LLAMA_VULKAN)
find_package(Vulkan)
if (Vulkan_FOUND)
@@ -545,12 +542,17 @@ if (LLAMA_VULKAN)
endif()
if (LLAMA_HIPBLAS)
- if ($ENV{ROCM_PATH})
- set(ROCM_PATH $ENV{ROCM_PATH})
+ if (NOT EXISTS $ENV{ROCM_PATH})
+ if (NOT EXISTS /opt/rocm)
+ set(ROCM_PATH /usr)
+ else()
+ set(ROCM_PATH /opt/rocm)
+ endif()
else()
- set(ROCM_PATH /opt/rocm)
+ set(ROCM_PATH $ENV{ROCM_PATH})
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
+ list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
# CMake on Windows doesn't support the HIP language yet
if(WIN32)
@@ -588,6 +590,8 @@ if (LLAMA_HIPBLAS)
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
+ file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
+ list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
@@ -777,6 +781,7 @@ if (LLAMA_KOMPUTE)
kompute-shaders/op_mul_mat_q4_0.comp
kompute-shaders/op_mul_mat_q4_1.comp
kompute-shaders/op_mul_mat_q6_k.comp
+ kompute-shaders/op_getrows_f32.comp
kompute-shaders/op_getrows_f16.comp
kompute-shaders/op_getrows_q4_0.comp
kompute-shaders/op_getrows_q4_1.comp
@@ -809,6 +814,7 @@ if (LLAMA_KOMPUTE)
shaderop_mul_mat_q4_0.h
shaderop_mul_mat_q4_1.h
shaderop_mul_mat_q6_k.h
+ shaderop_getrows_f32.h
shaderop_getrows_f16.h
shaderop_getrows_q4_0.h
shaderop_getrows_q4_1.h
@@ -1246,7 +1252,6 @@ add_library(ggml OBJECT
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
- ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
@@ -1334,8 +1339,9 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
- "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
- "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
+ "${GGML_HEADERS_CUDA}"
+ "${GGML_HEADERS_METAL}"
+ "${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER)
@@ -1371,6 +1377,13 @@ if (LLAMA_METAL)
endif()
endif()
+configure_file(cmake/llama.pc.in
+ "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+ @ONLY)
+
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+ DESTINATION lib/pkgconfig)
+
#
# programs, examples and tests
#
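
Note: with the CMake changes above, OpenMP is now a regular option (`LLAMA_OPENMP`, ON by default) and a `llama.pc` pkg-config file is generated and installed. A minimal configure sketch, mirroring the updated CI job that disables OpenMP for the thread sanitizer build:

```bash
# default build: OpenMP is used automatically when find_package(OpenMP) succeeds
cmake -B build
cmake --build build --config Release -j $(nproc)

# thread-sanitizer style build with OpenMP turned off, as in the updated workflow
cmake -B build-tsan -DLLAMA_SANITIZE_THREAD=ON -DLLAMA_OPENMP=OFF
cmake --build build-tsan --config Release -j $(nproc)
```
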
diff --git a/Makefile b/Makefile
index c643fe0cc..895c62f84 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
@@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
LLAMA_METAL := 1
endif
+ LLAMA_NO_OPENMP := 1
+
ifneq ($(UNAME_P),arm)
SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
ifeq ($(SYSCTL_M),1)
@@ -67,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
endif
endif
+ifdef LLAMA_RPC
+ BUILD_TARGETS += rpc-server
+endif
+
default: $(BUILD_TARGETS)
test: $(TEST_TARGETS)
@@ -135,12 +141,16 @@ MK_NVCCFLAGS = -std=c++11
ifdef LLAMA_FAST
MK_CFLAGS += -Ofast
HOST_CXXFLAGS += -Ofast
+ifndef LLAMA_DEBUG
MK_NVCCFLAGS += -O3
+endif # LLAMA_DEBUG
else
MK_CFLAGS += -O3
MK_CXXFLAGS += -O3
+ifndef LLAMA_DEBUG
MK_NVCCFLAGS += -O3
-endif
+endif # LLAMA_DEBUG
+endif # LLAMA_FAST
ifndef LLAMA_NO_CCACHE
CCACHE := $(shell which ccache)
@@ -201,9 +211,10 @@ ifdef LLAMA_SCHED_MAX_COPIES
endif
ifdef LLAMA_DEBUG
- MK_CFLAGS += -O0 -g
- MK_CXXFLAGS += -O0 -g
- MK_LDFLAGS += -g
+ MK_CFLAGS += -O0 -g
+ MK_CXXFLAGS += -O0 -g
+ MK_LDFLAGS += -g
+ MK_NVCCFLAGS += -O0 -g
ifeq ($(UNAME_S),Linux)
MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@@ -400,6 +411,12 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif # LLAMA_NO_ACCELERATE
+ifndef LLAMA_NO_OPENMP
+ MK_CPPFLAGS += -DGGML_USE_OPENMP
+ MK_CFLAGS += -fopenmp
+ MK_CXXFLAGS += -fopenmp
+endif # LLAMA_NO_OPENMP
+
ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -416,12 +433,18 @@ ifdef LLAMA_BLIS
MK_LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS
+ifdef LLAMA_RPC
+ MK_CPPFLAGS += -DGGML_USE_RPC
+ OBJS += ggml-rpc.o
+endif # LLAMA_RPC
+
ifdef LLAMA_CUBLAS
# LLAMA_CUBLAS is deprecated and will be removed in the future
LLAMA_CUDA := 1
endif
OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu))
ifdef LLAMA_CUDA_FA_ALL_QUANTS
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
else
@@ -525,23 +548,6 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h
$(NVCC_COMPILE)
endif # LLAMA_CUDA
-ifdef LLAMA_CLBLAST
- MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
- MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
- MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
-
- # Mac provides OpenCL as a framework
- ifeq ($(UNAME_S),Darwin)
- MK_LDFLAGS += -lclblast -framework OpenCL
- else
- MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
- endif
- OBJS += ggml-opencl.o
-
-ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
- $(CXX) $(CXXFLAGS) -c $< -o $@
-endif # LLAMA_CLBLAST
-
ifdef LLAMA_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += -lvulkan
@@ -641,11 +647,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
endif
endif # LLAMA_METAL
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
+ifdef LLAMA_RPC
+ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif # LLAMA_RPC
+
GF_CC := $(CC)
include scripts/get-flags.mk
@@ -725,14 +746,9 @@ unicode.o: unicode.cpp unicode.h
unicode-data.o: unicode-data.cpp unicode-data.h
$(CXX) $(CXXFLAGS) -c $< -o $@
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
-
common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -882,10 +898,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
- $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
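
Note: the Makefile changes keep the existing `LLAMA_*` convention: `LLAMA_NO_OPENMP=1` opts out of the new OpenMP default (Darwin does this automatically), and `LLAMA_RPC=1` enables the RPC backend and adds the `rpc-server` target. A sketch of possible invocations; the `-j` values are only examples:

```bash
# build just the binaries a container needs, as the updated Dockerfiles do
make -j$(nproc) main server

# opt out of OpenMP explicitly
make LLAMA_NO_OPENMP=1 -j$(nproc) main

# enable the RPC backend and build the new rpc-server target
make LLAMA_RPC=1 -j$(nproc) rpc-server
```
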
diff --git a/README-sycl.md b/README-sycl.md
index 37f0306dc..62b38135c 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -29,7 +29,7 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
-It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intelยฎ DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
+It has a similar design to other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc.*. In the initial work, oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
## News
diff --git a/README.md b/README.md
index 8680460aa..09e8cad31 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ variety of hardware - locally and in the cloud.
- AVX, AVX2 and AVX512 support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
-- Vulkan, SYCL, and (partial) OpenCL backend support
+- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
@@ -364,17 +364,6 @@ In order to build llama.cpp you have four different options.
cmake --build build --config Debug
```
-- Using `Zig` (version 0.11 or later):
-
- Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
- it's also possible to cross compile for other operating systems and architectures:
-
- ```bash
- zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
- ```
-
- The `zig targets` command will give you valid options to use.
-
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@@ -382,16 +371,11 @@ In order to build llama.cpp you have four different options.
3. Install compilation dependencies.
```bash
- sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
- opencl clblast openblas
+ sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
- **Notes:** With this packages you can build llama.cpp with OPENBLAS and
- CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
- the instructions for use and activate this options in this document below.
-
### Homebrew
On Mac and Linux, the homebrew package manager can be used via
@@ -410,7 +394,7 @@ argument.
### BLAS Build
-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS and hipBLAS. There are currently several different BLAS implementations available for build and use:
- #### Accelerate Framework:
@@ -564,111 +548,6 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
-- #### CLBlast
-
- OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
-
- You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- - For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
-
- - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
-
- -
- Installing the OpenCL SDK from source
-
- ```sh
- git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
- cd OpenCL-SDK
- cmake -B build -DBUILD_DOCS=OFF \
- -DBUILD_EXAMPLES=OFF \
- -DBUILD_TESTING=OFF \
- -DOPENCL_SDK_BUILD_SAMPLES=OFF \
- -DOPENCL_SDK_TEST_SAMPLES=OFF
- cmake --build build
- cmake --install build --prefix /some/path
- ```
-
-
- ##### Installing CLBlast
-
- Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
-
- Linux packaging:
- Fedora Linux:
- ```bash
- sudo dnf install clblast
- ```
-
- Alternatively, they may be built from source.
-
- -
- Windows:
-
- ```cmd
- set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
- git clone https://github.com/CNugteren/CLBlast.git
- cd CLBlast
- cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
- cmake --build build --config Release
- cmake --install build --prefix C:/CLBlast
- ```
-
- (note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
-
- -
- Unix:
-
- ```sh
- git clone https://github.com/CNugteren/CLBlast.git
- cd CLBlast
- cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
- cmake --build build --config Release
- cmake --install build --prefix /some/path
- ```
-
- Where `/some/path` is where the built library will be installed (default is `/usr/local`).
-
-
- ##### Building Llama with CLBlast
-
- - Build with make:
- ```sh
- make LLAMA_CLBLAST=1
- ```
- - CMake (Unix):
- ```sh
- cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
- cmake --build build --config Release
- ```
- - CMake (Windows):
- ```cmd
- set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
- git clone https://github.com/ggerganov/llama.cpp
- cd llama.cpp
- cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
- cmake --build build --config Release
- cmake --install build --prefix C:/LlamaCPP
- ```
-
- ##### Running Llama with CLBlast
-
- The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
-
- To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
- The selection can be a number (starting from 0) or a text string to search:
-
- ```sh
- GGML_OPENCL_PLATFORM=1 ./main ...
- GGML_OPENCL_DEVICE=2 ./main ...
- GGML_OPENCL_PLATFORM=Intel ./main ...
- GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
- ```
-
- The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
- Using the variables it is possible to select a CPU-based driver as well, if so desired.
-
- You can get a list of platforms and devices from the `clinfo -l` command, etc.
-
- #### Vulkan
**With docker**:
@@ -719,7 +598,7 @@ Building the program with BLAS support may lead to some performance improvements
To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
-Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derievatives.
+Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
```bash
diff --git a/cmake/arm64-windows-llvm.cmake b/cmake/arm64-windows-llvm.cmake
index 46fba6514..802379680 100644
--- a/cmake/arm64-windows-llvm.cmake
+++ b/cmake/arm64-windows-llvm.cmake
@@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
-set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
diff --git a/cmake/llama.pc.in b/cmake/llama.pc.in
new file mode 100644
index 000000000..326acbb61
--- /dev/null
+++ b/cmake/llama.pc.in
@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: llama
+Description: Port of Facebook's LLaMA model in C/C++
+Version: @PROJECT_VERSION@
+Libs: -L${libdir} -lllama
+Cflags: -I${includedir}
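
Note: `cmake/llama.pc.in` is filled in at configure time and installed to `lib/pkgconfig`, so downstream projects can locate the library through the usual pkg-config workflow. A hedged example, assuming an install prefix of `/some/path`, a shared build of `libllama`, and a caller-provided `my_app.c` (both names are placeholders):

```bash
cmake --install build --prefix /some/path

# point pkg-config at the installed llama.pc and compile against libllama
export PKG_CONFIG_PATH=/some/path/lib/pkgconfig
pkg-config --cflags --libs llama    # e.g. -I/some/path/include -L/some/path/lib -lllama
cc my_app.c $(pkg-config --cflags --libs llama) -o my_app
```
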
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 0ec8d6d8d..171530c91 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -84,4 +84,4 @@ endif ()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
diff --git a/common/common.cpp b/common/common.cpp
index 65103c3c2..1591790e6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -7,20 +7,21 @@
#include
#include
+#include
#include
+#include
+#include
#include
#include
#include
-#include
#include
+#include
#include
#include
#include
#include
#include
#include
-#include
-#include
#if defined(__APPLE__) && defined(__MACH__)
#include
@@ -199,19 +200,13 @@ void gpt_params_handle_model_default(gpt_params & params) {
}
params.hf_file = params.model;
} else if (params.model.empty()) {
- std::string cache_directory = fs_get_cache_directory();
- const bool success = fs_create_directory_with_parents(cache_directory);
- if (!success) {
- throw std::runtime_error("failed to create cache directory: " + cache_directory);
- }
- params.model = cache_directory + string_split(params.hf_file, '/').back();
+ params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
}
} else if (!params.model_url.empty()) {
if (params.model.empty()) {
auto f = string_split(params.model_url, '#').front();
f = string_split(f, '?').front();
- f = string_split(f, '/').back();
- params.model = "models/" + f;
+ params.model = fs_get_cache_file(string_split(f, '/').back());
}
} else if (params.model.empty()) {
params.model = DEFAULT_MODEL_PATH;
@@ -237,10 +232,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
}
- if (params.prompt_cache_all &&
- (params.interactive || params.interactive_first ||
- params.instruct)) {
-
+ if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
@@ -265,22 +257,26 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
- bool result = true;
+ const auto params_org = params; // the example can modify the default params
+
try {
- if (!gpt_params_parse_ex(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, gpt_params());
- exit(0);
+ if (!gpt_params_parse_ex(argc, argv, params) || params.usage) {
+ params = params_org;
+ params.usage = true;
+ return false;
}
- }
- catch (const std::invalid_argument & ex) {
+ } catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
- gpt_params_print_usage(argc, argv, gpt_params());
- exit(1);
+ params = params_org;
+ return false;
}
- return result;
+
+ return true;
}
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
+ const char split_delim = ',';
+
llama_sampling_params & sparams = params.sparams;
if (arg == "-s" || arg == "--seed") {
@@ -288,7 +284,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
- // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
+ // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
params.seed = std::stoul(argv[i]);
sparams.seed = std::stoul(argv[i]);
return true;
@@ -349,6 +345,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.escape = true;
return true;
}
+ if (arg == "--no-escape") {
+ params.escape = false;
+ return true;
+ }
if (arg == "--prompt-cache") {
if (++i >= argc) {
invalid_param = true;
@@ -403,7 +403,21 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
- if (arg == "-n" || arg == "--n-predict") {
+ if (arg == "--in-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i]);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ params.in_files.push_back(argv[i]);
+ return true;
+ }
+ if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
if (++i >= argc) {
invalid_param = true;
return true;
@@ -900,34 +914,22 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.interactive = true;
return true;
}
- if (arg == "--interactive-specials") {
- params.interactive_specials = true;
- return true;
- }
- if (arg == "--special") {
+ if (arg == "-sp" || arg == "--special") {
params.special = true;
return true;
}
- if (arg == "--embedding") {
+ if (arg == "--embedding" || arg == "--embeddings") {
params.embedding = true;
return true;
}
- if (arg == "--interactive-first") {
+ if (arg == "-if" || arg == "--interactive-first") {
params.interactive_first = true;
return true;
}
- if (arg == "-ins" || arg == "--instruct") {
- params.instruct = true;
- return true;
- }
if (arg == "-cnv" || arg == "--conversation") {
params.conversation = true;
return true;
}
- if (arg == "-cml" || arg == "--chatml") {
- params.chatml = true;
- return true;
- }
if (arg == "--infill") {
params.infill = true;
return true;
@@ -964,7 +966,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.flash_attn = true;
return true;
}
- if (arg == "--color") {
+ if (arg == "-co" || arg == "--color") {
params.use_color = true;
return true;
}
@@ -972,26 +974,26 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.use_mlock = true;
return true;
}
- if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+ if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.n_gpu_layers = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
}
return true;
}
- if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+ if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.n_gpu_layers_draft = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
}
return true;
@@ -1002,9 +1004,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUDA_SYCL
- fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--split-mode" || arg == "-sm") {
@@ -1030,9 +1032,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
-#ifndef GGML_USE_CUDA_SYCL
- fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--tensor-split" || arg == "-ts") {
@@ -1087,6 +1089,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; }
return true;
}
+ if (arg == "-v" || arg == "--verbose") {
+ params.verbosity = 1;
+ return true;
+ }
+ if (arg == "--verbosity") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.verbosity = std::stoi(argv[i]);
+ return true;
+ }
if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
return true;
@@ -1151,6 +1165,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.ppl_stride = std::stoi(argv[i]);
return true;
}
+ if (arg == "--ppl-output-type") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.ppl_output_type = std::stoi(argv[i]);
+ return true;
+ }
if (arg == "-ptc" || arg == "--print-token-count") {
if (++i >= argc) {
invalid_param = true;
@@ -1163,14 +1185,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.check_tensors = true;
return true;
}
- if (arg == "--ppl-output-type") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
- params.ppl_output_type = std::stoi(argv[i]);
- return true;
- }
if (arg == "--hellaswag") {
params.hellaswag = true;
return true;
@@ -1242,19 +1256,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
- if (arg == "-h" || arg == "--help") {
- gpt_params_print_usage(argc, argv, gpt_params());
- exit(0);
+ if (arg == "-h" || arg == "--help" || arg == "--usage" ) {
+ params.usage = true;
+ return true;
}
if (arg == "--version") {
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
exit(0);
}
- if (arg == "--random-prompt") {
- params.random_prompt = true;
- return true;
- }
if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true;
return true;
@@ -1321,6 +1331,285 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
+ if (arg == "--host") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.hostname = argv[i];
+ return true;
+ }
+ if (arg == "--port") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.port = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--path") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.public_path = argv[i];
+ return true;
+ }
+ if (arg == "--api-key") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.api_keys.push_back(argv[i]);
+ return true;
+ }
+ if (arg == "--api-key-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream key_file(argv[i]);
+ if (!key_file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ std::string key;
+ while (std::getline(key_file, key)) {
+ if (!key.empty()) {
+ params.api_keys.push_back(key);
+ }
+ }
+ key_file.close();
+ return true;
+ }
+ if (arg == "--ssl-key-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.ssl_file_key = argv[i];
+ return true;
+ }
+ if (arg == "--ssl-cert-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.ssl_file_cert = argv[i];
+ return true;
+ }
+ if (arg == "--timeout" || arg == "-to") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.timeout_read = std::stoi(argv[i]);
+ params.timeout_write = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--threads-http") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_threads_http = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-spf" || arg == "--system-prompt-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i]);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ std::string system_prompt;
+ std::copy(
+            std::istreambuf_iterator<char>(file),
+            std::istreambuf_iterator<char>(),
+ std::back_inserter(system_prompt)
+ );
+ params.system_prompt = system_prompt;
+ return true;
+ }
+ if (arg == "--log-format") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ if (std::strcmp(argv[i], "json") == 0) {
+ params.log_json = true;
+ } else if (std::strcmp(argv[i], "text") == 0) {
+ params.log_json = false;
+ } else {
+ invalid_param = true;
+ return true;
+ }
+ return true;
+ }
+ if (arg == "--no-slots") {
+ params.endpoint_slots = false;
+ return true;
+ }
+ if (arg == "--metrics") {
+ params.endpoint_metrics = true;
+ return true;
+ }
+ if (arg == "--slot-save-path") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.slot_save_path = argv[i];
+        // if it doesn't end with DIRECTORY_SEPARATOR, add it
+ if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
+ params.slot_save_path += DIRECTORY_SEPARATOR;
+ }
+ return true;
+ }
+ if (arg == "--chat-template") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ if (!llama_chat_verify_template(argv[i])) {
+ fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
+ fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
+ invalid_param = true;
+ return true;
+ }
+ params.chat_template = argv[i];
+ return true;
+ }
+ if (arg == "--slot-prompt-similarity" || arg == "-sps") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.slot_prompt_similarity = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "-pps") {
+ params.is_pp_shared = true;
+ return true;
+ }
+ if (arg == "-npp") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+        auto p = string_split<int>(argv[i], split_delim);
+ params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
+ return true;
+ }
+ if (arg == "-ntg") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+        auto p = string_split<int>(argv[i], split_delim);
+ params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
+ return true;
+ }
+ if (arg == "-npl") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+        auto p = string_split<int>(argv[i], split_delim);
+ params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
+ return true;
+ }
+ if (arg == "--context-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i], std::ios::binary);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ params.context_files.push_back(argv[i]);
+ return true;
+ }
+ if (arg == "--chunk-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.chunk_size = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--chunk-separator") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.chunk_separator = argv[i];
+ return true;
+ }
+ if (arg == "--junk") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_junk = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--pos") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.i_pos = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-o" || arg == "--output" || arg == "--output-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.out_file = argv[i];
+ return true;
+ }
+ if (arg == "-ofreq" || arg == "--output-frequency") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_out_freq = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--save-frequency") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_save_freq = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--process-output") {
+ params.process_output = true;
+ return true;
+ }
+ if (arg == "--no-ppl") {
+ params.compute_ppl = false;
+ return true;
+ }
+ if (arg == "--chunk" || arg == "--from-chunk") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.i_chunk = std::stoi(argv[i]);
+ return true;
+ }
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@@ -1348,6 +1637,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return false;
}
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
const llama_sampling_params & sparams = params.sparams;
@@ -1359,198 +1658,303 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
}
sampler_type_names.pop_back();
- printf("\n");
- printf("usage: %s [options]\n", argv[0]);
- printf("\n");
- printf("options:\n");
- printf(" -h, --help show this help message and exit\n");
- printf(" --version show version and build info\n");
- printf(" -i, --interactive run in interactive mode\n");
- printf(" --special special tokens output enabled\n");
- printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
- printf(" --interactive-first run in interactive mode and wait for input right away\n");
- printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
- printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
- printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
- printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
- printf(" -r PROMPT, --reverse-prompt PROMPT\n");
- printf(" halt generation at PROMPT, return control in interactive mode\n");
- printf(" (can be specified more than once for multiple prompts).\n");
- printf(" --color colorise output to distinguish prompt and user input from generations\n");
- printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
- printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
- printf(" -tb N, --threads-batch N\n");
- printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
- printf(" -td N, --threads-draft N");
- printf(" number of threads to use during generation (default: same as --threads)\n");
- printf(" -tbd N, --threads-batch-draft N\n");
- printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
- printf(" -p PROMPT, --prompt PROMPT\n");
- printf(" prompt to start generation with (default: empty)\n");
- printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
- printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
- printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
- printf(" not supported with --interactive or other interactive options\n");
- printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
- printf(" --random-prompt start with a randomized prompt.\n");
- printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
- printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n");
- printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
- printf(" -f FNAME, --file FNAME\n");
- printf(" prompt file to start generation.\n");
- printf(" -bf FNAME, --binary-file FNAME\n");
- printf(" binary file containing multiple choice tasks.\n");
- printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
- printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
- printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
- printf(" -ub N, --ubatch-size N\n");
- printf(" physical maximum batch size (default: %d)\n", params.n_ubatch);
- printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
- printf(" (default: %s)\n", sampler_type_names.c_str());
- printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
- printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
- printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
- printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
- printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
- printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
- printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
- printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
- printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
- printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
- printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
- printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
- printf(" --mirostat N use Mirostat sampling.\n");
- printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
- printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
- printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
- printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
- printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
- printf(" modifies the likelihood of token appearing in the completion,\n");
- printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
- printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
- printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
- printf(" --grammar-file FNAME file to read grammar from\n");
- printf(" -j SCHEMA, --json-schema SCHEMA\n");
- printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
- printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
- printf(" --cfg-negative-prompt PROMPT\n");
- printf(" negative prompt to use for guidance. (default: empty)\n");
- printf(" --cfg-negative-prompt-file FNAME\n");
- printf(" negative prompt file to use for guidance. (default: empty)\n");
- printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
- printf(" --rope-scaling {none,linear,yarn}\n");
- printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
- printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n");
- printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
- printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
- printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n");
- printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
- printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
- printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
- printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
- printf(" --pooling {none,mean,cls}\n");
- printf(" pooling type for embeddings, use model default if unspecified\n");
- printf(" -dt N, --defrag-thold N\n");
- printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
- printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
- printf(" --penalize-nl penalize newline tokens\n");
- printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
- printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n");
- printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
- printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
- printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
- printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
- printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
- printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
- printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n");
- printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
- printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
- printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
- printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
- printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
- printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
- printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
- printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
- printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
- printf(" --image IMAGE_FILE path to an image file. use with multimodal models. Specify multiple times for batching\n");
+ struct option_info {
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5)
+ option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) {
+ va_list args_list;
+ va_start(args_list, desc);
+ char buffer[1024];
+ vsnprintf(buffer, sizeof(buffer), desc, args_list);
+ va_end(args_list);
+ this->desc = buffer;
+ }
+
+ option_info(const std::string & grp) : grp(grp) {}
+
+ std::string tags;
+ std::string args;
+ std::string desc;
+ std::string grp;
+ };
+
+    std::vector<option_info> options;
+
+ // TODO: filter by tags
+
+ options.push_back({ "general" });
+ options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
+ options.push_back({ "*", " --version", "show version and build info" });
+ options.push_back({ "*", "-v, --verbose", "print verbose information" });
+ options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
+ options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
+ options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
+ options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
+ options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
+ options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
+ options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
+ options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
+ options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
+ "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+ options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
+ options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
+ options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
+ "path to static lookup cache to use for lookup decoding (not updated by generation)" });
+ options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME",
+ "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
+
+ options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
+ options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
+ options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
+ options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
+ options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
+ options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
+ options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
+ options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
+ options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
+ options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
+ options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
+ options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
+ options.push_back({ "*", " --no-escape", "do not process escape sequences" });
+ options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print });
+ options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
+ options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n"
+ "not supported with --interactive or other interactive options" });
+ options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" });
+ options.push_back({ "main", "-r, --reverse-prompt PROMPT",
+ "halt generation at PROMPT, return control in interactive mode\n"
+ "can be specified more than once for multiple prompts" });
+ options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
+ options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
+ options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
+ options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
+ options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
+ options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
+ options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
+ options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+
+ options.push_back({ "sampling" });
+ options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
+ "(default: %s)", sampler_type_names.c_str() });
+ options.push_back({ "*", " --sampling-seq SEQUENCE",
+ "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
+ options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
+ options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
+ options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp });
+ options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
+ options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
+ options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
+ options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
+ options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
+ options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
+ options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
+ options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
+ options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
+ options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
+ options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
+ options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n"
+ "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
+ "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
+ options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
+ options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
+ options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
+ "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
+ "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
+ options.push_back({ "main", " --cfg-negative-prompt PROMPT",
+ "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
+ options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
+ "negative prompt file to use for guidance" });
+ options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
+
+ options.push_back({ "grammar" });
+ options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
+ options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
+ options.push_back({ "*", "-j, --json-schema SCHEMA",
+ "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n"
+ "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
+
+ options.push_back({ "embedding" });
+ options.push_back({ "embedding", " --pooling {none,mean,cls}",
+ "pooling type for embeddings, use model default if unspecified" });
+
+ options.push_back({ "context hacking" });
+ options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
+ "RoPE frequency scaling method, defaults to linear unless specified by the model" });
+ options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" });
+ options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" });
+ options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" });
+ options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx });
+ options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor });
+ options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor });
+ options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow });
+ options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast });
+ options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n });
+ options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w });
+ options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" });
+ options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
+ options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
+ options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
+
+ options.push_back({ "perplexity" });
+ options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
+ options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" });
+ options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks });
+ options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" });
+ options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks });
+ options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" });
+ options.push_back({ "perplexity", " --multiple-choice-tasks N",
+ "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks });
+ options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" });
+ options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride });
+ options.push_back({ "perplexity", " --ppl-output-type {0,1}",
+ "output type for perplexity calculation (default: %d)", params.ppl_output_type });
+
+ options.push_back({ "parallel" });
+ options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
+ options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
+ options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
+ options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
+
+ options.push_back({ "multi-modality" });
+ options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
+ options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
+
+ options.push_back({ "backend" });
+ options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
if (llama_supports_mlock()) {
- printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
+ options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
}
if (llama_supports_mmap()) {
- printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+ options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
}
- printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
- printf(" - distribute: spread execution evenly over all nodes\n");
- printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
- printf(" - numactl: use the CPU map provided by numactl\n");
- printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
- printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
+ options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
+ " - distribute: spread execution evenly over all nodes\n"
+ " - isolate: only spawn threads on CPUs on the node that execution started on\n"
+ " - numactl: use the CPU map provided by numactl\n"
+ "if run without this previously, it is recommended to drop the system page cache before using this\n"
+ "see https://github.com/ggerganov/llama.cpp/issues/1437" });
+
if (llama_supports_gpu_offload()) {
- printf(" -ngl N, --n-gpu-layers N\n");
- printf(" number of layers to store in VRAM\n");
- printf(" -ngld N, --n-gpu-layers-draft N\n");
- printf(" number of layers to store in VRAM for the draft model\n");
- printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
- printf(" how to split the model across multiple GPUs, one of:\n");
- printf(" - none: use one GPU only\n");
- printf(" - layer (default): split layers and KV across GPUs\n");
- printf(" - row: split rows across GPUs\n");
- printf(" -ts SPLIT, --tensor-split SPLIT\n");
- printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
- printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
- printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
+ options.push_back({ "*", "-ngl, --gpu-layers N",
+ "number of layers to store in VRAM" });
+ options.push_back({ "*", "-ngld, --gpu-layers-draft N",
+ "number of layers to store in VRAM for the draft model" });
+ options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
+ "how to split the model across multiple GPUs, one of:\n"
+ " - none: use one GPU only\n"
+ " - layer (default): split layers and KV across GPUs\n"
+ " - row: split rows across GPUs" });
+ options.push_back({ "*", "-ts, --tensor-split SPLIT",
+ "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
+ options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
+ "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
}
- printf(" --rpc SERVERS comma separated list of RPC servers\n");
- printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
- printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
- printf(" -gan N, --grp-attn-n N\n");
- printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
- printf(" -gaw N, --grp-attn-w N\n");
- printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
- printf(" -dkvc, --dump-kv-cache\n");
- printf(" verbose print of the KV cache\n");
- printf(" -nkvo, --no-kv-offload\n");
- printf(" disable KV offload\n");
- printf(" -ctk TYPE, --cache-type-k TYPE\n");
- printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
- printf(" -ctv TYPE, --cache-type-v TYPE\n");
- printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
- printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
- printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
- printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
- printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
- printf(" --control-vector FNAME\n");
- printf(" add a control vector\n");
- printf(" --control-vector-scaled FNAME S\n");
- printf(" add a control vector with user defined scaling S\n");
- printf(" --control-vector-layer-range START END\n");
- printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
- printf(" -m FNAME, --model FNAME\n");
- printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
- printf(" -md FNAME, --model-draft FNAME\n");
- printf(" draft model for speculative decoding (default: unused)\n");
- printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
- printf(" model download url (default: unused)\n");
- printf(" -hfr REPO, --hf-repo REPO\n");
- printf(" Hugging Face model repository (default: unused)\n");
- printf(" -hff FILE, --hf-file FILE\n");
- printf(" Hugging Face model file (default: unused)\n");
- printf(" -ld LOGDIR, --logdir LOGDIR\n");
- printf(" path under which to save YAML logs (no logging if unset)\n");
- printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
- printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
- printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
- printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
- printf(" --override-kv KEY=TYPE:VALUE\n");
- printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
- printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
- printf(" -ptc N, --print-token-count N\n");
- printf(" print token count every N tokens (default: %d)\n", params.n_print);
- printf(" --check-tensors check model tensor data for invalid values\n");
- printf("\n");
+
+ options.push_back({ "model" });
+ options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" });
+ options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
+ "advanced option to override model metadata by key. may be specified multiple times.\n"
+ "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
+ options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
+ options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
+ options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
+ options.push_back({ "*", " --control-vector FNAME", "add a control vector" });
+ options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
+ "add a control vector with user defined scaling SCALE" });
+ options.push_back({ "*", " --control-vector-layer-range START END",
+ "layer range to apply the control vector(s) to, start and end inclusive" });
+ options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
+ "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
+ options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
+ options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
+ options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
+ options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
+
+ options.push_back({ "retrieval" });
+ options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
+ options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size });
+ options.push_back({ "retrieval", " --chunk-separator STRING",
+ "separator between chunks (default: '%s')", params.chunk_separator.c_str() });
+
+ options.push_back({ "passkey" });
+ options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
+ options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
+
+ options.push_back({ "imatrix" });
+ options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
+ options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
+ options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
+ options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
+ options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
+ options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
+
+ options.push_back({ "bench" });
+ options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
+ options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
+ options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" });
+ options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" });
+
+ options.push_back({ "server" });
+ options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
+ options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
+ options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
+ options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+ options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
+ options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
+ options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
+ options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" });
+ options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
+ options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http });
+ options.push_back({ "server", " --system-prompt-file FNAME",
+ "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
+ options.push_back({ "server", " --log-format {text,json}",
+ "log output format: json or text (default: json)" });
+ options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" });
+ options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" });
+ options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" });
+ options.push_back({ "server", " --chat-template JINJA_TEMPLATE",
+ "set custom jinja chat template (default: template taken from model's metadata)\n"
+ "only commonly used templates are accepted:\n"
+ "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
+ options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
+ "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+
#ifndef LOG_DISABLE_LOGS
- log_print_usage();
+ options.push_back({ "logging" });
+ options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" });
+ options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" });
+ options.push_back({ "logging", " --log-test", "Run simple logging test" });
+ options.push_back({ "logging", " --log-disable", "Disable trace logs" });
+ options.push_back({ "logging", " --log-enable", "Enable trace logs" });
+ options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" });
+ options.push_back({ "logging", " --log-new", "Create a separate new log file on start. "
+ "Each log file will have unique name: \"..log\"" });
+ options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
#endif // LOG_DISABLE_LOGS
+
+ printf("usage: %s [options]\n", argv[0]);
+
+ for (const auto & o : options) {
+ if (!o.grp.empty()) {
+ printf("\n%s:\n\n", o.grp.c_str());
+ continue;
+ }
+ printf(" %-32s", o.args.c_str());
+ if (o.args.length() > 30) {
+ printf("\n%34s", "");
+ }
+
+ const auto desc = o.desc;
+ size_t start = 0;
+ size_t end = desc.find('\n');
+ while (end != std::string::npos) {
+ printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
+ start = end + 1;
+ end = desc.find('\n', start);
+ }
+
+ printf("%s\n", desc.substr(start).c_str());
+ }
+ printf("\n");
}
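Note on the new table-driven usage printer: every entry carries a tag list ("*" meaning all examples), but per the `// TODO: filter by tags` above nothing consumes the tags yet. A minimal sketch of what such filtering could look like, assuming `option_info` were visible at this scope and a hypothetical `ex` string naming the current example — this is not part of the patch:

    // keep group headers, global options ("*"), and options tagged with the current example
    auto filter_options = [](const std::vector<option_info> & all, const std::string & ex) {
        std::vector<option_info> out;
        for (const auto & o : all) {
            if (!o.grp.empty() || o.tags == "*" || o.tags.find(ex) != std::string::npos) {
                out.push_back(o);
            }
        }
        return out;
    };
    // e.g.: const auto visible = filter_options(options, "server");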
std::string gpt_params_get_system_info(const gpt_params & params) {
@@ -1610,24 +2014,6 @@ std::string string_get_sortable_timestamp() {
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
-std::string string_random_prompt(std::mt19937 & rng) {
- const int r = rng() % 10;
- switch (r) {
- case 0: return "So";
- case 1: return "Once upon a time";
- case 2: return "When";
- case 3: return "The";
- case 4: return "After";
- case 5: return "If";
- case 6: return "import";
- case 7: return "He";
- case 8: return "She";
- case 9: return "They";
- }
-
- GGML_UNREACHABLE();
-}
-
void string_process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
@@ -1887,6 +2273,16 @@ std::string fs_get_cache_directory() {
return ensure_trailing_slash(cache_directory);
}
+std::string fs_get_cache_file(const std::string & filename) {
+ GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
+ std::string cache_directory = fs_get_cache_directory();
+ const bool success = fs_create_directory_with_parents(cache_directory);
+ if (!success) {
+ throw std::runtime_error("failed to create cache directory: " + cache_directory);
+ }
+ return cache_directory + filename;
+}
+
//
// Model utils
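A brief usage sketch for the new `fs_get_cache_file` helper above; the filename is illustrative, and the error handling mirrors the `std::runtime_error` the helper throws when the cache directory cannot be created:

    // illustrative: resolve a file inside the cache directory (created on demand)
    try {
        const std::string manifest_path = fs_get_cache_file("manifest.json");
        // ... read or write the file at manifest_path ...
    } catch (const std::runtime_error & err) {
        fprintf(stderr, "cache unavailable: %s\n", err.what());
    }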
@@ -2503,6 +2899,12 @@ bool llama_should_add_bos_token(const llama_model * model) {
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
}
+bool llama_chat_verify_template(const std::string & tmpl) {
+ llama_chat_message chat[] = {{"user", "test"}};
+ int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+ return res >= 0;
+}
+
//
// KV cache utils
//
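The new `llama_chat_verify_template` above only checks that `llama_chat_apply_template` accepts the template string. A minimal sketch of the intended call pattern for a user-supplied `--chat-template` value (the early-exit behaviour is an assumption for illustration, not something this patch adds):

    // reject unsupported templates at startup instead of failing per request;
    // an empty string means "use the template from the model's metadata"
    if (!params.chat_template.empty() && !llama_chat_verify_template(params.chat_template)) {
        fprintf(stderr, "error: chat template is not supported: %s\n", params.chat_template.c_str());
        exit(1);
    }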
@@ -2844,7 +3246,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
- fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
@@ -2903,9 +3304,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
- fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
- fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
@@ -2955,7 +3354,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
- fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
fprintf(stream, "reverse_prompt:\n");
diff --git a/common/common.h b/common/common.h
index 264504830..2345d855e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -56,66 +56,67 @@ struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = cpu_get_num_math();
- int32_t n_threads_draft = -1;
- int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
- int32_t n_threads_batch_draft = -1;
- int32_t n_predict = -1; // new tokens to predict
- int32_t n_ctx = 512; // context size
- int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
- int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
- int32_t n_keep = 0; // number of tokens to keep from initial prompt
- int32_t n_draft = 5; // number of tokens to draft during speculative decoding
- int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
- int32_t n_parallel = 1; // number of parallel sequences to decode
- int32_t n_sequences = 1; // number of sequences to decode
- float p_split = 0.1f; // speculative decoding split probability
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
- int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
- llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
- int32_t n_beams = 0; // if non-zero then use beam search of given width.
- int32_t grp_attn_n = 1; // group-attention factor
- int32_t grp_attn_w = 512; // group-attention width
- int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
- float rope_freq_base = 0.0f; // RoPE base frequency
- float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+ int32_t n_threads_draft = -1;
+ int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+ int32_t n_threads_batch_draft = -1;
+ int32_t n_predict = -1; // new tokens to predict
+ int32_t n_ctx = 0; // context size
+ int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
+ int32_t n_draft = 5; // number of tokens to draft during speculative decoding
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+ int32_t n_parallel = 1; // number of parallel sequences to decode
+ int32_t n_sequences = 1; // number of sequences to decode
+ float p_split = 0.1f; // speculative decoding split probability
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+ int32_t n_beams = 0; // if non-zero then use beam search of given width.
+ int32_t grp_attn_n = 1; // group-attention factor
+ int32_t grp_attn_w = 512; // group-attention width
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
+ float rope_freq_base = 0.0f; // RoPE base frequency
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
- float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+ float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = 32.0f; // YaRN low correction dim
- float yarn_beta_slow = 1.0f; // YaRN high correction dim
- int32_t yarn_orig_ctx = 0; // YaRN original context length
+ float yarn_beta_slow = 1.0f; // YaRN high correction dim
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
- std::string rpc_servers = ""; // comma separated list of RPC servers
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
// // sampling parameters
struct llama_sampling_params sparams;
- std::string model = ""; // model path
- std::string model_draft = ""; // draft model for speculative decoding
+ std::string model = ""; // model path
+ std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
- std::string model_url = ""; // model url to download
- std::string hf_repo = ""; // HF repo
- std::string hf_file = ""; // HF file
+ std::string model_url = ""; // model url to download
+ std::string hf_repo = ""; // HF repo
+ std::string hf_file = ""; // HF file
std::string prompt = "";
- std::string prompt_file = ""; // store the external prompt file name
- std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
- std::string input_prefix = ""; // string to prefix user inputs with
- std::string input_suffix = ""; // string to suffix user inputs with
- std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
- std::string logdir = ""; // directory in which to save YAML log files
+ std::string prompt_file = ""; // store the external prompt file name
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+ std::string input_prefix = ""; // string to prefix user inputs with
+ std::string input_suffix = ""; // string to suffix user inputs with
+ std::string logdir = ""; // directory in which to save YAML log files
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
- std::string logits_file = ""; // file for saving *all* logits
+ std::string logits_file = ""; // file for saving *all* logits
+ std::string rpc_servers = ""; // comma separated list of RPC servers
+ std::vector<std::string> in_files; // all input files
+ std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
// TODO: avoid tuple, use struct
@@ -124,37 +125,36 @@ struct gpt_params {
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+ int32_t verbosity = 0;
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
- int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
- int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
- // (which is more convenient to use for plotting)
- //
- bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
- size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+ int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+ // (which is more convenient to use for plotting)
+ //
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
- bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
- size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+ size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
- bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
- size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
- bool kl_divergence = false; // compute KL divergence
+ bool kl_divergence = false; // compute KL divergence
- bool random_prompt = false; // do not randomize prompt if none provided
+ bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
- bool interactive = false; // interactive mode
- bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
bool special = false; // enable special token output
+ bool interactive = false; // interactive mode
+ bool interactive_first = false; // wait for user input immediately
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
- bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
bool embedding = false; // get only sentence embedding
- bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
- bool interactive_first = false; // wait for user input immediately
+ bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
@@ -162,7 +162,6 @@ struct gpt_params {
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
- bool instruct = false; // instruction mode (used for Alpaca models)
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
@@ -180,6 +179,59 @@ struct gpt_params {
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
+
+ // server params
+ int32_t port = 8080; // server listens on this network port
+ int32_t timeout_read = 600; // http read timeout in seconds
+ int32_t timeout_write = timeout_read; // http write timeout in seconds
+ int32_t n_threads_http = -1; // number of threads to process HTTP requests
+
+ std::string hostname = "127.0.0.1";
+ std::string public_path = "";
+ std::string chat_template = "";
+ std::string system_prompt = "";
+
+ std::vector<std::string> api_keys;
+
+ std::string ssl_file_key = "";
+ std::string ssl_file_cert = "";
+
+ bool endpoint_slots = true;
+ bool endpoint_metrics = false;
+
+ bool log_json = false;
+
+ std::string slot_save_path;
+
+ float slot_prompt_similarity = 0.5f;
+
+ // batched-bench params
+ bool is_pp_shared = false;
+
+ std::vector<int32_t> n_pp;
+ std::vector<int32_t> n_tg;
+ std::vector<int32_t> n_pl;
+
+ // retrieval params
+ std::vector<std::string> context_files; // context files to embed
+
+ int32_t chunk_size = 64; // chunk size for context embedding
+
+ std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+ // passkey params
+ int32_t n_junk = 250; // number of times to repeat the junk text
+ int32_t i_pos = -1; // position of the passkey in the junk text
+
+ // imatrix params
+ std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+ int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
+ int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
+ int32_t i_chunk = 0; // start processing from this chunk
+
+ bool process_output = false; // collect data for the output tensor
+ bool compute_ppl = true; // whether to compute perplexity
};
void gpt_params_handle_model_default(gpt_params & params);
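Since `gpt_params` now carries the per-example settings directly (server, retrieval, passkey, imatrix, bench), tools can set them in code as well as via flags. A small illustrative sketch; the values are arbitrary:

    gpt_params params;
    params.hostname   = "0.0.0.0";        // server: listen on all interfaces
    params.port       = 8013;             // server: non-default port
    params.chunk_size = 128;              // retrieval: larger embedding chunks
    params.n_junk     = 500;              // passkey: longer junk section
    params.out_file   = "my-imatrix.dat"; // imatrix: custom output file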
@@ -199,7 +251,20 @@ std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
-std::string string_random_prompt(std::mt19937 & rng);
+
+template<typename T>
+static std::vector<T> string_split(const std::string & str, char delim) {
+ std::vector<T> values;
+ std::istringstream str_stream(str);
+ std::string token;
+ while (std::getline(str_stream, token, delim)) {
+ T value;
+ std::istringstream token_stream(token);
+ token_stream >> value;
+ values.push_back(value);
+ }
+ return values;
+}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
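Usage sketch for the templated `string_split` above, e.g. turning the comma-separated values passed to flags like `--tensor-split` or `-npp` into typed vectors (the literals are illustrative):

    // "3,1"     -> {3.0f, 1.0f}
    // "128,256" -> {128, 256}
    const std::vector<float>   split_arg = string_split<float>("3,1", ',');
    const std::vector<int32_t> n_pp      = string_split<int32_t>("128,256", ',');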
@@ -212,6 +277,7 @@ bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
+std::string fs_get_cache_file(const std::string & filename);
//
// Model utils
@@ -282,6 +348,13 @@ std::string llama_detokenize_bpe(
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
+//
+// Chat template utils
+//
+
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool llama_chat_verify_template(const std::string & tmpl);
+
//
// KV cache utils
//
diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
index b5bc7d49b..a518b766d 100644
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -46,8 +46,12 @@ namespace grammar_parser {
state.rules[rule_id] = rule;
}
+ static bool is_digit_char(char c) {
+ return '0' <= c && c <= '9';
+ }
+
static bool is_word_char(char c) {
- return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+ return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
}
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
@@ -99,6 +103,17 @@ namespace grammar_parser {
return pos;
}
+ static const char * parse_int(const char * src) {
+ const char * pos = src;
+ while (is_digit_char(*pos)) {
+ pos++;
+ }
+ if (pos == src) {
+ throw std::runtime_error(std::string("expecting integer at ") + src);
+ }
+ return pos;
+ }
+
static std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
@@ -137,6 +152,60 @@ namespace grammar_parser {
bool is_nested) {
size_t last_sym_start = out_elements.size();
const char * pos = src;
+
+ auto handle_repetitions = [&](int min_times, int max_times) {
+
+ if (last_sym_start == out_elements.size()) {
+ throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
+ }
+
+ // apply transformation to previous symbol (last_sym_start to end) according to
+ // the following rewrite rules:
+ // S{m,n} --> S S S (m times) S'(n-m)
+ // S'(x) ::= S S'(x-1) |
+ // (... n-m definitions of these S' rules ...)
+ // S'(1) ::= S |
+ // S{m,} --> S S S (m times) S'
+ // S' ::= S S' |
+ // S* --> S{0,}
+ // --> S' ::= S S' |
+ // S+ --> S{1,}
+ // --> S S'
+ // S' ::= S S' |
+ // S? --> S{0,1}
+ // --> S'
+ // S' ::= S |
+
+ std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
+ if (min_times == 0) {
+ out_elements.resize(last_sym_start);
+ } else {
+ // Repeat the previous elements (min_times - 1) times
+ for (int i = 1; i < min_times; i++) {
+ out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
+ }
+ }
+
+ uint32_t last_rec_rule_id = 0;
+ auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+
+ std::vector<llama_grammar_element> rec_rule(previous_elements);
+ for (int i = 0; i < n_opt; i++) {
+ rec_rule.resize(previous_elements.size());
+ uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
+ if (i > 0 || max_times < 0) {
+ rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+ }
+ rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+ rec_rule.push_back({LLAMA_GRETYPE_END, 0});
+ add_rule(state, rec_rule_id, rec_rule);
+ last_rec_rule_id = rec_rule_id;
+ }
+ if (n_opt > 0) {
+ out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
+ }
+ };
+
while (*pos) {
if (*pos == '"') { // literal string
pos++;
@@ -197,40 +266,51 @@ namespace grammar_parser {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
- } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
- if (last_sym_start == out_elements.size()) {
- throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
- }
-
- // apply transformation to previous symbol (last_sym_start to end) according to
- // rewrite rules:
- // S* --> S' ::= S S' |
- // S+ --> S' ::= S S' | S
- // S? --> S' ::= S |
- uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
- std::vector<llama_grammar_element> sub_rule;
- // add preceding symbol to generated rule
- sub_rule.insert(
- sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
- if (*pos == '*' || *pos == '+') {
- // cause generated rule to recurse
- sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
- }
- // mark start of alternate def
- sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
- if (*pos == '+') {
- // add preceding symbol as alternate only for '+' (otherwise empty)
- sub_rule.insert(
- sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
- }
- sub_rule.push_back({LLAMA_GRETYPE_END, 0});
- add_rule(state, sub_rule_id, sub_rule);
-
- // in original rule, replace previous symbol with reference to generated rule
- out_elements.resize(last_sym_start);
- out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-
+ } else if (*pos == '.') { // any char
+ last_sym_start = out_elements.size();
+ out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
pos = parse_space(pos + 1, is_nested);
+ } else if (*pos == '*') {
+ pos = parse_space(pos + 1, is_nested);
+ handle_repetitions(0, -1);
+ } else if (*pos == '+') {
+ pos = parse_space(pos + 1, is_nested);
+ handle_repetitions(1, -1);
+ } else if (*pos == '?') {
+ pos = parse_space(pos + 1, is_nested);
+ handle_repetitions(0, 1);
+ } else if (*pos == '{') {
+ pos = parse_space(pos + 1, is_nested);
+
+ if (!is_digit_char(*pos)) {
+ throw std::runtime_error(std::string("expecting an int at ") + pos);
+ }
+ const char * int_end = parse_int(pos);
+ int min_times = std::stoul(std::string(pos, int_end - pos));
+ pos = parse_space(int_end, is_nested);
+
+ int max_times = -1;
+
+ if (*pos == '}') {
+ max_times = min_times;
+ pos = parse_space(pos + 1, is_nested);
+ } else if (*pos == ',') {
+ pos = parse_space(pos + 1, is_nested);
+
+ if (is_digit_char(*pos)) {
+ const char * int_end = parse_int(pos);
+ max_times = std::stoul(std::string(pos, int_end - pos));
+ pos = parse_space(int_end, is_nested);
+ }
+
+ if (*pos != '}') {
+ throw std::runtime_error(std::string("expecting '}' at ") + pos);
+ }
+ pos = parse_space(pos + 1, is_nested);
+ } else {
+ throw std::runtime_error(std::string("expecting ',' at ") + pos);
+ }
+ handle_repetitions(min_times, max_times);
} else {
break;
}
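Worked example of the rewrite that `handle_repetitions` performs, following the rule comments above (the primed rule names are illustrative): a bounded repetition such as `x{2,4}` emits the mandatory copies first and then a chain of optional rules for the remaining `n - m` occurrences:

    x{2,4}  -->  x x x'(2)
    x'(2) ::= x x'(1) |
    x'(1) ::= x |

An unbounded form such as `x{2,}` instead appends a single self-recursive rule `x' ::= x x' |` after the two mandatory copies.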
@@ -325,6 +405,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT: return true;
case LLAMA_GRETYPE_CHAR_ALT: return true;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+ case LLAMA_GRETYPE_CHAR_ANY: return true;
default: return false;
}
}
@@ -339,6 +420,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
+ case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
}
switch (elem.type) {
case LLAMA_GRETYPE_END:
@@ -350,6 +432,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ALT:
+ case LLAMA_GRETYPE_CHAR_ANY:
fprintf(file, "(\"");
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
@@ -407,11 +490,15 @@ namespace grammar_parser {
}
print_grammar_char(file, elem.value);
break;
+ case LLAMA_GRETYPE_CHAR_ANY:
+ fprintf(file, ".");
+ break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+ case LLAMA_GRETYPE_CHAR_ANY:
break;
default:
fprintf(file, "] ");
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 9a71f5d8d..737bae27c 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -16,58 +16,27 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa
static std::string repeat(const std::string & str, size_t n);
-static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
+ auto has_max = max_items != std::numeric_limits<int>::max();
+
+ if (min_items == 0 && max_items == 1) {
+ return item_rule + "?";
+ }
+
if (separator_rule.empty()) {
- if (min_items == 0 && max_items == 1) {
- return item_rule + "?";
- } else if (min_items == 1 && max_items == std::numeric_limits::max()) {
+ if (min_items == 1 && !has_max) {
return item_rule + "+";
- }
- }
-
- std::string result;
- if (min_items > 0) {
- if (item_rule_is_literal && separator_rule.empty()) {
- result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
+ } else if (min_items == 0 && !has_max) {
+ return item_rule + "*";
} else {
- std::vector<std::string> items(min_items, item_rule);
- result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
+ return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
}
}
- std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
- auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
-
- if (up_to_n == 0) {
- return "";
- } else if (up_to_n == 1) {
- return "(" + content + ")?";
- } else if (!separator_rule.empty() && !prefix_with_sep) {
- return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
- } else {
- std::string res = repeat("(" + content + " ", up_to_n);
- // strip trailing space
- res = res.substr(0, res.length() - 1);
- res += repeat(")?", up_to_n);
- return res;
- }
- };
-
- if (min_items > 0 && max_items != min_items) {
- result += " ";
+ auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
+ if (min_items == 0) {
+ result = "(" + result + ")?";
}
-
- if (max_items != std::numeric_limits<int>::max()) {
- result += opt_repetitions(max_items - min_items, min_items > 0);
- } else {
- std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
- if (min_items == 0 && !separator_rule.empty()) {
- result = "(" + item_rule + " " + item_operator + "*)?";
- } else {
- result += item_operator + "*";
- }
- }
-
return result;
}
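With `{m,n}` now understood by the grammar parser, the simplified `build_repetition` above can emit it directly. A few illustrative input/output pairs under the new code path (separator-free, unbounded, and separated cases; `<no max>` stands for the "no upper bound" sentinel):

    build_repetition("[0-9]", 3, 5)            ->  [0-9]{3,5}
    build_repetition("[0-9]", 1, <no max>)     ->  [0-9]+
    build_repetition(item, 0, <no max>, sep)   ->  (item (sep item)*)?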
@@ -78,30 +47,24 @@ struct BuiltinRule {
std::vector<std::string> deps;
};
-const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
-
std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"boolean", {"(\"true\" | \"false\") space", {}}},
- {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
- {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
+ {"decimal-part", {"[0-9]{1,16}", {}}},
+ {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
{"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
{"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
- {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
- {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
+ {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
+ {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
- {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
- {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+ {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+ {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
{"date-time", {"date \"T\" time", {"date", "time"}}},
{"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
{"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
@@ -385,8 +348,7 @@ private:
sub_is_literal ? "\"" + sub + "\"" : sub,
min_times,
max_times,
- "",
- sub_is_literal
+ ""
);
seq.back().second = false;
} else {
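Note (not part of the patch): the simplified `build_repetition` relies on the GBNF grammar now accepting bounded-repetition syntax (`?`, `*`, `+`, `{m,n}`), so the old expansion into nested optional groups is no longer needed. A minimal Python sketch of the same logic, assuming an unbounded `max_items` is represented as `None`:

```python
# Sketch mirroring the simplified C++ build_repetition above; None = no upper bound.
def build_repetition(item_rule, min_items, max_items, separator_rule=""):
    if min_items == 0 and max_items == 1:
        return f"{item_rule}?"
    if not separator_rule:
        if min_items == 1 and max_items is None:
            return f"{item_rule}+"
        if min_items == 0 and max_items is None:
            return f"{item_rule}*"
        return f"{item_rule}{{{min_items},{max_items if max_items is not None else ''}}}"
    # with a separator: item (sep item){min-1,max-1}, wrapped in (...)? when min_items == 0
    result = item_rule + " " + build_repetition(
        f"({separator_rule} {item_rule})",
        0 if min_items == 0 else min_items - 1,
        None if max_items is None else max_items - 1,
    )
    return f"({result})?" if min_items == 0 else result

assert build_repetition("[0-9]", 1, 16) == "[0-9]{1,16}"
assert build_repetition("value", 0, None, '","') == '(value ("," value)*)?'
```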
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 84b72348d..f43b15760 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
@@ -82,6 +83,7 @@ models = [
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+ {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
]
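Note (not part of the patch): adding a model here only registers its tokenizer repo; the pre-tokenizer is then identified by a hash of the token ids produced for a fixed checksum text, which is matched against entries like the `jina-v2-code` one added below. The sketch is an assumption about that fingerprinting step; the authoritative checksum text and logic live in the convert scripts, and `pre_tokenizer_hash` is a hypothetical helper:

```python
# Hedged sketch of the pre-tokenizer fingerprint (chkhsh); assumes the hash is a
# sha256 over the token-id sequence of a fixed checksum text.
import hashlib
from transformers import AutoTokenizer

def pre_tokenizer_hash(model_dir: str, chktxt: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    ids = tokenizer.encode(chktxt)
    # any change in pre-tokenization changes the ids and therefore the hash
    return hashlib.sha256(str(ids).encode()).hexdigest()
```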
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index ce7df52bd..025405a2c 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
from __future__ import annotations
@@ -46,7 +47,7 @@ class Model:
_model_classes: dict[str, type[Model]] = {}
dir_model: Path
- ftype: int
+ ftype: gguf.LlamaFileType
is_big_endian: bool
endianess: gguf.GGUFEndian
use_temp_file: bool
@@ -74,10 +75,10 @@ class Model:
self.use_temp_file = use_temp_file
self.lazy = not eager
self.model_name = model_name
- self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+ self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
self.is_safetensors = len(self.part_names) > 0
if not self.is_safetensors:
- self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
+ self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.hparams = Model.load_hparams(self.dir_model)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@@ -95,7 +96,7 @@ class Model:
ftype_lw: str = ftype_up.lower()
# allow templating the file name with the output ftype, useful with the "auto" ftype
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
- self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+ self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
@classmethod
def __init_subclass__(cls):
@@ -325,21 +326,21 @@ class Model:
def write(self):
self.write_tensors()
- self.gguf_writer.write_header_to_file()
+ self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file(progress=True)
self.gguf_writer.close()
def write_vocab(self):
- self.gguf_writer.write_header_to_file()
+ self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.close()
@staticmethod
- def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
part_names: list[str] = []
for filename in os.listdir(dir_model):
- if filename.endswith(suffix):
+ if filename.startswith(prefix) and filename.endswith(suffix):
part_names.append(filename)
part_names.sort()
@@ -476,6 +477,9 @@ class Model:
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe"
+ if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+ # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
+ res = "jina-v2-code"
if res is None:
logger.warning("\n")
@@ -2453,11 +2457,13 @@ class JinaBertV2Model(BertModel):
def get_tensors(self):
for name, data in super().get_tensors():
- if 'gated_layers' in name:
+ if 'gated_layer' in name:
d1 = data[:self.intermediate_size, :]
name1 = name.replace('gated_layers', 'gated_layers_w')
+ name1 = name1.replace('up_gated_layer', 'gated_layers_v')
d2 = data[self.intermediate_size:, :]
name2 = name.replace('gated_layers', 'gated_layers_v')
+ name2 = name2.replace('up_gated_layer', 'gated_layers_w')
yield name1, d1
yield name2, d2
continue
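Note (not part of the patch): the broader `gated_layer` match above covers both the fused `gated_layers` tensor and the newer `up_gated_layer` layout; the tensor is cut at `intermediate_size` along the first dimension and the w/v names are swapped for the `up_gated_layer` case. A small Python sketch of that split, with `split_gated` as a hypothetical helper:

```python
# Sketch of the JinaBertV2 gated-MLP split performed in get_tensors() above.
import numpy as np

def split_gated(name: str, data: np.ndarray, intermediate_size: int):
    d1, d2 = data[:intermediate_size, :], data[intermediate_size:, :]
    name1 = name.replace("gated_layers", "gated_layers_w").replace("up_gated_layer", "gated_layers_v")
    name2 = name.replace("gated_layers", "gated_layers_v").replace("up_gated_layer", "gated_layers_w")
    return (name1, d1), (name2, d2)
```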
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index b40ee4ccb..53002f8e1 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -15,7 +15,6 @@ else()
add_subdirectory(baby-llama)
add_subdirectory(batched)
add_subdirectory(batched-bench)
- add_subdirectory(beam-search)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index bf0125e75..4f6c3746a 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -522,8 +522,8 @@ static struct ggml_tensor * forward(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, 1]
// Kcur shape [n_embd/n_head, n_head, N, 1]
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
+ struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
// store key and value to memory
{
@@ -759,8 +759,8 @@ static struct ggml_tensor * forward_batch(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
+ struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@@ -1056,7 +1056,7 @@ static struct ggml_tensor * forward_lora(
model->layers[il].wqb,
cur)),
n_embd/n_head, n_head, N),
- KQ_pos, n_rot, 0, 0);
+ KQ_pos, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0,
ggml_reshape_3d(ctx0,
ggml_mul_mat(ctx0,
@@ -1065,7 +1065,7 @@ static struct ggml_tensor * forward_lora(
model->layers[il].wkb,
cur)),
n_embd/n_head, n_head, N),
- KQ_pos, n_rot, 0, 0);
+ KQ_pos, n_rot, 0);
// store key and value to memory
{
diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md
index bf951baf7..fa4baf640 100644
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ]
+./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
+./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
+./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
# custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
```
## Sample results
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 2924d8116..718f0a61a 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -28,67 +28,27 @@ static std::vector<int> parse_list(char * p) {
return ret;
}
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+ LOG_TEE("\n");
+}
+
int main(int argc, char ** argv) {
gpt_params params;
- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] \n" , argv[0]);
- printf(" , and PL are comma-separated lists of numbers without spaces\n\n");
- printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
- return 1 ;
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
}
- int n_kv_max = 2048;
- int n_batch = 2048;
- int n_ubatch = 512;
- bool flash_attn = false;
- int is_pp_shared = 0;
- int n_gpu_layers = 0;
+ int is_pp_shared = params.is_pp_shared;
- std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
- std::vector<int> n_tg = { 128, 256, };
- std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
- //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
-
- if (argc >= 2) {
- params.model = argv[1];
- }
-
- if (argc >= 3) {
- n_kv_max = std::atoi(argv[2]);
- }
-
- if (argc >= 4) {
- n_batch = std::atoi(argv[3]);
- }
-
- if (argc >= 5) {
- n_ubatch = std::atoi(argv[4]);
- }
-
- if (argc >= 6) {
- flash_attn = std::atoi(argv[5]);
- }
-
- if (argc >= 7) {
- is_pp_shared = std::atoi(argv[6]);
- }
-
- if (argc >= 8) {
- n_gpu_layers = std::atoi(argv[7]);
- }
-
- if (argc >= 9) {
- n_pp = parse_list(argv[8]);
- }
-
- if (argc >= 10) {
- n_tg = parse_list(argv[9]);
- }
-
- if (argc >= 11) {
- n_pl = parse_list(argv[10]);
- }
+ std::vector<int> n_pp = params.n_pp;
+ std::vector<int> n_tg = params.n_tg;
+ std::vector<int> n_pl = params.n_pl;
// init LLM
@@ -97,12 +57,7 @@ int main(int argc, char ** argv) {
// initialize the model
- llama_model_params model_params = llama_model_default_params();
-
- const std::vector<float> t_split(llama_max_devices(), 0.0f);
-
- model_params.n_gpu_layers = n_gpu_layers;
- model_params.tensor_split = t_split.data();
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -111,16 +66,7 @@ int main(int argc, char ** argv) {
return 1;
}
- llama_context_params ctx_params = llama_context_default_params();
-
- ctx_params.seed = 1234;
- ctx_params.n_ctx = n_kv_max;
- ctx_params.n_batch = n_batch;
- ctx_params.n_ubatch = n_ubatch;
- ctx_params.flash_attn = flash_attn;
-
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
// ensure enough sequences are available
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
@@ -132,6 +78,8 @@ int main(int argc, char ** argv) {
return 1;
}
+ const int32_t n_kv_max = llama_n_ctx(ctx);
+
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
// decode in batches of ctx_params.n_batch tokens
@@ -175,7 +123,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
- LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+ LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
diff --git a/examples/batched/README.md b/examples/batched/README.md
index 5d7303317..ed204c308 100644
--- a/examples/batched/README.md
+++ b/examples/batched/README.md
@@ -3,7 +3,7 @@
The example demonstrates batched generation from a given prompt
```bash
-./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
+./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
...
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 591bc6e57..62d9b144d 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -7,48 +7,31 @@
#include
#include
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+ LOG_TEE("\n");
+}
+
int main(int argc, char ** argv) {
gpt_params params;
- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
- return 1 ;
+ params.prompt = "Hello my name is";
+ params.n_predict = 32;
+
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
}
+
// number of parallel batches
- int n_parallel = 1;
+ int n_parallel = params.n_parallel;
// total length of the sequences including the prompt
- int n_len = 32;
-
- // number of layers to offload to the GPU
- int n_gpu_layers = 0;
-
- if (argc >= 2) {
- params.model = argv[1];
- }
-
- if (argc >= 3) {
- params.prompt = argv[2];
- }
-
- if (argc >= 4) {
- n_parallel = std::atoi(argv[3]);
- }
-
- if (argc >= 5) {
- n_len = std::atoi(argv[4]);
- }
-
- if (argc >= 6) {
- n_gpu_layers = std::atoi(argv[5]);
- }
-
- if (params.prompt.empty()) {
- params.prompt = "Hello my name is";
- }
-
- string_process_escapes(params.prompt);
+ int n_predict = 32;
// init LLM
@@ -57,9 +40,7 @@ int main(int argc, char ** argv) {
// initialize the model
- llama_model_params model_params = llama_model_default_params();
-
- model_params.n_gpu_layers = n_gpu_layers;
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -73,18 +54,14 @@ int main(int argc, char ** argv) {
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(model, params.prompt, true);
- const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
+ const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
// initialize the context
- llama_context_params ctx_params = llama_context_default_params();
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
- ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
- ctx_params.n_batch = std::max(n_len, n_parallel);
- ctx_params.n_seq_max = n_parallel;
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ ctx_params.n_batch = std::max(n_predict, n_parallel);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
@@ -93,9 +70,9 @@ int main(int argc, char ** argv) {
return 1;
}
- const int n_ctx = llama_n_ctx(ctx);
+ const int n_ctx = llama_n_ctx(ctx);
- LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+ LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
@@ -156,7 +133,7 @@ int main(int argc, char ** argv) {
const auto t_main_start = ggml_time_us();
- while (n_cur <= n_len) {
+ while (n_cur <= n_predict) {
// prepare the next batch
llama_batch_clear(batch);
@@ -192,7 +169,7 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of generation? -> mark the stream as finished
- if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+ if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;
LOG_TEE("\n");
if (n_parallel > 1) {
diff --git a/examples/beam-search/CMakeLists.txt b/examples/beam-search/CMakeLists.txt
deleted file mode 100644
index f0e37468b..000000000
--- a/examples/beam-search/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET beam-search)
-add_executable(${TARGET} beam-search.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
deleted file mode 100644
index 3d34378a5..000000000
--- a/examples/beam-search/beam-search.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include
-#include
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-# define NOMINMAX
-#endif
-#include
-#include
-#endif
-
-// Used for debugging to print out beam tokens.
-struct ostream_beam_view {
- llama_context * ctx;
- llama_beam_view beam_view;
-};
-
-static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
- os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
- for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
- os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
- }
- return os << ')';
-}
-
-// Put here anything you want back in beam_search_callback().
-struct beam_search_callback_data {
- llama_context * ctx;
- std::vector<llama_token> response;
-};
-
-// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
-// For example, eob can be flagged due to maximum token length, stop words, etc.
-static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
- return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
-}
-
-// Function matching type llama_beam_search_callback_fn_t.
-// Custom callback example is called each time the beams lengths increase:
-// * Show progress by printing ',' following by number of convergent beam tokens if any.
-// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
-// This is also called when the stop condition is met.
-// Collect tokens into std::vector response which is pointed to by callback_data.
-static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
- auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
- // Mark beams as EOS as needed.
- for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
- llama_beam_view& beam_view = beams_state.beam_views[i];
- if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
- beam_view.eob = true;
- }
- }
- printf(","); // Show progress
- if (const size_t n = beams_state.common_prefix_length) {
- callback_data.response.resize(callback_data.response.size() + n);
- assert(0u < beams_state.n_beams);
- const llama_token * tokens = beams_state.beam_views[0].tokens;
- std::copy(tokens, tokens + n, callback_data.response.end() - n);
- printf("%zu", n);
- }
- fflush(stdout);
-#if 1 // DEBUG: print current beams for this iteration
- std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
- for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
- std::cout << "beams["< 3 )
- {
- params.prompt = argv[3];
- }
-
- if ( params.prompt.empty() )
- {
- params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
- }
-
- //---------------------------------
- // Init LLM :
- //---------------------------------
-
- llama_backend_init();
- llama_numa_init(params.numa);
-
- llama_model * model;
- llama_context * ctx;
-
- std::tie(model, ctx) = llama_init_from_gpt_params( params );
-
- if ( model == NULL )
- {
- fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
- return 1;
- }
-
- //---------------------------------
- // Tokenize the prompt :
- //---------------------------------
-
- std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
-
- const size_t max_context_size = llama_n_ctx( ctx );
- const size_t max_tokens_list_size = max_context_size - 4 ;
-
- if (tokens_list.size() > max_tokens_list_size)
- {
- fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
- __func__ , tokens_list.size() , max_tokens_list_size );
- return 1;
- }
-
- fprintf( stderr, "\n\n" );
-
- // Print the tokens from the prompt :
-
- for( auto id : tokens_list )
- {
- std::cout << llama_token_to_piece(ctx, id);
- }
- std::cout << std::flush;
-
- int n_past = 0;
-
- if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
- {
- fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
- return 1;
- }
- n_past += tokens_list.size();
-
- beam_search_callback_data callback_data{ctx, {}};
- size_t const beam_width = static_cast<size_t>(params.n_beams);
- int const n_predict = 256;
- llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
-
- std::cout << "\n\n";
- for (llama_token const token_id : callback_data.response) {
- std::cout << llama_token_to_piece(ctx,token_id);
- }
- std::cout << std::endl;
-
- llama_free( ctx );
- llama_free_model( model );
-
- llama_backend_free();
-
- return 0;
-}
diff --git a/examples/convert-legacy-llama.py b/examples/convert-legacy-llama.py
index fd8401015..721a57c00 100755
--- a/examples/convert-legacy-llama.py
+++ b/examples/convert-legacy-llama.py
@@ -176,7 +176,7 @@ class Params:
rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: float | None = None
f_rope_scale: float | None = None
- n_orig_ctx: int | None = None
+ n_ctx_orig: int | None = None
rope_finetuned: bool | None = None
ftype: GGMLFileType | None = None
@@ -226,7 +226,7 @@ class Params:
with open(config_path) as f:
config = json.load(f)
- rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
+ rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None
rope_scaling = config.get("rope_scaling")
if rope_scaling is not None and (typ := rope_scaling.get("type")):
@@ -236,7 +236,7 @@ class Params:
rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN
- n_orig_ctx = rope_scaling['original_max_position_embeddings']
+ n_ctx_orig = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling['finetuned']
else:
raise NotImplementedError(f'Unknown rope scaling type: {typ}')
@@ -272,7 +272,7 @@ class Params:
f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type,
f_rope_scale = f_rope_scale,
- n_orig_ctx = n_orig_ctx,
+ n_ctx_orig = n_ctx_orig,
rope_finetuned = rope_finetuned,
)
@@ -864,8 +864,8 @@ class OutputFile:
self.gguf.add_rope_scaling_type(params.rope_scaling_type)
self.gguf.add_rope_scaling_factor(params.f_rope_scale)
- if params.n_orig_ctx is not None:
- self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
+ if params.n_ctx_orig is not None:
+ self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig)
if params.rope_finetuned is not None:
self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 004399b5f..244751e00 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -63,6 +63,7 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
@@ -79,9 +80,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = string_random_prompt(rng);
- }
llama_backend_init();
llama_numa_init(params.numa);
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 51d67d6d9..64cd338c2 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) {
}
int main(int argc, char ** argv) {
-
callback_data cb_data;
gpt_params params;
+
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
print_build_info();
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = string_random_prompt(rng);
- }
llama_backend_init();
llama_numa_init(params.numa);
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 22425730f..71a4333ee 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -564,7 +564,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int rope_mode = 0;
return ggml_rope_ext(ctx,
- t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
+ t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};
diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
index e04feeae3..881f0451c 100644
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -61,10 +61,10 @@ static size_t split_str_to_n_bytes(std::string str) {
int n;
if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n);
- n_bytes = (size_t)n * 1024 * 1024; // megabytes
+ n_bytes = (size_t)n * 1000 * 1000; // megabytes
} else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n);
- n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
+ n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
} else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
}
@@ -284,7 +284,7 @@ struct split_strategy {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
total_size += ggml_nbytes(t);
}
- total_size = total_size / 1024 / 1024; // convert to megabytes
+ total_size = total_size / 1000 / 1000; // convert to megabytes
printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++;
}
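Note (not part of the patch): the split-size units switch from binary (1024-based) to decimal (1000-based), so `2G` now means exactly 2,000,000,000 bytes. Equivalent logic in Python for illustration:

```python
# Decimal (SI) interpretation of --split-max-size, matching the change above.
def split_str_to_n_bytes(s: str) -> int:
    unit = {"M": 1000 ** 2, "G": 1000 ** 3}[s[-1]]
    return int(s[:-1]) * unit

assert split_str_to_n_bytes("2G") == 2_000_000_000  # previously 2 * 1024**3
```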
diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh
index 7ca6fa7f2..3bc0fa471 100755
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
@@ -41,7 +41,7 @@ echo PASS
echo
# 2b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
echo PASS
echo
@@ -51,7 +51,7 @@ echo PASS
echo
# 3b. Test the merged model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
echo PASS
echo
@@ -61,7 +61,7 @@ echo PASS
echo
# 4b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
echo PASS
echo
@@ -71,7 +71,7 @@ echo
#echo
# 5b. Test the merged model is loading properly
-#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
+#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#echo PASS
#echo
@@ -81,7 +81,7 @@ echo PASS
echo
# 6b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
echo PASS
echo
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 52fd719b3..213515791 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -153,7 +153,9 @@ static std::string gritlm_instruction(const std::string & instruction) {
int main(int argc, char * argv[]) {
gpt_params params;
+
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md
index 458c01b87..866ca9f56 100644
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -6,16 +6,19 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
## Usage
```
-./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
- [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
+./imatrix \
+ -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
+ [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
+ [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
```
Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
The parameters in square brackets are optional and have the following meaning:
* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
-* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
-* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
+* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
For faster computation, make sure to use GPU offloading via the `-ngl` argument
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 25a2351cc..e18f49563 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -17,39 +17,37 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s \\\n"
+ " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+ " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
+ " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+ LOG_TEE("\n");
+}
+
struct Stats {
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
};
-struct StatParams {
- std::string dataset;
- std::string ofile = "imatrix.dat";
- int n_output_frequency = 10;
- int verbosity = 1;
- int keep_every = 0;
- bool collect_output_weight = false;
-};
-
class IMatrixCollector {
public:
IMatrixCollector() = default;
- void set_parameters(StatParams&& params) { m_params = std::move(params); }
+ void set_params(gpt_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
- void save_imatrix() const;
- bool load_imatrix(const char * file_name, bool add);
- static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
+ void save_imatrix(int ncall = -1) const;
+ bool load_imatrix(const char * file_name);
private:
std::unordered_map<std::string, Stats> m_stats;
- StatParams m_params;
+ gpt_params m_params;
std::mutex m_mutex;
int m_last_call = 0;
std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
- //
- void save_imatrix(const char * file_name, const char * dataset) const;
- void keep_imatrix(int ncall) const;
};
// remove any prefix and suffixes from the name
@@ -85,7 +83,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (t->op != GGML_OP_MUL_MAT) return false;
// why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
- if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
+ if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
return true;
}
@@ -153,21 +151,25 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
+ if (!std::isfinite(e.values[e_start + j])) {
+ fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
+ exit(1);
+ }
}
}
}
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
- if (m_last_call % m_params.n_output_frequency == 0) {
+ if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix();
}
- if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
- keep_imatrix(m_last_call);
+ if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+ save_imatrix(m_last_call);
}
}
}
} else {
- auto& e = m_stats[wname];
+ auto & e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0);
@@ -185,15 +187,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
+ if (!std::isfinite(e.values[j])) {
+ fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
+ exit(1);
+ }
}
}
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
- if (m_last_call % m_params.n_output_frequency == 0) {
+ if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix();
}
- if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
- keep_imatrix(m_last_call);
+ if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+ save_imatrix(m_last_call);
}
}
}
@@ -201,19 +207,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
return true;
}
-void IMatrixCollector::save_imatrix() const {
- save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
-}
+void IMatrixCollector::save_imatrix(int ncall) const {
+ auto fname = m_params.out_file;
+ if (fname.empty()) {
+ fname = "imatrix.dat";
+ }
-void IMatrixCollector::keep_imatrix(int ncall) const {
- auto file_name = m_params.ofile;
- if (file_name.empty()) file_name = "imatrix.dat";
- file_name += ".at_";
- file_name += std::to_string(ncall);
- save_imatrix(file_name.c_str(), m_params.dataset.c_str());
-}
+ if (ncall > 0) {
+ fname += ".at_";
+ fname += std::to_string(ncall);
+ }
-void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
out.write((const char *) &n_entries, sizeof(n_entries));
@@ -236,26 +240,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
// Write the number of call the matrix was computed with
out.write((const char *) &m_last_call, sizeof(m_last_call));
- // Write the dataset name at the end of the file to later on specify it in quantize
- int n_dataset = strlen(dataset);
- out.write((const char *) &n_dataset, sizeof(n_dataset));
- out.write(dataset, n_dataset);
+ // Write the input filename at the end of the file to later on specify it in quantize
+ {
+ int len = m_params.prompt_file.size();
+ out.write((const char *) &len, sizeof(len));
+ out.write(m_params.prompt_file.c_str(), len);
+ }
if (m_params.verbosity > 0) {
- fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
+ fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
}
-bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
- std::ifstream in(imatrix_file, std::ios::binary);
+bool IMatrixCollector::load_imatrix(const char * fname) {
+ std::ifstream in(fname, std::ios::binary);
if (!in) {
- printf("%s: failed to open %s\n",__func__,imatrix_file);
+ printf("%s: failed to open %s\n",__func__, fname);
return false;
}
int n_entries;
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
- printf("%s: no data in file %s\n", __func__, imatrix_file);
+ printf("%s: no data in file %s\n", __func__, fname);
return false;
}
for (int i = 0; i < n_entries; ++i) {
@@ -263,23 +269,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
- printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
+ printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false;
}
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
- auto& e = imatrix_data[std::move(name)];
+ auto & e = m_stats[std::move(name)];
int ncall;
in.read((char*)&ncall, sizeof(ncall));
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i);
- imatrix_data = {};
+ m_stats = {};
return false;
}
- // When re-called from load_imatrix() with add set, this will already be created.
if (e.values.empty()) {
e.values.resize(nval, 0);
e.counts.resize(nval, 0);
@@ -289,7 +294,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
- imatrix_data = {};
+ m_stats = {};
return false;
}
@@ -304,13 +309,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
return true;
}
-bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
- if (!add) {
- m_stats.clear();
- }
- return load_imatrix(file_name, m_stats);
-}
-
static IMatrixCollector g_collector;
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -324,7 +322,7 @@ struct results_log_softmax {
float prob;
};
-static std::vector<float> softmax(const std::vector<float>& logits) {
+static std::vector<float> softmax(const std::vector<float> & logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) {
@@ -358,8 +356,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
- double & nll, double & nll2, float * logit_history, float * prob_history
-) {
+ double & nll, double & nll2, float * logit_history, float * prob_history) {
std::mutex mutex;
int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@@ -391,8 +388,7 @@ static void process_logits(
}
}
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
-
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx);
@@ -405,13 +401,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count());
- if (from_chunk > 0) {
- if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
- fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
+ if (params.i_chunk > 0) {
+ if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
+ fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false;
}
- fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
- tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
+ fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+ tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
}
if (int(tokens.size()) < 2*n_ctx) {
@@ -424,7 +420,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
std::vector<float> logit_history;
std::vector<float> prob_history;
- if (compute_ppl) {
+ if (params.compute_ppl) {
logit_history.resize(tokens.size());
prob_history.resize(tokens.size());
}
@@ -446,7 +442,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
- if (compute_ppl && num_batches > 1) {
+ if (params.compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
}
@@ -482,7 +478,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
- if (compute_ppl && num_batches > 1) {
+ if (params.compute_ppl && num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
@@ -501,7 +497,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
- if (compute_ppl) {
+ if (params.compute_ppl) {
const int first = n_ctx/2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
@@ -516,7 +512,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
}
printf("\n");
- if (compute_ppl) {
+ if (params.compute_ppl) {
nll2 /= count;
nll /= count;
const double ppl = exp(nll);
@@ -533,111 +529,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
}
int main(int argc, char ** argv) {
-
- StatParams sparams;
- std::string prev_result_file;
- std::string combine_files;
- bool compute_ppl = true;
- int from_chunk = 0;
- std::vector<char*> args;
- args.push_back(argv[0]);
- int iarg = 1;
- for (; iarg < argc-1; ++iarg) {
- std::string arg{argv[iarg]};
- if (arg == "-o" || arg == "--output-file") {
- sparams.ofile = argv[++iarg];
- }
- else if (arg == "-ofreq" || arg == "--output-frequency") {
- sparams.n_output_frequency = std::stoi(argv[++iarg]);
- }
- else if (arg == "-ow" || arg == "--output-weight") {
- sparams.collect_output_weight = std::stoi(argv[++iarg]);
- }
- else if (arg == "--verbosity") {
- sparams.verbosity = std::stoi(argv[++iarg]);
- } else if (arg == "--no-ppl") {
- compute_ppl = false;
- } else if (arg == "--keep-imatrix") {
- sparams.keep_every = std::stoi(argv[++iarg]);
- } else if (arg == "--continue-from") {
- prev_result_file = argv[++iarg];
- } else if (arg == "--combine") {
- combine_files = argv[++iarg];
- }
- else if (arg == "--from-chunk") {
- from_chunk = std::stoi(argv[++iarg]);
- } else {
- args.push_back(argv[iarg]);
- }
- }
- if (iarg < argc) {
- std::string arg{argv[iarg]};
- if (arg == "--no-ppl") {
- compute_ppl = false;
- } else {
- args.push_back(argv[iarg]);
- }
- }
-
gpt_params params;
- params.n_batch = 512;
- if (!gpt_params_parse(args.size(), args.data(), params)) {
+
+ params.n_ctx = 512;
+ params.logits_all = true;
+ params.verbosity = 1;
+
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
return 1;
}
- params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
- print_build_info();
+ g_collector.set_params(params);
- if (params.seed == LLAMA_DEFAULT_SEED) {
- params.seed = time(NULL);
- }
-
- fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
- std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = string_random_prompt(rng);
- }
-
- sparams.dataset = params.prompt_file;
- g_collector.set_parameters(std::move(sparams));
-
- if (!combine_files.empty()) {
- std::vector<std::string> files;
- size_t pos = 0;
- while (true) {
- auto new_pos = combine_files.find(',', pos);
- if (new_pos != std::string::npos) {
- files.emplace_back(combine_files.substr(pos, new_pos - pos));
- pos = new_pos + 1;
- } else {
- files.emplace_back(combine_files.substr(pos));
- break;
- }
- }
- if (files.size() < 2) {
- fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
+ for (const auto & in_file : params.in_files) {
+ printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+ if (!g_collector.load_imatrix(in_file.c_str())) {
+ fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
return 1;
}
- printf("Combining the following %d files\n", int(files.size()));
- for (auto& file : files) {
- printf(" %s\n", file.c_str());
- if (!g_collector.load_imatrix(file.c_str(), true)) {
- fprintf(stderr, "Failed to load %s\n", file.c_str());
- return 1;
- }
- }
+ }
+
+ if (params.in_files.size() > 1) {
+ printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
g_collector.save_imatrix();
- return 0;
- }
-
- if (!prev_result_file.empty()) {
- if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
- fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
- return 1;
- }
}
llama_backend_init();
@@ -652,6 +569,7 @@ int main(int argc, char ** argv) {
// init
llama_model * model;
llama_context * ctx;
+
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
@@ -670,8 +588,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
}
- bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
- if (!OK) {
+ if (!compute_imatrix(ctx, params)) {
return 1;
}
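Note (not part of the patch): the imatrix file keeps its on-disk layout, with the trailing dataset name now taken from `params.prompt_file`. A hedged Python sketch of a reader for the per-entry section, based on the reads in `load_imatrix()` above (field widths and endianness are assumptions; the C++ writes raw `int`/`float`):

```python
# Assumed layout: int32 n_entries, then per entry {int32 name_len, name bytes,
# int32 ncall, int32 nval, float32 values[nval]}, followed by the call count and
# the input file name (not read here).
import struct

def read_imatrix(path: str):
    entries = {}
    with open(path, "rb") as f:
        (n_entries,) = struct.unpack("<i", f.read(4))
        for _ in range(n_entries):
            (name_len,) = struct.unpack("<i", f.read(4))
            name = f.read(name_len).decode()
            ncall, nval = struct.unpack("<ii", f.read(8))
            values = struct.unpack(f"<{nval}f", f.read(4 * nval))
            entries[name] = (ncall, values)
    return entries
```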
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 539f78184..0e4ec79c6 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -107,6 +107,7 @@ int main(int argc, char ** argv) {
g_params = ¶ms;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
@@ -139,27 +140,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
- if (params.instruct) {
- printf("\n************\n");
- printf("%s: please use the 'main' tool for instruct mode\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
- if (params.chatml) {
- printf("\n************\n");
- printf("%s: please use the 'main' tool for chatml mode\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
- if (!params.antiprompt.empty()) {
- printf("\n************\n");
- printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
printf("\n************\n");
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
@@ -167,20 +147,6 @@ int main(int argc, char ** argv) {
return 0;
}
- if (params.random_prompt) {
- printf("\n************\n");
- printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
- if (!params.path_prompt_cache.empty()) {
- printf("\n************\n");
- printf("%s: infill does not support prompt caching\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
@@ -207,17 +173,13 @@ int main(int argc, char ** argv) {
llama_model * model;
llama_context * ctx;
- llama_context * ctx_guidance = NULL;
+
g_model = &model;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params);
- if (sparams.cfg_scale > 1.f) {
- struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
- ctx_guidance = llama_new_context_with_model(model, lparams);
- }
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
@@ -273,25 +235,6 @@ int main(int argc, char ** argv) {
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
}
- // Tokenize negative prompt
- std::vector<llama_token> guidance_inp;
- int guidance_offset = 0;
- int original_prompt_len = 0;
- if (ctx_guidance) {
- LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
- guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
- LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
- std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
- LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
- original_prompt_len = original_inp.size();
- guidance_offset = (int)guidance_inp.size() - original_prompt_len;
- LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
- LOG("guidance_offset: %s", log_tostr(guidance_offset));
- }
-
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
@@ -319,15 +262,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
- if (ctx_guidance) {
- LOG_TEE("\n");
- LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
- LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
- for (int i = 0; i < (int) guidance_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
- }
- }
-
if (params.n_keep > 0) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
@@ -395,12 +329,11 @@ int main(int argc, char ** argv) {
is_interacting = params.interactive_first;
}
- bool input_echo = true;
+ bool input_echo = true;
- int n_past = 0;
- int n_remain = params.n_predict;
- int n_consumed = 0;
- int n_past_guidance = 0;
+ int n_past = 0;
+ int n_remain = params.n_predict;
+ int n_consumed = 0;
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
@@ -410,7 +343,6 @@ int main(int argc, char ** argv) {
console::set_display(console::prompt);
std::vector<llama_token> embd;
- std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
@@ -436,7 +368,7 @@ int main(int argc, char ** argv) {
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
- if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) {
+ if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
@@ -453,11 +385,7 @@ int main(int argc, char ** argv) {
n_past -= n_discard;
- if (ctx_guidance) {
- n_past_guidance -= n_discard;
- }
-
- LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
+ LOG("after swap: n_past = %d\n", n_past);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
@@ -465,45 +393,6 @@ int main(int argc, char ** argv) {
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
-
- if (ctx_guidance) {
- int input_size = 0;
- llama_token * input_buf = NULL;
-
- if (n_past_guidance < (int) guidance_inp.size()) {
- // Guidance context should have the same data with these modifications:
- //
- // * Replace the initial prompt
- // * Shift everything by guidance_offset
- embd_guidance = guidance_inp;
- if (embd.begin() + original_prompt_len < embd.end()) {
- embd_guidance.insert(
- embd_guidance.end(),
- embd.begin() + original_prompt_len,
- embd.end()
- );
- }
-
- input_buf = embd_guidance.data();
- input_size = embd_guidance.size();
-
- LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
- } else {
- input_buf = embd.data();
- input_size = embd.size();
- }
-
- for (int i = 0; i < input_size; i += params.n_batch) {
- int n_eval = std::min(input_size - i, params.n_batch);
- if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
- return 1;
- }
-
- n_past_guidance += n_eval;
- }
- }
-
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
@@ -525,11 +414,9 @@ int main(int argc, char ** argv) {
}
embd.clear();
- embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-
- const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+ const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
llama_sampling_accept(ctx_sampling, ctx, id, true);
@@ -583,7 +470,6 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
-
// deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
@@ -644,7 +530,6 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(model));
embd.clear();
- embd_guidance.clear();
n_remain = params.n_predict;
n_past = 0;
n_consumed = 0;
@@ -751,7 +636,6 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
- if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx);
llama_free_model(model);
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index 826cd3f72..7d889c3fe 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -6,52 +6,22 @@ import re
import sys
from typing import Any, Dict, List, Set, Tuple, Union
-def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
+
+def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
+
+ if min_items == 0 and max_items == 1:
+ return f'{item_rule}?'
+
if not separator_rule:
- if min_items == 0 and max_items == 1:
- return f'{item_rule}?'
- elif min_items == 1 and max_items is None:
+ if min_items == 1 and max_items is None:
return f'{item_rule}+'
-
- result = ''
-
- if min_items > 0:
- if item_rule_is_literal and separator_rule is None:
- result = '"' + (item_rule[1:-1] * min_items) + '"'
+ elif min_items == 0 and max_items is None:
+ return f'{item_rule}*'
else:
- result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
+ return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'
- def opt_repetitions(up_to_n, prefix_with_sep=False):
- '''
- - n=4, no sep: '(a (a (a (a)?)?)?)?'
- - n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
- - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
- '''
-
- content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
- if up_to_n == 0:
- return ''
- elif up_to_n == 1:
- return f'({content})?'
- elif separator_rule and not prefix_with_sep:
- return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
- else:
- return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
-
- if min_items > 0 and max_items != min_items:
- result += ' '
-
- if max_items is not None:
- result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
- else:
- item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
-
- if min_items == 0 and separator_rule:
- result = f'({item_rule} {item_operator}*)?'
- else:
- result += f'{item_operator}*'
-
- return result
+ result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
+ return f'({result})?' if min_items == 0 else result
class BuiltinRule:
@@ -59,31 +29,29 @@ class BuiltinRule:
self.content = content
self.deps = deps or []
-_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
-
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'
PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []),
- 'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
- 'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
+ 'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
+ 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
- 'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
- 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
+ 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
+ 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []),
}
# TODO: support "uri", "email" string formats
STRING_FORMAT_RULES = {
- 'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
- 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+ 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+ 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
@@ -333,7 +301,7 @@ class SchemaConverter:
sub_rule_ids[sub] = id
sub = id
- seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
+ seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False)
else:
literal = ''
while i < length:
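
The simplified helper above maps directly onto the new GBNF bounded-repetition syntax `{m,n}`. As a sanity check, here is a standalone Python sketch that mirrors the function in the patch, together with a few expected outputs (the assertions are illustrative, not part of the patch):

```python
# Standalone sketch mirroring the simplified _build_repetition above.
def build_repetition(item_rule, min_items, max_items, separator_rule=None):
    if min_items == 0 and max_items == 1:
        return f'{item_rule}?'

    if not separator_rule:
        if min_items == 1 and max_items is None:
            return f'{item_rule}+'
        elif min_items == 0 and max_items is None:
            return f'{item_rule}*'
        else:
            return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'

    # with a separator, peel off the first item and recurse on "(sep item)" groups
    result = item_rule + ' ' + build_repetition(
        f'({separator_rule} {item_rule})',
        min_items - 1 if min_items > 0 else 0,
        max_items - 1 if max_items is not None else None)
    return f'({result})?' if min_items == 0 else result

assert build_repetition('[0-9]', 0, 15)                     == '[0-9]{0,15}'   # same call the old _up_to_15_digits used
assert build_repetition('item', 1, None)                    == 'item+'
assert build_repetition('item', 0, 3, separator_rule='","') == '(item ("," item){0,2})?'
assert build_repetition('item', 2, 4, separator_rule='","') == 'item ("," item){1,3}'
```
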
diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md
index 857840564..fd95b35f4 100644
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -162,7 +162,7 @@ $ ./llama-bench -o csv
```
```csv
-build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
+build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
```
@@ -179,7 +179,6 @@ $ ./llama-bench -o json
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
- "opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
@@ -210,7 +209,6 @@ $ ./llama-bench -o json
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
- "opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
@@ -253,7 +251,6 @@ CREATE TABLE IF NOT EXISTS test (
build_commit TEXT,
build_number INTEGER,
cuda INTEGER,
- opencl INTEGER,
metal INTEGER,
gpu_blas INTEGER,
blas INTEGER,
@@ -279,6 +276,6 @@ CREATE TABLE IF NOT EXISTS test (
stddev_ts REAL
);
-INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
-INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
+INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
+INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
```
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index c00890447..5c31548a6 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -41,20 +41,6 @@ static std::string join(const std::vector<T> & values, const std::string & delim
return str.str();
}
-template <typename T>
-static std::vector<T> split(const std::string & str, char delim) {
- std::vector<T> values;
- std::istringstream str_stream(str);
- std::string token;
- while (std::getline(str_stream, token, delim)) {
- T value;
- std::istringstream token_stream(token);
- token_stream >> value;
- values.push_back(value);
- }
- return values;
-}
-
template <typename T, typename F>
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
std::vector<std::string> str_values;
@@ -140,10 +126,11 @@ static std::string get_gpu_info() {
}
// command line params
-enum output_formats {CSV, JSON, MARKDOWN, SQL};
+enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
static const char * output_format_str(output_formats format) {
switch (format) {
+ case NONE: return "none";
case CSV: return "csv";
case JSON: return "json";
case MARKDOWN: return "md";
@@ -152,6 +139,23 @@ static const char * output_format_str(output_formats format) {
}
}
+static bool output_format_from_str(const std::string & s, output_formats & format) {
+ if (s == "none") {
+ format = NONE;
+ } else if (s == "csv") {
+ format = CSV;
+ } else if (s == "json") {
+ format = JSON;
+ } else if (s == "md") {
+ format = MARKDOWN;
+ } else if (s == "sql") {
+ format = SQL;
+ } else {
+ return false;
+ }
+ return true;
+}
+
static const char * split_mode_str(llama_split_mode mode) {
switch (mode) {
case LLAMA_SPLIT_MODE_NONE: return "none";
@@ -190,31 +194,33 @@ struct cmd_params {
int reps;
bool verbose;
output_formats output_format;
+ output_formats output_format_stderr;
};
static const cmd_params cmd_params_defaults = {
- /* model */ {"models/7B/ggml-model-q4_0.gguf"},
- /* n_prompt */ {512},
- /* n_gen */ {128},
- /* n_pg */ {},
- /* n_batch */ {2048},
- /* n_ubatch */ {512},
- /* type_k */ {GGML_TYPE_F16},
- /* type_v */ {GGML_TYPE_F16},
- /* n_threads */ {cpu_get_num_math()},
- /* n_gpu_layers */ {99},
- /* rpc_servers */ {""},
- /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
- /* main_gpu */ {0},
- /* no_kv_offload */ {false},
- /* flash_attn */ {false},
- /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
- /* use_mmap */ {true},
- /* embeddings */ {false},
- /* numa */ GGML_NUMA_STRATEGY_DISABLED,
- /* reps */ 5,
- /* verbose */ false,
- /* output_format */ MARKDOWN
+ /* model */ {"models/7B/ggml-model-q4_0.gguf"},
+ /* n_prompt */ {512},
+ /* n_gen */ {128},
+ /* n_pg */ {},
+ /* n_batch */ {2048},
+ /* n_ubatch */ {512},
+ /* type_k */ {GGML_TYPE_F16},
+ /* type_v */ {GGML_TYPE_F16},
+ /* n_threads */ {cpu_get_num_math()},
+ /* n_gpu_layers */ {99},
+ /* rpc_servers */ {""},
+ /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
+ /* main_gpu */ {0},
+ /* no_kv_offload */ {false},
+ /* flash_attn */ {false},
+ /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
+ /* use_mmap */ {true},
+ /* embeddings */ {false},
+ /* numa */ GGML_NUMA_STRATEGY_DISABLED,
+ /* reps */ 5,
+ /* verbose */ false,
+ /* output_format */ MARKDOWN,
+ /* output_format_stderr */ NONE,
};
static void print_usage(int /* argc */, char ** argv) {
@@ -243,6 +249,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ts, --tensor-split (default: 0)\n");
printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
+ printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
@@ -284,6 +291,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.verbose = cmd_params_defaults.verbose;
params.output_format = cmd_params_defaults.output_format;
+ params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps;
for (int i = 1; i < argc; i++) {
@@ -300,28 +308,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end());
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
} else if (arg == "-n" || arg == "--n-gen") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
} else if (arg == "-pg") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], ',');
+ auto p = string_split(argv[i], ',');
if (p.size() != 2) {
invalid_param = true;
break;
@@ -332,21 +340,21 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "-ub" || arg == "--ubatch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
} else if (arg == "-ctk" || arg == "--cache-type-k") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
@@ -362,7 +370,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
@@ -378,14 +386,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (arg == "-rpc" || arg == "--rpc") {
if (++i >= argc) {
@@ -398,7 +406,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
std::vector<llama_split_mode> modes;
for (const auto & m : p) {
llama_split_mode mode;
@@ -420,13 +428,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- params.main_gpu = split(argv[i], split_delim);
+ params.main_gpu = string_split(argv[i], split_delim);
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "--numa") {
if (++i >= argc) {
@@ -444,28 +452,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
} else if (arg == "-embd" || arg == "--embeddings") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) {
invalid_param = true;
break;
}
- for (auto ts : split(argv[i], split_delim)) {
+ for (auto ts : string_split(argv[i], split_delim)) {
// split string by ; and /
const std::regex regex{R"([;/]+)"};
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
@@ -493,18 +501,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- if (argv[i] == std::string("csv")) {
- params.output_format = CSV;
- } else if (argv[i] == std::string("json")) {
- params.output_format = JSON;
- } else if (argv[i] == std::string("md")) {
- params.output_format = MARKDOWN;
- } else if (argv[i] == std::string("sql")) {
- params.output_format = SQL;
- } else {
+ invalid_param = !output_format_from_str(argv[i], params.output_format);
+ } else if (arg == "-oe" || arg == "--output-err") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
+ invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else {
@@ -706,7 +709,6 @@ struct test {
static const std::string build_commit;
static const int build_number;
static const bool cuda;
- static const bool opencl;
static const bool vulkan;
static const bool kompute;
static const bool metal;
@@ -795,9 +797,6 @@ struct test {
if (cuda) {
return GGML_CUDA_NAME;
}
- if (opencl) {
- return "OpenCL";
- }
if (vulkan) {
return "Vulkan";
}
@@ -826,7 +825,7 @@ struct test {
static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
"build_commit", "build_number",
- "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
+ "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch",
@@ -852,7 +851,7 @@ struct test {
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
- if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
+ if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL;
@@ -881,7 +880,7 @@ struct test {
}
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
- std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
+ std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
@@ -910,7 +909,6 @@ struct test {
const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cuda();
-const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute();
const bool test::metal = !!ggml_cpu_has_metal();
@@ -1278,6 +1276,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
(void) user_data;
}
+static std::unique_ptr<printer> create_printer(output_formats format) {
+ switch (format) {
+ case NONE:
+ return nullptr;
+ case CSV:
+ return std::unique_ptr<printer>(new csv_printer());
+ case JSON:
+ return std::unique_ptr<printer>(new json_printer());
+ case MARKDOWN:
+ return std::unique_ptr<printer>(new markdown_printer());
+ case SQL:
+ return std::unique_ptr<printer>(new sql_printer());
+ }
+ GGML_ASSERT(false);
+}
+
int main(int argc, char ** argv) {
// try to set locale for unicode characters in markdown
setlocale(LC_CTYPE, ".UTF-8");
@@ -1304,26 +1318,18 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa);
// initialize printer
- std::unique_ptr<printer> p;
- switch (params.output_format) {
- case CSV:
- p.reset(new csv_printer());
- break;
- case JSON:
- p.reset(new json_printer());
- break;
- case MARKDOWN:
- p.reset(new markdown_printer());
- break;
- case SQL:
- p.reset(new sql_printer());
- break;
- default:
- assert(false);
- exit(1);
+ std::unique_ptr<printer> p = create_printer(params.output_format);
+ std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
+
+ if (p) {
+ p->fout = stdout;
+ p->print_header(params);
+ }
+
+ if (p_err) {
+ p_err->fout = stderr;
+ p_err->print_header(params);
}
- p->fout = stdout;
- p->print_header(params);
std::vector params_instances = get_cmd_params_instances(params);
@@ -1381,7 +1387,15 @@ int main(int argc, char ** argv) {
t.samples_ns.push_back(t_ns);
}
- p->print_test(t);
+ if (p) {
+ p->print_test(t);
+ fflush(p->fout);
+ }
+
+ if (p_err) {
+ p_err->print_test(t);
+ fflush(p_err->fout);
+ }
llama_print_timings(ctx);
@@ -1390,7 +1404,13 @@ int main(int argc, char ** argv) {
llama_free_model(lmodel);
- p->print_footer();
+ if (p) {
+ p->print_footer();
+ }
+
+ if (p_err) {
+ p_err->print_footer();
+ }
llama_backend_free();
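
A rough sketch of the new output flow, in Python rather than the C++ above: `create_printer` yields nothing for the `none` format, and each benchmark result is written to the stdout printer and, if configured, to a second printer on stderr. The class and field names below are illustrative only, not the project's code:

```python
import sys

# Illustrative dual-printer flow: -o picks the stdout format, -oe an optional
# second format on stderr, and "none" suppresses a stream entirely.
class MarkdownPrinter:
    def __init__(self, fout):
        self.fout = fout
    def print_test(self, result):
        print(f"| {result['model']} | {result['t/s']:8.2f} |", file=self.fout)

def create_printer(fmt, fout):
    if fmt == "none":
        return None                      # mirrors the new NONE output format
    return {"md": MarkdownPrinter}[fmt](fout)

p     = create_printer("md",   sys.stdout)   # --output md       (default)
p_err = create_printer("none", sys.stderr)   # --output-err none (default)

for result in [{"model": "llama 7B Q4_0", "t/s": 132.05}]:
    for printer in (p, p_err):
        if printer is not None:
            printer.print_test(result)
            printer.fout.flush()
```
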
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index c974900f2..8c7dd2ae3 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -112,9 +112,12 @@ struct llava_context {
struct llama_model * model = NULL;
};
-static void show_additional_info(int /*argc*/, char ** argv) {
- LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
- LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ LOG_TEE("\n example usage:\n");
+ LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+ LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@@ -278,7 +281,7 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
- show_additional_info(argc, argv);
+ print_usage(argc, argv, params);
return 1;
}
@@ -290,8 +293,7 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
- gpt_params_print_usage(argc, argv, params);
- show_additional_info(argc, argv);
+ print_usage(argc, argv, {});
return 1;
}
auto model = llava_init(¶ms);
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 54f060a85..fb20ad93f 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -37,7 +37,8 @@ struct ngram_container {
int main(int argc, char ** argv) {
gpt_params params;
- if (gpt_params_parse(argc, argv, params) == false) {
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp
index 1c230c966..d713f6f21 100644
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -14,8 +14,10 @@ int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
+
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
index 87ecc0a4f..0b171c872 100644
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -16,6 +16,7 @@ int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 83dbee91a..80ecd925d 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -15,6 +15,7 @@ int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md
index edf20d8db..a88e92f23 100644
--- a/examples/main-cmake-pkg/README.md
+++ b/examples/main-cmake-pkg/README.md
@@ -8,16 +8,14 @@ Because this example is "outside of the source tree", it is important to first b
### Considerations
-When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package, which was used when compiling _llama.cpp_.
+When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package.
### Build llama.cpp and install to C:\LlamaCPP directory
-In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
-
```cmd
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
-cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
+cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP
```
@@ -27,7 +25,7 @@ cmake --install build --prefix C:/LlamaCPP
```cmd
cd ..\examples\main-cmake-pkg
-cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
+cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/MyLlamaApp
```
diff --git a/examples/main/README.md b/examples/main/README.md
index ee930f4e7..cdc002f15 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -53,13 +53,13 @@ The following command generates "infinite" text from a starting prompt (you can
#### Unix-based systems (Linux, macOS, etc.):
```bash
-./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt
+./main -m models/7B/ggml-model.bin --ignore-eos -n -1
```
#### Windows:
```powershell
-main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
+main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
```
## Common Options
@@ -69,7 +69,6 @@ In this section, we cover the most commonly used options for running the `main`
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
-- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
@@ -80,11 +79,10 @@ The `main` program provides several ways to interact with the LLaMA models using
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
-- `--random-prompt`: Start with a randomized prompt.
## Interaction
-The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`.
+The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
@@ -92,7 +90,6 @@ In interactive mode, users can participate in text generation by injecting their
- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
-- `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions.
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
@@ -121,16 +118,6 @@ The `--in-suffix` flag is used to add a suffix after your input. This is useful
./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
```
-### Instruction Mode
-
-Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
-
-- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
-
-Technical detail: the user's input is internally prefixed with the reverse prompt (or `### Instruction:` as the default), and followed by `### Response:` (except if you just press Return without any input, to keep generating a longer response).
-
-By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
-
## Context Management
During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
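
A minimal sketch of the context-shift arithmetic referred to above: keep the first `n_keep` tokens and discard half of the remainder to make room for new generation. The "discard half" policy is an assumption about the implementation, not part of this patch:

```python
# Sketch of a context shift: keep the first n_keep tokens, drop half of the
# rest, and continue generating in the space that was freed.
def context_shift(n_past: int, n_keep: int) -> tuple[int, int]:
    n_left    = n_past - n_keep     # tokens that are eligible for discarding
    n_discard = n_left // 2         # drop half of them
    return n_past - n_discard, n_discard

n_past, n_discard = context_shift(n_past=2048, n_keep=64)
print(n_past, n_discard)            # 1056 992 -- roughly 1000 positions freed
```
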
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 44949ba86..b97b7b793 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -122,8 +122,10 @@ int main(int argc, char ** argv) {
g_params = ¶ms;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
+
llama_sampling_params & sparams = params.sparams;
#ifndef LOG_DISABLE_LOGS
@@ -180,9 +182,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = string_random_prompt(rng);
- }
LOG("%s: llama backend init\n", __func__);
llama_backend_init();
@@ -250,11 +249,8 @@ int main(int argc, char ** argv) {
std::vector embd_inp;
- if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
+ if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
- if (params.chatml) {
- params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
- }
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
} else {
LOG("use session tokens\n");
@@ -332,37 +328,13 @@ int main(int argc, char ** argv) {
}
// number of tokens to keep when resetting context
- if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
+ if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
params.n_keep = (int)embd_inp.size();
} else {
params.n_keep += add_bos; // always keep the BOS token
}
- // prefix & suffix for instruct mode
- const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
- const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
-
- LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
- LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
- // chatml prefix & suffix
- const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
- const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
-
- LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
- LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
-
- // in instruct mode, we inject a prefix and a suffix to each input by the user
- if (params.instruct) {
- params.interactive_first = true;
- params.antiprompt.emplace_back("### Instruction:\n\n");
- }
- // similar for chatml mode
- else if (params.chatml) {
- params.interactive_first = true;
- params.antiprompt.emplace_back("<|im_start|>user\n");
- }
- else if (params.conversation) {
+ if (params.conversation) {
params.interactive_first = true;
}
@@ -823,15 +795,13 @@ int main(int argc, char ** argv) {
is_interacting = true;
printf("\n");
- } else if (params.instruct || params.chatml) {
- is_interacting = true;
}
}
if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");
- if (params.conversation || params.instruct || params.chatml) {
+ if (params.conversation) {
printf("\n> ");
}
@@ -874,24 +844,12 @@ int main(int argc, char ** argv) {
const size_t original_size = embd_inp.size();
- // instruct mode: insert instruction prefix
- if (params.instruct && !is_antiprompt) {
- LOG("inserting instruction prefix\n");
- n_consumed = embd_inp.size();
- embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
- }
- // chatml mode: insert user chat prefix
- if (params.chatml && !is_antiprompt) {
- LOG("inserting chatml prefix\n");
- n_consumed = embd_inp.size();
- embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
- }
if (params.escape) {
string_process_escapes(buffer);
}
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
- const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
+ const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@@ -900,17 +858,6 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
- // instruct mode: insert response suffix
- if (params.instruct) {
- LOG("inserting instruction suffix\n");
- embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
- }
- // chatml mode: insert assistant chat suffix
- if (params.chatml) {
- LOG("inserting chatml suffix\n");
- embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
- }
-
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
@@ -935,7 +882,7 @@ int main(int argc, char ** argv) {
}
// end of generation
- if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
+ if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
LOG_TEE(" [end of text]\n");
break;
}
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index c731abb72..7faeaec97 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -100,7 +100,8 @@ int main(int argc, char ** argv) {
gpt_params params;
- if (gpt_params_parse(argc, argv, params) == false) {
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
diff --git a/examples/passkey/README.md b/examples/passkey/README.md
index 4a22bb559..9e7a119ba 100644
--- a/examples/passkey/README.md
+++ b/examples/passkey/README.md
@@ -8,5 +8,5 @@ See the following PRs for more info:
### Usage
```bash
-make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250
+make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
```
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index f2ef9ca10..d03215cd1 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -6,46 +6,32 @@
#include
#include
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+ LOG_TEE("\n");
+}
+
int main(int argc, char ** argv) {
gpt_params params;
- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
- return 1 ;
+ params.n_junk = 250;
+ params.n_keep = 32;
+ params.i_pos = -1;
+
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
}
- int seed = -1;
+ srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
- int n_junk = 250; // number of times to repeat the junk text
- int n_keep = 32; // number of tokens in the prompt prefix
- int n_grp = 1; // if more than 1 - perform LongLM SelfExtend
- int i_pos = -1; // position of the passkey in the junk text
-
- if (argc >= 2) {
- params.model = argv[1];
- }
-
- if (argc >= 3) {
- n_junk = std::stoi(argv[2]);
- }
-
- if (argc >= 4) {
- n_grp = std::stoi(argv[3]);
- }
-
- if (argc >= 5) {
- i_pos = std::stoi(argv[4]);
- }
-
- if (argc >= 6) {
- seed = std::stoi(argv[5]);
- }
-
- if (seed == -1) {
- seed = time(NULL);
- }
-
- srand(seed);
+ int n_junk = params.n_junk;
+ int n_keep = params.n_keep;
+ int n_grp = params.grp_attn_n;
+ int i_pos = params.i_pos;
if (i_pos == -1) {
i_pos = rand() % n_junk;
@@ -76,9 +62,7 @@ int main(int argc, char ** argv) {
// initialize the model
- llama_model_params model_params = llama_model_default_params();
-
- model_params.n_gpu_layers = 99; // offload all layers to the GPU
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -89,13 +73,9 @@ int main(int argc, char ** argv) {
// initialize the context
- llama_context_params ctx_params = llama_context_default_params();
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
- ctx_params.seed = seed;
- ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
- ctx_params.n_batch = 512;
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
@@ -135,7 +115,7 @@ int main(int argc, char ** argv) {
LOG_TEE("prompt tokens: %d\n", n_tokens_all);
//LOG_TEE("prompt: %s\n", params.prompt.c_str());
- llama_batch batch = llama_batch_init(512, 0, 1);
+ llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
int n_past = 0;
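
As a quick worked example of the context-size arithmetic above (the numbers are hypothetical, chosen only to illustrate the formula):

```python
# Mirrors ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep above.
n_ctx_train = 4096   # training context of the model, assumed for this example
n_grp       = 2      # --grp-attn-n: LongLM SelfExtend group factor
n_keep      = 32     # --keep: prompt prefix tokens that are always kept

n_ctx = n_ctx_train * n_grp + n_keep
print(n_ctx)         # 8224 -- context the example can address with self-extend
```
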
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 30e5e282e..0bd78c21a 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1032,7 +1032,7 @@ struct winogrande_entry {
std::vector<llama_token> seq_tokens[2];
};
-static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
+static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string & prompt) {
std::vector<winogrande_entry> result;
std::istringstream in(prompt);
std::string line;
@@ -1964,12 +1964,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) {
gpt_params params;
+ params.n_ctx = 512;
+ params.logits_all = true;
+
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
- params.logits_all = true;
-
const int32_t n_ctx = params.n_ctx;
if (n_ctx <= 0) {
@@ -2006,9 +2008,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = string_random_prompt(rng);
- }
llama_backend_init();
llama_numa_init(params.numa);
@@ -2027,6 +2026,7 @@ int main(int argc, char ** argv) {
}
const int n_ctx_train = llama_n_ctx_train(model);
+
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py
index 9acc7cc6d..f029c73a2 100644
--- a/examples/pydantic_models_to_grammar.py
+++ b/examples/pydantic_models_to_grammar.py
@@ -624,7 +624,7 @@ string ::= "\"" (
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" ws
ws ::= ([ \t\n] ws)?
-float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+float ::= ("-"? ([0] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
integer ::= [0-9]+"""
diff --git a/examples/quantize/tests.sh b/examples/quantize/tests.sh
index a3ca74c68..38e28ffc3 100644
--- a/examples/quantize/tests.sh
+++ b/examples/quantize/tests.sh
@@ -47,7 +47,7 @@ echo PASS
echo
# 3a. Test the requanted model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
echo PASS
echo
@@ -57,7 +57,7 @@ echo PASS
echo
# 4b. Test the requanted model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
echo PASS
echo
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index 4e7530706..55b7b2f70 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -4,72 +4,12 @@
#include
#include
-struct retrieval_params {
- std::vector<std::string> context_files; // context files to embed
- int32_t chunk_size = 64; // chunk size for context embedding
- std::string chunk_separator = "\n"; // chunk separator for context embedding
-};
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
-static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
- gpt_params_print_usage(argc, argv, gpt_params);
- printf("retrieval options:\n");
- printf(" --context-file FNAME file containing context to embed.\n");
- printf(" specify multiple files by providing --context-file option multiple times.\n");
- printf(" --chunk-size N minimum length of embedded text chunk (default:%d)\n", params.chunk_size);
- printf(" --chunk-separator STRING\n");
- printf(" string to separate chunks (default: \"\\n\")\n");
- printf("\n");
-}
-
-static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) {
- int i = 1;
- std::string arg;
- while (i < argc) {
- arg = argv[i];
- bool invalid_gpt_param = false;
- if(gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) {
- if (invalid_gpt_param) {
- fprintf(stderr, "error: invalid argument: %s\n", arg.c_str());
- retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
- exit(1);
- }
- // option was parsed by gpt_params_find_arg
- } else if (arg == "--context-file") {
- if (++i >= argc) {
- fprintf(stderr, "error: missing argument for --context-file\n");
- retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
- exit(1);
- }
- std::ifstream file(argv[i]);
- if (!file) {
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
- retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
- exit(1);
- }
- // store the external file name in params
- retrieval_params.context_files.push_back(argv[i]);
- } else if (arg == "--chunk-size") {
- if (++i >= argc) {
- fprintf(stderr, "error: missing argument for --chunk-size\n");
- retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
- exit(1);
- }
- retrieval_params.chunk_size = std::stoi(argv[i]);
- } else if (arg == "--chunk-separator") {
- if (++i >= argc) {
- fprintf(stderr, "error: missing argument for --chunk-separator\n");
- retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
- exit(1);
- }
- retrieval_params.chunk_separator = argv[i];
- } else {
- // unknown argument
- fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
- retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
- exit(1);
- }
- i++;
- }
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
+ LOG_TEE("\n");
}
struct chunk {
@@ -171,33 +111,35 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
int main(int argc, char ** argv) {
gpt_params params;
- retrieval_params retrieval_params;
- retrieval_params_parse(argc, argv, params, retrieval_params);
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
+ }
// For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
+ params.embedding = true;
- if (retrieval_params.chunk_size <= 0) {
+ if (params.chunk_size <= 0) {
fprintf(stderr, "chunk_size must be positive\n");
return 1;
}
- if (retrieval_params.context_files.empty()) {
+ if (params.context_files.empty()) {
fprintf(stderr, "context_files must be specified\n");
return 1;
}
- params.embedding = true;
print_build_info();
printf("processing files:\n");
- for (auto & context_file : retrieval_params.context_files) {
+ for (auto & context_file : params.context_files) {
printf("%s\n", context_file.c_str());
}
std::vector<chunk> chunks;
- for (auto & context_file : retrieval_params.context_files) {
- std::vector<chunk> file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator);
+ for (auto & context_file : params.context_files) {
+ std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
}
printf("Number of chunks: %ld\n", chunks.size());
@@ -242,7 +184,7 @@ int main(int argc, char ** argv) {
return 1;
}
// add eos if not present
- if (inp.empty() || inp.back() != llama_token_eos(model)) {
+ if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) {
inp.push_back(llama_token_eos(model));
}
chunk.tokens = inp;
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index c3b766882..00c2277ac 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -11,6 +11,7 @@ int main(int argc, char ** argv) {
params.prompt = "The quick brown fox";
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
diff --git a/examples/server/README.md b/examples/server/README.md
index 0c3db8c84..ccbdcdbdb 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -279,7 +279,7 @@ node index.js
`id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1`
- `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. Default: `false`
+ `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
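
A minimal request sketch for the `cache_prompt` behaviour described above, assuming a server listening on `localhost:8080`; the two requests share a long prefix, so with caching enabled the second one only has to process the differing suffix:

```python
import requests  # assumed to be available

# Two completions that share a long prefix; with cache_prompt enabled the
# second request reuses the KV cache built for the first one.
url    = "http://localhost:8080/completion"
common = "Here is a long document...\n\n"   # placeholder prefix

for question in ("Summarize it.", "List the key dates."):
    r = requests.post(url, json={
        "prompt":       common + question,
        "n_predict":    64,
        "cache_prompt": True,   # reuse the KV cache for the shared prefix
    })
    print(r.json()["content"])
```
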
diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs
index 8e0be1b40..cef11eab8 100644
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -2,57 +2,26 @@
const SPACE_RULE = '" "?';
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
+ if (minItems === 0 && maxItems === 1) {
+ return `${itemRule}?`;
+ }
+
+
const separatorRule = opts.separatorRule ?? '';
const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
if (separatorRule === '') {
- if (minItems === 0 && maxItems === 1) {
- return `${itemRule}?`;
- } else if (minItems === 1 && maxItems === undefined) {
+ if (minItems === 1 && maxItems === undefined) {
return `${itemRule}+`;
- }
- }
-
- let result = '';
- if (minItems > 0) {
- if (itemRuleIsLiteral && separatorRule === '') {
- result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
+ } else if (minItems === 0 && maxItems === undefined) {
+ return `${itemRule}*`;
} else {
- result = Array.from({ length: minItems }, () => itemRule)
- .join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
+ return `${itemRule}{${minItems},${maxItems !== undefined ? maxItems : ''}}`;
}
}
- const optRepetitions = (upToN, prefixWithSep=false) => {
- const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule;
- if (upToN === 0) {
- return '';
- } else if (upToN === 1) {
- return `(${content})?`;
- } else if (separatorRule !== '' && !prefixWithSep) {
- return `(${content} ${optRepetitions(upToN - 1, true)})?`;
- } else {
- return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
- }
- };
-
- if (minItems > 0 && maxItems !== minItems) {
- result += ' ';
- }
-
- if (maxItems !== undefined) {
- result += optRepetitions(maxItems - minItems, minItems > 0);
- } else {
- const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
-
- if (minItems === 0 && separatorRule !== '') {
- result = `(${itemRule} ${itemOperator}*)?`;
- } else {
- result += `${itemOperator}*`;
- }
- }
-
- return result;
+ const result = itemRule + ' ' + _buildRepetition(`(${separatorRule} ${itemRule})`, minItems > 0 ? minItems - 1 : 0, maxItems !== undefined ? maxItems - 1 : undefined);
+ return minItems === 0 ? `(${result})?` : result;
}
class BuiltinRule {
@@ -62,27 +31,25 @@ class BuiltinRule {
}
}
-const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
-
const PRIMITIVE_RULES = {
boolean : new BuiltinRule('("true" | "false") space', []),
- 'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []),
- 'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []),
+ 'decimal-part' : new BuiltinRule('[0-9]{1,16}', []),
+ 'integral-part': new BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
- uuid : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []),
- char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []),
+ uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
+ char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null : new BuiltinRule('"null" space', []),
};
// TODO: support "uri", "email" string formats
const STRING_FORMAT_RULES = {
- 'date' : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
- 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+ 'date' : new BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+ 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : new BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']),
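
The rewritten `_buildRepetition` leans on the grammar's bounded-repetition operators (`?`, `+`, `*`, `{m,n}`) instead of hand-expanding nested optional groups, which is what shrinks rules such as `decimal-part` to `[0-9]{1,16}` above. A minimal C++ sketch of the same rule-building logic follows; the function name and the omission of separator handling are illustrative assumptions, not code from the repository.

```cpp
#include <optional>
#include <string>

// Sketch of the simplified repetition builder, without separator handling.
// Emits GBNF-style repetition operators: "?", "+", "*", or "{min,max}".
std::string build_repetition(const std::string & item_rule, int min_items, std::optional<int> max_items) {
    if (min_items == 0 && max_items == 1) {
        return item_rule + "?";
    }
    if (min_items == 1 && !max_items.has_value()) {
        return item_rule + "+";
    }
    if (min_items == 0 && !max_items.has_value()) {
        return item_rule + "*";
    }
    // bounded repetition, e.g. build_repetition("[0-9]", 1, 16) -> "[0-9]{1,16}"
    return item_rule + "{" + std::to_string(min_items) + "," +
           (max_items.has_value() ? std::to_string(*max_items) : "") + "}";
}
```
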
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index fc6d90848..6ffaa8d9f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -123,29 +123,6 @@ struct slot_params {
json input_suffix;
};
-struct server_params {
- int32_t port = 8080;
- int32_t read_timeout = 600;
- int32_t write_timeout = 600;
- int32_t n_threads_http = -1;
-
- std::string hostname = "127.0.0.1";
- std::string public_path = "";
- std::string chat_template = "";
- std::string system_prompt = "";
-
- std::vector<std::string> api_keys;
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
- std::string ssl_key_file = "";
- std::string ssl_cert_file = "";
-#endif
-
- bool slots_endpoint = true;
- bool metrics_endpoint = false;
- std::string slot_save_path;
-};
-
struct server_slot {
int id;
int id_task = -1;
@@ -670,6 +647,9 @@ struct server_context {
server_metrics metrics;
+ // Necessary similarity of prompt for slot selection
+ float slot_prompt_similarity = 0.0f;
+
~server_context() {
if (ctx) {
llama_free(ctx);
@@ -818,24 +798,88 @@ struct server_context {
return prompt_tokens;
}
- server_slot * get_slot(int id) {
- int64_t t_last = ggml_time_us();
-
- server_slot * last_used = nullptr;
-
+ server_slot * get_slot_by_id(int id) {
for (server_slot & slot : slots) {
- if (slot.id == id && slot.available()) {
+ if (slot.id == id) {
return &slot;
}
-
- // among all available slots, find the one that has been least recently used
- if (slot.available() && slot.t_last_used < t_last) {
- last_used = &slot;
- t_last = slot.t_last_used;
- }
}
- return last_used;
+ return nullptr;
+ }
+
+ server_slot * get_available_slot(const std::string & prompt) {
+ server_slot * ret = nullptr;
+
+ // find the slot that has at least n% prompt similarity
+ if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
+ int max_lcp_len = 0;
+ float similarity = 0;
+
+ for (server_slot & slot : slots) {
+ // skip the slot if it is not available
+ if (!slot.available()) {
+ continue;
+ }
+
+ // skip the slot if it does not contain a prompt
+ if (!slot.prompt.is_string()) {
+ continue;
+ }
+
+ // current slot's prompt
+ std::string slot_prompt = slot.prompt.get<std::string>();
+
+ // length of the current slot's prompt
+ int slot_prompt_len = slot_prompt.size();
+
+ // length of the Longest Common Prefix between the current slot's prompt and the input prompt
+ int lcp_len = common_part(slot_prompt, prompt);
+
+ // fraction of the common substring length compared to the current slot's prompt length
+ similarity = static_cast<float>(lcp_len) / slot_prompt_len;
+
+ // select the current slot if the criteria match
+ if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
+ max_lcp_len = lcp_len;
+ ret = &slot;
+ }
+ }
+
+ if (ret != nullptr) {
+ LOG_VERBOSE("selected slot by lcp similarity", {
+ {"id_slot", ret->id},
+ {"max_lcp_len", max_lcp_len},
+ {"similarity", similarity},
+ });
+ }
+ }
+
+ // find the slot that has been least recently used
+ if (ret == nullptr) {
+ int64_t t_last = ggml_time_us();
+ for (server_slot & slot : slots) {
+ // skip the slot if it is not available
+ if (!slot.available()) {
+ continue;
+ }
+
+ // select the current slot if the criteria match
+ if (slot.t_last_used < t_last) {
+ t_last = slot.t_last_used;
+ ret = &slot;
+ }
+ }
+
+ if (ret != nullptr) {
+ LOG_VERBOSE("selected slot by lru", {
+ {"id_slot", ret->id},
+ {"t_last", t_last},
+ });
+ }
+ }
+
+ return ret;
}
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
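
The new `get_available_slot` prefers to reuse the slot whose cached prompt shares the longest common prefix with the incoming request, provided that prefix covers more than `slot_prompt_similarity` of the cached prompt; otherwise it falls back to the least-recently-used slot. Below is a self-contained sketch of just that selection criterion; the container of cached prompts and the return convention are illustrative, not the server's actual types.

```cpp
#include <cstddef>
#include <string>
#include <vector>

// Longest common prefix, mirroring the string overload added to utils.hpp in this diff.
static size_t common_part(const std::string & a, const std::string & b) {
    size_t i;
    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
    return i;
}

// Returns the index of the best candidate, or -1 to signal "fall back to LRU".
int select_by_lcp_similarity(const std::vector<std::string> & cached_prompts,
                             const std::string & prompt, float similarity_threshold) {
    int    best        = -1;
    size_t max_lcp_len = 0;

    for (int i = 0; i < (int) cached_prompts.size(); ++i) {
        if (cached_prompts[i].empty()) {
            continue;
        }
        const size_t lcp_len    = common_part(cached_prompts[i], prompt);
        const float  similarity = (float) lcp_len / cached_prompts[i].size();

        // keep the candidate with the longest shared prefix that also clears the threshold
        if (lcp_len > max_lcp_len && similarity > similarity_threshold) {
            max_lcp_len = lcp_len;
            best        = i;
        }
    }
    return best;
}
```
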
@@ -911,7 +955,7 @@ struct server_context {
slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
// get prompt
- {
+ if (!task.infill) {
const auto & prompt = data.find("prompt");
if (prompt == data.end()) {
send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
@@ -1261,7 +1305,7 @@ struct server_context {
}
json get_formated_generation(const server_slot & slot) const {
- const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
+ const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
 std::vector<std::string> samplers_sequence;
@@ -1538,13 +1582,29 @@ struct server_context {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
{
- server_slot * slot = get_slot(json_value(task.data, "id_slot", -1));
+ int id_slot = json_value(task.data, "id_slot", -1);
+ std::string prompt = json_value(task.data, "prompt", std::string());
+
+ server_slot * slot;
+
+ if (id_slot != -1) {
+ slot = get_slot_by_id(id_slot);
+ } else {
+ slot = get_available_slot(prompt);
+ }
+
if (slot == nullptr) {
// if no slot is available, we defer this task for processing later
LOG_VERBOSE("no slot is available", {{"id_task", task.id}});
queue_tasks.defer(task);
break;
}
+ if (!slot->available()) {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
if (task.data.contains("system_prompt")) {
std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
@@ -1661,11 +1721,17 @@ struct server_context {
case SERVER_TASK_TYPE_SLOT_SAVE:
{
int id_slot = task.data.at("id_slot");
- server_slot * slot = get_slot(id_slot);
+ server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
+ if (!slot->available()) {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
const size_t token_count = slot->cache_tokens.size();
const int64_t t_start = ggml_time_us();
@@ -1696,11 +1762,17 @@ struct server_context {
case SERVER_TASK_TYPE_SLOT_RESTORE:
{
int id_slot = task.data.at("id_slot");
- server_slot * slot = get_slot(id_slot);
+ server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
+ if (!slot->available()) {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
const int64_t t_start = ggml_time_us();
@@ -1738,11 +1810,17 @@ struct server_context {
case SERVER_TASK_TYPE_SLOT_ERASE:
{
int id_slot = task.data.at("id_slot");
- server_slot * slot = get_slot(id_slot);
+ server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
+ if (!slot->available()) {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
// Erase token cache
const size_t n_erased = slot->cache_tokens.size();
@@ -2334,561 +2412,6 @@ struct server_context {
}
};
-static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
- printf("usage: %s [options]\n", argv0);
- printf("\n");
- printf("options:\n");
- printf(" -h, --help show this help message and exit\n");
- printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
- printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
- printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
- printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
- printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
- printf(" --rope-scaling {none,linear,yarn}\n");
- printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
- printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
- printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
- printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
- printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
- printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
- printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
- printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
- printf(" -dt N, --defrag-thold N\n");
- printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
- printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
- printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch);
- if (llama_supports_mlock()) {
- printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
- }
- if (llama_supports_mmap()) {
- printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
- }
- printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
- printf(" - distribute: spread execution evenly over all nodes\n");
- printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
- printf(" - numactl: use the CPU map provided my numactl\n");
- if (llama_supports_gpu_offload()) {
- printf(" -ngl N, --n-gpu-layers N\n");
- printf(" number of layers to store in VRAM\n");
- printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
- printf(" how to split the model across multiple GPUs, one of:\n");
- printf(" - none: use one GPU only\n");
- printf(" - layer (default): split layers and KV across GPUs\n");
- printf(" - row: split rows across GPUs\n");
- printf(" -ts SPLIT --tensor-split SPLIT\n");
- printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
- printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
- printf(" or for intermediate results and KV (with split-mode = row)\n");
- printf(" -nkvo, --no-kv-offload\n");
- printf(" disable KV offload\n");
- }
- printf(" -m FNAME, --model FNAME\n");
- printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
- printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
- printf(" model download url (default: unused)\n");
- printf(" -hfr REPO, --hf-repo REPO\n");
- printf(" Hugging Face model repository (default: unused)\n");
- printf(" -hff FILE, --hf-file FILE\n");
- printf(" Hugging Face model file (default: unused)\n");
- printf(" -a ALIAS, --alias ALIAS\n");
- printf(" set an alias for the model, will be added as `model` field in completion response\n");
- printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
- printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
- printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
- printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
- printf(" --rpc SERVERS comma separated list of RPC servers\n");
- printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
- printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
- printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
- printf(" --ssl-key-file FNAME path to file a PEM-encoded SSL private key\n");
- printf(" --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate\n");
-#endif
- printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
- printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
- printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
- printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
- printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
- printf(" -spf FNAME, --system-prompt-file FNAME\n");
- printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
- printf(" -ctk TYPE, --cache-type-k TYPE\n");
- printf(" KV cache data type for K (default: f16)\n");
- printf(" -ctv TYPE, --cache-type-v TYPE\n");
- printf(" KV cache data type for V (default: f16)\n");
- printf(" --log-format log output format: json or text (default: json)\n");
- printf(" --log-disable disables logging to a file.\n");
- printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
- printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
- printf(" --slot-save-path PATH path to save slot kv cache (default: disabled)\n");
- printf("\n");
- printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
- printf(" --override-kv KEY=TYPE:VALUE\n");
- printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
- printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
- printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
- printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
- printf(" --chat-template JINJA_TEMPLATE\n");
- printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
- printf(" only commonly used templates are accepted:\n");
- printf(" https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template\n");
- printf("\n");
-}
-
-static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) {
- gpt_params default_params;
- server_params default_sparams;
-
- std::string arg;
- bool invalid_param = false;
-
- for (int i = 1; i < argc; i++) {
- arg = argv[i];
- if (arg == "--port") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.port = std::stoi(argv[i]);
- } else if (arg == "--rpc") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rpc_servers = argv[i];
- } else if (arg == "--host") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.hostname = argv[i];
- } else if (arg == "--path") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.public_path = argv[i];
- } else if (arg == "--api-key") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.api_keys.push_back(argv[i]);
- } else if (arg == "--api-key-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::ifstream key_file(argv[i]);
- if (!key_file) {
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
- invalid_param = true;
- break;
- }
- std::string key;
- while (std::getline(key_file, key)) {
- if (key.size() > 0) {
- sparams.api_keys.push_back(key);
- }
- }
- key_file.close();
-
- }
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
- else if (arg == "--ssl-key-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.ssl_key_file = argv[i];
- } else if (arg == "--ssl-cert-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.ssl_cert_file = argv[i];
- }
-#endif
- else if (arg == "--timeout" || arg == "-to") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.read_timeout = std::stoi(argv[i]);
- sparams.write_timeout = std::stoi(argv[i]);
- } else if (arg == "-m" || arg == "--model") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model = argv[i];
- } else if (arg == "-mu" || arg == "--model-url") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model_url = argv[i];
- } else if (arg == "-hfr" || arg == "--hf-repo") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.hf_repo = argv[i];
- } else if (arg == "-hff" || arg == "--hf-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.hf_file = argv[i];
- } else if (arg == "-a" || arg == "--alias") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model_alias = argv[i];
- } else if (arg == "-h" || arg == "--help") {
- server_print_usage(argv[0], default_params, default_sparams);
- exit(0);
- } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_ctx = std::stoi(argv[i]);
- } else if (arg == "--rope-scaling") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string value(argv[i]);
- /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
- else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
- else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
- else { invalid_param = true; break; }
- } else if (arg == "--rope-freq-base") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rope_freq_base = std::stof(argv[i]);
- } else if (arg == "--rope-freq-scale") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rope_freq_scale = std::stof(argv[i]);
- } else if (arg == "--yarn-ext-factor") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_ext_factor = std::stof(argv[i]);
- }
- else if (arg == "--yarn-attn-factor") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_attn_factor = std::stof(argv[i]);
- } else if (arg == "--yarn-beta-fast") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_beta_fast = std::stof(argv[i]);
- } else if (arg == "--yarn-beta-slow") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_beta_slow = std::stof(argv[i]);
- } else if (arg == "--pooling") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string value(argv[i]);
- /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
- else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
- else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
- else { invalid_param = true; break; }
- } else if (arg == "--defrag-thold" || arg == "-dt") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.defrag_thold = std::stof(argv[i]);
- } else if (arg == "--threads" || arg == "-t") {
- if (++i >= argc)
- {
- invalid_param = true;
- break;
- }
- params.n_threads = std::stoi(argv[i]);
- } else if (arg == "--grp-attn-n" || arg == "-gan") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
-
- params.grp_attn_n = std::stoi(argv[i]);
- } else if (arg == "--grp-attn-w" || arg == "-gaw") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
-
- params.grp_attn_w = std::stoi(argv[i]);
- } else if (arg == "--threads-batch" || arg == "-tb") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_threads_batch = std::stoi(argv[i]);
- } else if (arg == "--threads-http") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.n_threads_http = std::stoi(argv[i]);
- } else if (arg == "-b" || arg == "--batch-size") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_batch = std::stoi(argv[i]);
- } else if (arg == "-ub" || arg == "--ubatch-size") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_ubatch = std::stoi(argv[i]);
- } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if (llama_supports_gpu_offload()) {
- params.n_gpu_layers = std::stoi(argv[i]);
- } else {
- LOG_WARNING(
- "Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
- "See main README.md for information on enabling GPU BLAS support",
- {{"n_gpu_layers", params.n_gpu_layers}});
- }
- } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
- params.no_kv_offload = true;
- } else if (arg == "--split-mode" || arg == "-sm") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string arg_next = argv[i];
- if (arg_next == "none") {
- params.split_mode = LLAMA_SPLIT_MODE_NONE;
- } else if (arg_next == "layer") {
- params.split_mode = LLAMA_SPLIT_MODE_LAYER;
- } else if (arg_next == "row") {
- params.split_mode = LLAMA_SPLIT_MODE_ROW;
- } else {
- invalid_param = true;
- break;
- }
-#ifndef GGML_USE_CUDA
- fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA
- } else if (arg == "--tensor-split" || arg == "-ts") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
- std::string arg_next = argv[i];
-
- // split string by , and /
- const std::regex regex{R"([,/]+)"};
- std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
- std::vector<std::string> split_arg{it, {}};
- GGML_ASSERT(split_arg.size() <= llama_max_devices());
-
- for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
- if (i_device < split_arg.size()) {
- params.tensor_split[i_device] = std::stof(split_arg[i_device]);
- } else {
- params.tensor_split[i_device] = 0.0f;
- }
- }
-#else
- LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUDA
- } else if (arg == "--main-gpu" || arg == "-mg") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
- params.main_gpu = std::stoi(argv[i]);
-#else
- LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
-#endif
- } else if (arg == "--lora") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.lora_adapter.emplace_back(argv[i], 1.0f);
- params.use_mmap = false;
- } else if (arg == "--lora-scaled") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- const char * lora_adapter = argv[i];
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
- params.use_mmap = false;
- } else if (arg == "--lora-base") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.lora_base = argv[i];
- } else if (arg == "-v" || arg == "--verbose") {
-#if SERVER_VERBOSE != 1
- LOG_WARNING("server.cpp is not built with verbose logging.", {});
-#else
- server_verbose = true;
-#endif
- } else if (arg == "--mlock") {
- params.use_mlock = true;
- } else if (arg == "--no-mmap") {
- params.use_mmap = false;
- } else if (arg == "--numa") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- } else {
- std::string value(argv[i]);
- /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
- else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
- else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
- else { invalid_param = true; break; }
- }
- } else if (arg == "--embedding" || arg == "--embeddings") {
- params.embedding = true;
- } else if (arg == "-cb" || arg == "--cont-batching") {
- params.cont_batching = true;
- } else if (arg == "-fa" || arg == "--flash-attn") {
- params.flash_attn = true;
- } else if (arg == "-np" || arg == "--parallel") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_parallel = std::stoi(argv[i]);
- } else if (arg == "-n" || arg == "--n-predict") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_predict = std::stoi(argv[i]);
- } else if (arg == "-spf" || arg == "--system-prompt-file") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::ifstream file(argv[i]);
- if (!file) {
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
- invalid_param = true;
- break;
- }
- std::string system_prompt;
- std::copy(
- std::istreambuf_iterator<char>(file),
- std::istreambuf_iterator<char>(),
- std::back_inserter(system_prompt)
- );
- sparams.system_prompt = system_prompt;
- } else if (arg == "-ctk" || arg == "--cache-type-k") {
- params.cache_type_k = argv[++i];
- } else if (arg == "-ctv" || arg == "--cache-type-v") {
- params.cache_type_v = argv[++i];
- } else if (arg == "--log-format") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if (std::strcmp(argv[i], "json") == 0) {
- server_log_json = true;
- } else if (std::strcmp(argv[i], "text") == 0) {
- server_log_json = false;
- } else {
- invalid_param = true;
- break;
- }
- } else if (arg == "--log-disable") {
- log_set_target(stdout);
- LOG_INFO("logging to file is disabled.", {});
- } else if (arg == "--slots-endpoint-disable") {
- sparams.slots_endpoint = false;
- } else if (arg == "--metrics") {
- sparams.metrics_endpoint = true;
- } else if (arg == "--slot-save-path") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.slot_save_path = argv[i];
- // if doesn't end with DIRECTORY_SEPARATOR, add it
- if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
- sparams.slot_save_path += DIRECTORY_SEPARATOR;
- }
- } else if (arg == "--chat-template") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if (!verify_custom_template(argv[i])) {
- fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
- fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
- invalid_param = true;
- break;
- }
- sparams.chat_template = argv[i];
- } else if (arg == "--override-kv") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
- fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
- invalid_param = true;
- break;
- }
- } else {
- fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
- server_print_usage(argv[0], default_params, default_sparams);
- exit(1);
- }
- }
-
- gpt_params_handle_model_default(params);
-
- if (!params.kv_overrides.empty()) {
- params.kv_overrides.emplace_back();
- params.kv_overrides.back().key[0] = 0;
- }
-
- if (invalid_param) {
- fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
- server_print_usage(argv[0], default_params, default_sparams);
- exit(1);
- }
-}
-
static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
// skip GH copilot requests when using default port
if (req.path == "/v1/health" || req.path == "/v1/completions") {
@@ -2929,16 +2452,22 @@ int main(int argc, char ** argv) {
log_disable();
#endif
// own arguments required by this example
- gpt_params params;
- server_params sparams;
+ gpt_params params;
+
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
+ return 1;
+ }
+
+ // TODO: not great to use extern vars
+ server_log_json = params.log_json;
+ server_verbose = params.verbosity > 0;
// struct that contains llama context and inference
server_context ctx_server;
- server_params_parse(argc, argv, sparams, params);
-
- if (!sparams.system_prompt.empty()) {
- ctx_server.system_prompt_set(sparams.system_prompt);
+ if (!params.system_prompt.empty()) {
+ ctx_server.system_prompt_set(params.system_prompt);
}
if (params.model_alias == "unknown") {
@@ -2962,10 +2491,10 @@ int main(int argc, char ** argv) {
 std::unique_ptr<httplib::Server> svr;
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
- if (sparams.ssl_key_file != "" && sparams.ssl_cert_file != "") {
- LOG_INFO("Running with SSL", {{"key", sparams.ssl_key_file}, {"cert", sparams.ssl_cert_file}});
+ if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+ LOG_INFO("Running with SSL", {{"key", params.ssl_file_key}, {"cert", params.ssl_file_cert}});
svr.reset(
- new httplib::SSLServer(sparams.ssl_cert_file.c_str(), sparams.ssl_key_file.c_str())
+ new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
);
} else {
LOG_INFO("Running without SSL", {});
@@ -3019,26 +2548,29 @@ int main(int argc, char ** argv) {
});
// set timeouts and change hostname and port
- svr->set_read_timeout (sparams.read_timeout);
- svr->set_write_timeout(sparams.write_timeout);
+ svr->set_read_timeout (params.timeout_read);
+ svr->set_write_timeout(params.timeout_write);
- if (!svr->bind_to_port(sparams.hostname, sparams.port)) {
- fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
+ if (!svr->bind_to_port(params.hostname, params.port)) {
+ fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port);
return 1;
}
 std::unordered_map<std::string, std::string> log_data;
- log_data["hostname"] = sparams.hostname;
- log_data["port"] = std::to_string(sparams.port);
+ log_data["hostname"] = params.hostname;
+ log_data["port"] = std::to_string(params.port);
- if (sparams.api_keys.size() == 1) {
- auto key = sparams.api_keys[0];
+ if (params.api_keys.size() == 1) {
+ auto key = params.api_keys[0];
log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0));
- } else if (sparams.api_keys.size() > 1) {
- log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
+ } else if (params.api_keys.size() > 1) {
+ log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded";
}
+ // Necessary similarity of prompt for slot selection
+ ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;
+
// load the model
if (!ctx_server.load_model(params)) {
state.store(SERVER_STATE_ERROR);
@@ -3053,10 +2585,10 @@ int main(int argc, char ** argv) {
const auto model_meta = ctx_server.model_meta();
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
- if (sparams.chat_template.empty()) {
+ if (params.chat_template.empty()) {
if (!ctx_server.validate_model_chat_template()) {
LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
- sparams.chat_template = "chatml";
+ params.chat_template = "chatml";
}
}
@@ -3068,11 +2600,11 @@ int main(int argc, char ** argv) {
chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
chat.push_back({{"role", "user"}, {"content", "How are you?"}});
- const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat);
+ const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
LOG_INFO("chat template", {
{"chat_example", chat_example},
- {"built_in", sparams.chat_template.empty()},
+ {"built_in", params.chat_template.empty()},
});
}
@@ -3080,7 +2612,7 @@ int main(int argc, char ** argv) {
// Middlewares
//
- auto middleware_validate_api_key = [&sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
+ auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
// TODO: should we apply API key to all endpoints, including "/health" and "/models"?
 static const std::set<std::string> protected_endpoints = {
"/props",
@@ -3098,7 +2630,7 @@ int main(int argc, char ** argv) {
};
// If API key is not set, skip validation
- if (sparams.api_keys.empty()) {
+ if (params.api_keys.empty()) {
return true;
}
@@ -3113,7 +2645,7 @@ int main(int argc, char ** argv) {
std::string prefix = "Bearer ";
if (auth_header.substr(0, prefix.size()) == prefix) {
std::string received_api_key = auth_header.substr(prefix.size());
- if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
+ if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) {
return true; // API key is valid
}
}
@@ -3168,7 +2700,7 @@ int main(int argc, char ** argv) {
};
res.status = 200; // HTTP OK
- if (sparams.slots_endpoint && req.has_param("include_slots")) {
+ if (params.endpoint_slots && req.has_param("include_slots")) {
health["slots"] = result.data.at("slots");
}
@@ -3194,7 +2726,7 @@ int main(int argc, char ** argv) {
};
const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
- if (!sparams.slots_endpoint) {
+ if (!params.endpoint_slots) {
res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED));
return;
}
@@ -3218,7 +2750,7 @@ int main(int argc, char ** argv) {
};
const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
- if (!sparams.metrics_endpoint) {
+ if (!params.endpoint_metrics) {
res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED));
return;
}
@@ -3318,14 +2850,14 @@ int main(int argc, char ** argv) {
res.status = 200; // HTTP OK
};
- const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+ const auto handle_slots_save = [&ctx_server, &res_error, &params](const httplib::Request & req, httplib::Response & res, int id_slot) {
json request_data = json::parse(req.body);
std::string filename = request_data.at("filename");
if (!fs_validate_filename(filename)) {
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
return;
}
- std::string filepath = sparams.slot_save_path + filename;
+ std::string filepath = params.slot_save_path + filename;
server_task task;
task.type = SERVER_TASK_TYPE_SLOT_SAVE;
@@ -3348,14 +2880,14 @@ int main(int argc, char ** argv) {
}
};
- const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+ const auto handle_slots_restore = [&ctx_server, &res_error, &params](const httplib::Request & req, httplib::Response & res, int id_slot) {
json request_data = json::parse(req.body);
std::string filename = request_data.at("filename");
if (!fs_validate_filename(filename)) {
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
return;
}
- std::string filepath = sparams.slot_save_path + filename;
+ std::string filepath = params.slot_save_path + filename;
server_task task;
task.type = SERVER_TASK_TYPE_SLOT_RESTORE;
@@ -3530,9 +3062,9 @@ int main(int argc, char ** argv) {
res.set_content(models.dump(), "application/json; charset=utf-8");
};
- const auto handle_chat_completions = [&ctx_server, &sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
+ const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), sparams.chat_template);
+ json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
const int id_task = ctx_server.queue_tasks.get_new_id();
@@ -3757,29 +3289,29 @@ int main(int argc, char ** argv) {
//
// register static assets routes
- if (!sparams.public_path.empty()) {
+ if (!params.public_path.empty()) {
// Set the base directory for serving static files
- svr->set_base_dir(sparams.public_path);
+ svr->set_base_dir(params.public_path);
}
+
// using embedded static files
- svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
- svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
- json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+ svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+ svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
// add new-ui files
- svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
- svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
+ svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
+ svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
- svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
- svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
+ svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
+ svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
+ svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
// register API routes
svr->Get ("/health", handle_health);
@@ -3798,7 +3330,7 @@ int main(int argc, char ** argv) {
svr->Post("/v1/embeddings", handle_embeddings);
svr->Post("/tokenize", handle_tokenize);
svr->Post("/detokenize", handle_detokenize);
- if (!sparams.slot_save_path.empty()) {
+ if (!params.slot_save_path.empty()) {
// only enable slot endpoints if slot_save_path is set
svr->Post("/slots/:id_slot", handle_slots_action);
}
@@ -3806,12 +3338,12 @@ int main(int argc, char ** argv) {
//
// Start the server
//
- if (sparams.n_threads_http < 1) {
+ if (params.n_threads_http < 1) {
// +2 threads for monitoring endpoints
- sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+ params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
}
- log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
- svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+ log_data["n_threads_http"] = std::to_string(params.n_threads_http);
+ svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
LOG_INFO("HTTP server listening", log_data);
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index d8a2286e4..63fde9c9f 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -116,13 +116,6 @@ static inline void server_log(const char * level, const char * function, int lin
// chat template utils
//
-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-inline bool verify_custom_template(const std::string & tmpl) {
- llama_chat_message chat[] = {{"user", "test"}};
- int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
- return res >= 0;
-}
-
// Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
size_t alloc_size = 0;
@@ -260,6 +253,13 @@ static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
return i;
}
+static size_t common_part(const std::string & a, const std::string & b) {
+ size_t i;
+ for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+ return i;
+}
+
static bool ends_with(const std::string & str, const std::string & suffix) {
return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
}
diff --git a/examples/simple/README.md b/examples/simple/README.md
index 5d24b1046..49e24501c 100644
--- a/examples/simple/README.md
+++ b/examples/simple/README.md
@@ -3,7 +3,7 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
```bash
-./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
+./simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
...
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index b0f8e0fdc..69a92cf7d 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -6,28 +6,27 @@
 #include <string>
 #include <vector>
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
+ LOG_TEE("\n");
+}
+
int main(int argc, char ** argv) {
gpt_params params;
- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
- return 1 ;
- }
+ params.prompt = "Hello my name is";
+ params.n_predict = 32;
- if (argc >= 2) {
- params.model = argv[1];
- }
-
- if (argc >= 3) {
- params.prompt = argv[2];
- }
-
- if (params.prompt.empty()) {
- params.prompt = "Hello my name is";
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
}
// total length of the sequence including the prompt
- const int n_len = 32;
+ const int n_predict = params.n_predict;
// init LLM
@@ -36,9 +35,7 @@ int main(int argc, char ** argv) {
// initialize the model
- llama_model_params model_params = llama_model_default_params();
-
- // model_params.n_gpu_layers = 99; // offload all layers to the GPU
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -49,12 +46,7 @@ int main(int argc, char ** argv) {
// initialize the context
- llama_context_params ctx_params = llama_context_default_params();
-
- ctx_params.seed = 1234;
- ctx_params.n_ctx = 2048;
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
@@ -69,14 +61,14 @@ int main(int argc, char ** argv) {
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
- const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
+ const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
- LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
+ LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
- LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
+ LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
return 1;
}
@@ -115,7 +107,7 @@ int main(int argc, char ** argv) {
const auto t_main_start = ggml_time_us();
- while (n_cur <= n_len) {
+ while (n_cur <= n_predict) {
// sample the next token
{
auto n_vocab = llama_n_vocab(model);
@@ -134,7 +126,7 @@ int main(int argc, char ** argv) {
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of generation?
- if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+ if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
LOG_TEE("\n");
break;
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 12e46fbc9..0939a1a6a 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -27,7 +27,8 @@ struct seq_draft {
int main(int argc, char ** argv) {
gpt_params params;
- if (gpt_params_parse(argc, argv, params) == false) {
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index e2f85c682..b779f6bd4 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -302,7 +302,7 @@ static struct ggml_tensor * llama_build_train_graphs(
const int rope_mode = 0;
return ggml_rope_ext(
- ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
+ ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};
diff --git a/flake.lock b/flake.lock
index fd6e2a5f6..09047ab10 100644
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
- "lastModified": 1715865404,
- "narHash": "sha256-/GJvTdTpuDjNn84j82cU6bXztE0MSkdnTWClUCRub78=",
+ "lastModified": 1717285511,
+ "narHash": "sha256-iKzJcpdXih14qYVcZ9QC9XuZYnPc6T8YImb6dX166kw=",
"owner": "hercules-ci",
"repo": "flake-parts",
- "rev": "8dc45382d5206bd292f9c2768b8058a8fd8311d9",
+ "rev": "2a55567fcf15b1b1c7ed712a2c6fadaec7412ea8",
"type": "github"
},
"original": {
@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
- "lastModified": 1716509168,
- "narHash": "sha256-4zSIhSRRIoEBwjbPm3YiGtbd8HDWzFxJjw5DYSDy1n8=",
+ "lastModified": 1716948383,
+ "narHash": "sha256-SzDKxseEcHR5KzPXLwsemyTR/kaM9whxeiJohbL04rs=",
"owner": "NixOS",
"repo": "nixpkgs",
- "rev": "bfb7a882678e518398ce9a31a881538679f6f092",
+ "rev": "ad57eef4ef0659193044870c731987a6df5cf56b",
"type": "github"
},
"original": {
@@ -36,14 +36,14 @@
},
"nixpkgs-lib": {
"locked": {
- "lastModified": 1714640452,
- "narHash": "sha256-QBx10+k6JWz6u7VsohfSw8g8hjdBZEf8CFzXH1/1Z94=",
+ "lastModified": 1717284937,
+ "narHash": "sha256-lIbdfCsf8LMFloheeE6N31+BMIeixqyQWbSr2vk79EQ=",
"type": "tarball",
- "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz"
+ "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz"
},
"original": {
"type": "tarball",
- "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz"
+ "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz"
}
},
"root": {
diff --git a/flake.nix b/flake.nix
index 9cd3756e5..0a52ea52e 100644
--- a/flake.nix
+++ b/flake.nix
@@ -159,7 +159,6 @@
windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
}
// lib.optionalAttrs pkgs.stdenv.isLinux {
- opencl = config.packages.default.override { useOpenCL = true; };
cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
mpi-cpu = config.packages.default.override { useMpi = true; };
diff --git a/ggml-alloc.c b/ggml-alloc.c
index 1fbd376ed..73a3c1575 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
GGML_ASSERT(galloc->bufts != NULL);
- galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
+ galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
GGML_ASSERT(galloc->buffers != NULL);
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
// this tensor was allocated without ggml-backend
return;
}
- ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
+ ggml_backend_view_init(tensor);
}
} else {
if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
if (t->view_src == NULL) {
ggml_tallocr_alloc(&tallocr, t);
} else if (t->buffer == NULL) {
- ggml_backend_view_init(buffer, t);
+ ggml_backend_view_init(t);
}
} else {
if (t->view_src != NULL && t->buffer == NULL) {
// view of a pre-allocated tensor
- ggml_backend_view_init(buffer, t);
+ ggml_backend_view_init(t);
}
}
}
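
The `ggml_gallocr_new_n` change is a plain over-allocation fix: `calloc(count, size)` already multiplies its two arguments, so passing `sizeof(ggml_backend_buffer_t) * n_bufs` as the element size reserved `n_bufs` squared entries. A tiny illustration of the difference, with generic pointers standing in for the ggml types:

```cpp
#include <cstdlib>

int main() {
    const size_t n_bufs = 4;

    // before the fix: n_bufs elements of (sizeof(void *) * n_bufs) bytes each
    // -> room for n_bufs * n_bufs pointers, wasting memory
    void ** oversized = (void **) calloc(n_bufs, sizeof(void *) * n_bufs);

    // after the fix: exactly n_bufs pointers, all zero-initialized
    void ** buffers   = (void **) calloc(n_bufs, sizeof(void *));

    free(oversized);
    free(buffers);
    return 0;
}
```
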
diff --git a/ggml-backend.c b/ggml-backend.c
index 9e35ce98d..05737ed69 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
if (dst_buf->iface.cpy_tensor) {
- return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
}
return false;
}
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
// utils
-void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+void ggml_backend_view_init(struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL);
GGML_ASSERT(tensor->view_src->data != NULL);
- tensor->buffer = buffer;
+ tensor->buffer = tensor->view_src->buffer;
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
- ggml_backend_buffer_init_tensor(buffer, tensor);
+ ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
}
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
struct ggml_tensor * dst = node_copies[id];
if (dst->view_src != NULL) {
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
- ggml_backend_view_init(dst->view_src->buffer, dst);
+ ggml_backend_view_init(dst);
}
else {
ggml_backend_tensor_copy(src, dst);
diff --git a/ggml-backend.h b/ggml-backend.h
index 744b6a774..c582b0685 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -225,7 +225,7 @@ extern "C" {
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
- GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
#ifdef __cplusplus
diff --git a/ggml-common.h b/ggml-common.h
index 77e6bfba4..e8efceb76 100644
--- a/ggml-common.h
+++ b/ggml-common.h
@@ -123,12 +123,18 @@ typedef sycl::half2 ggml_half2;
#define QI1_S (QK_K / (4*QR1_S))
#define QR1_S 8
+#define QI1_M (QK_K / (4*QR1_M))
+#define QR1_M 8
+
#define QI4_NL (QK4_NL / (4*QR4_NL))
#define QR4_NL 2
#define QI4_XS (QK_K / (4*QR4_XS))
#define QR4_XS 8
+#define QI3_S (QK_K / (4*QR3_S))
+#define QR3_S 8
+
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
#define QK4_0 32
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index daaa0cd6a..dad8a9e2d 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -633,88 +633,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
// cuda split buffer
-static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
- int64_t min_compute_capability = INT_MAX;
- int64_t max_compute_capability = INT_MIN;
+static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
+ int64_t row_rounding = 0;
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
- if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
- if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
- min_compute_capability = ggml_cuda_info().devices[id].cc;
- }
- if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
- max_compute_capability = ggml_cuda_info().devices[id].cc;
- }
+ if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
+ continue;
}
- }
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
- switch(type) {
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q5_0:
- case GGML_TYPE_Q5_1:
- case GGML_TYPE_Q8_0:
- return max_compute_capability >= CC_RDNA2 ? 128 : 64;
- case GGML_TYPE_F16:
- case GGML_TYPE_F32:
- return 1;
- case GGML_TYPE_Q2_K:
- return max_compute_capability >= CC_RDNA2 ? 128 : 32;
- case GGML_TYPE_Q3_K:
- return min_compute_capability < CC_RDNA2 ? 128 : 64;
- case GGML_TYPE_Q4_K:
- case GGML_TYPE_Q5_K:
- case GGML_TYPE_Q6_K:
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_IQ4_NL:
- case GGML_TYPE_IQ4_XS:
- case GGML_TYPE_IQ3_S:
- return max_compute_capability >= CC_RDNA2 ? 128 : 64;
- default:
- GGML_ASSERT(false);
+ const int cc = ggml_cuda_info().devices[id].cc;
+ row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
}
-#else
- switch(type) {
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1:
- return max_compute_capability >= CC_VOLTA ? 128 : 64;
- case GGML_TYPE_Q5_0:
- case GGML_TYPE_Q5_1:
- case GGML_TYPE_Q8_0:
- return 64;
- case GGML_TYPE_F16:
- case GGML_TYPE_F32:
- return 1;
- case GGML_TYPE_Q2_K:
- case GGML_TYPE_Q3_K:
- case GGML_TYPE_Q4_K:
- case GGML_TYPE_Q5_K:
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_IQ4_NL:
- case GGML_TYPE_IQ4_XS:
- case GGML_TYPE_IQ3_S:
- return max_compute_capability >= CC_VOLTA ? 128 : 64;
- case GGML_TYPE_Q6_K:
- return 64;
- default:
- GGML_ASSERT(false);
- }
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ return row_rounding;
}
static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
const int64_t nrows = ggml_nrows(tensor);
- const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+ const int64_t rounding = get_row_rounding(tensor_split);
*row_low = id == 0 ? 0 : nrows*tensor_split[id];
*row_low -= *row_low % rounding;
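With the type argument gone, row rounding for --split-mode row now depends only on the compute capabilities of the participating GPUs: it is the maximum MMQ tile height across devices. A hedged illustration of how a single device's value is derived (the helpers are added in the ggml-cuda/common.cuh hunk further down; exact numbers depend on build flags):

    // Illustrative only: per-device rounding under the new scheme.
    static int64_t rounding_for_device(const int cc) {
        const int mmq_x = get_mmq_x_max_host(cc); // 64 with tensor cores, 128 without, for cc >= CC_VOLTA
        return get_mmq_y_host(cc, mmq_x);         // 128 for Volta or newer, 64 otherwise
    }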
@@ -1499,7 +1433,7 @@ static void ggml_cuda_op_mul_mat(
// for multi GPU, get the row boundaries from tensor split
// and round to mul_mat_q tile sizes
if (split) {
- const int64_t rounding = get_row_rounding(src0->type, tensor_split);
+ const int64_t rounding = get_row_rounding(tensor_split);
if (id != 0) {
dev[id].row_low = ne01*tensor_split[id];
@@ -2702,10 +2636,8 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
if (cuda_graph_update_required) {
// Extract nodes from graph
- if (cuda_ctx->cuda_graph->num_nodes == 0) {
- // First call with null argument gets number of nodes in graph
- CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
- }
+ // First call with null argument gets number of nodes in graph
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
// Subsequent call with non-null argument gets nodes
cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
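The CUDA graph change above removes the num_nodes == 0 guard, so the node count is re-queried on every required graph update rather than only the first one, keeping num_nodes correct when the captured graph's topology changes. A standalone sketch of the underlying two-call cudaGraphGetNodes() pattern (variable names are illustrative, CUDA_CHECK as in the surrounding code):

    size_t num_nodes = 0;
    CUDA_CHECK(cudaGraphGetNodes(graph, nullptr, &num_nodes));      // null array: query count only
    std::vector<cudaGraphNode_t> nodes(num_nodes);
    CUDA_CHECK(cudaGraphGetNodes(graph, nodes.data(), &num_nodes)); // second call fills the array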
diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh
index 22872ca5c..90a0a81ea 100644
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@@ -160,7 +160,7 @@
#endif
#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
-#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
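MMQ_MAX_BATCH_SIZE doubles from 32 to 64, so on tensor-core builds the quantized MMQ kernels now cover batches up to 64 before other paths take over. A hedged sketch of the kind of gate this constant feeds (not the actual dispatch code in ggml-cuda.cu):

    // Illustrative only: MMQ is preferred while the batch (src1 rows) stays small.
    static bool prefer_mmq(const int64_t ne11, const bool tensor_cores) {
        return !tensor_cores || ne11 <= MMQ_MAX_BATCH_SIZE;
    }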
@@ -484,6 +484,161 @@ static __device__ __forceinline__ float get_alibi_slope(
return powf(base, exph);
}
+template <ggml_type type>
+struct ggml_cuda_type_traits;
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_F16> {
+ static constexpr int qk = 1;
+ static constexpr int qr = 1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
+ static constexpr int qk = QK4_0;
+ static constexpr int qr = QR4_0;
+ static constexpr int qi = QI4_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
+ static constexpr int qk = QK4_1;
+ static constexpr int qr = QR4_1;
+ static constexpr int qi = QI4_1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
+ static constexpr int qk = QK5_0;
+ static constexpr int qr = QR5_0;
+ static constexpr int qi = QI5_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
+ static constexpr int qk = QK5_1;
+ static constexpr int qr = QR5_1;
+ static constexpr int qi = QI5_1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
+ static constexpr int qk = QK8_0;
+ static constexpr int qr = QR8_0;
+ static constexpr int qi = QI8_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR2_K;
+ static constexpr int qi = QI2_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR3_K;
+ static constexpr int qi = QI3_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR4_K;
+ static constexpr int qi = QI4_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR5_K;
+ static constexpr int qi = QI5_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR6_K;
+ static constexpr int qi = QI6_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR2_XXS;
+ static constexpr int qi = QI2_XXS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR2_XS;
+ static constexpr int qi = QI2_XS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR2_S;
+ static constexpr int qi = QI2_S;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR3_XXS;
+ static constexpr int qi = QI3_XXS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR1_S;
+ static constexpr int qi = QI1_S;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR1_M;
+ static constexpr int qi = QI1_M;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
+ static constexpr int qk = QK4_NL;
+ static constexpr int qr = QR4_NL;
+ static constexpr int qi = QI4_NL;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR4_XS;
+ static constexpr int qi = QI4_XS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR3_S;
+ static constexpr int qi = QI3_S;
+};
+
+static int get_mmq_x_max_host(const int cc) {
+#ifdef CUDA_USE_TENSOR_CORES
+ return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
+#else
+ return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
+#endif // CUDA_USE_TENSOR_CORES
+}
+
+// Round rows to this value for --split-mode row:
+static int get_mmq_y_host(const int cc, const int mmq_x) {
+ return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
+}
+
//////////////////////
struct ggml_cuda_device_info {
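The ggml_cuda_type_traits table added above lets kernels be templated on ggml_type alone and pull qk/qr/qi as compile-time constants, as the dmmv.cu change below does for quantized types (the GGML_TYPE_F16 specialization only defines qk and qr). A hypothetical kernel skeleton showing the intended usage, not part of this patch:

    template <ggml_type type>
    static __global__ void example_kernel(const void * vx, float * dst, const int ncols) {
        constexpr int qk = ggml_cuda_type_traits<type>::qk; // weights per quant block
        constexpr int qr = ggml_cuda_type_traits<type>::qr; // weights per data value
        constexpr int qi = ggml_cuda_type_traits<type>::qi; // 32-bit ints of quant data per block
        // ... use qk/qr/qi to size per-thread work at compile time ...
    }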
diff --git a/ggml-cuda/dmmv.cu b/ggml-cuda/dmmv.cu
index 47d4d5d9e..174489e06 100644
--- a/ggml-cuda/dmmv.cu
+++ b/ggml-cuda/dmmv.cu
@@ -422,10 +422,22 @@ static __device__ void convert_f16(const void * vx, const int64_t ib, const int
v.y = x[ib + iqs + 1];
}
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
+ return type == GGML_TYPE_Q4_0 ? dequantize_q4_0 :
+ type == GGML_TYPE_Q4_1 ? dequantize_q4_1 :
+ type == GGML_TYPE_Q5_0 ? dequantize_q5_0 :
+ type == GGML_TYPE_Q5_1 ? dequantize_q5_1 :
+ type == GGML_TYPE_Q8_0 ? dequantize_q8_0 :
+ type == GGML_TYPE_F16 ? convert_f16 :
+ nullptr;
+}
+
+template <ggml_type type>
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
- // qk = quantized weights per x block
- // qr = number of quantized weights per data value in x block
+ constexpr int qk = ggml_cuda_type_traits<type>::qk; // quantized weights per x block