diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
index 491d67676..01b3111d9 100644
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
- apt-get install -y build-essential python3 python3-pip
+ apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile
index 2e629f8ce..fc34a0c18 100644
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
- apt-get install -y build-essential
+ apt-get install -y build-essential git
WORKDIR /app
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index a5938bf93..c98cbcbbe 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
push:
branches:
- master
- paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+ paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
pull_request:
types: [opened, synchronize, reopened]
- paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+ paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -151,21 +151,21 @@ jobs:
env:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
- CLBLAST_VERSION: 1.5.3
+ CLBLAST_VERSION: 1.6.0
strategy:
matrix:
include:
- build: 'avx2'
- defines: ''
+ defines: '-DLLAMA_BUILD_SERVER=ON'
- build: 'avx'
- defines: '-DLLAMA_AVX2=OFF'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
- build: 'avx512'
- defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast'
- defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas'
- defines: '-DLLAMA_OPENBLAS=ON -DBLAS_LIBRARIES="/LIBPATH:$env:RUNNER_TEMP/openblas/lib" -DOPENBLAS_INC="$env:RUNNER_TEMP/openblas/include"'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
steps:
- name: Clone
@@ -184,13 +184,13 @@ jobs:
id: get_clblast
if: ${{ matrix.build == 'clblast' }}
run: |
- curl.exe -o $env:RUNNER_TEMP/clblast.zip -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-Windows-x64.zip"
+ curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
- mkdir $env:RUNNER_TEMP/clblast
- tar.exe -xvf $env:RUNNER_TEMP/clblast.zip -C $env:RUNNER_TEMP/clblast
+ 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
+ rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
$txt = Get-Content -Path $f -Raw
- $txt.Replace('C:/dependencies/opencl/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
+ $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
}
- name: Download OpenBLAS
@@ -213,7 +213,6 @@ jobs:
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
- cp ../LICENSE ./bin/Release/llama.cpp.txt
- name: Add clblast.dll
id: add_clblast_dll
@@ -258,6 +257,7 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
+ Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
@@ -292,7 +292,7 @@ jobs:
run: |
mkdir build
cd build
- cmake .. -DLLAMA_CUBLAS=ON
+ cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
cmake --build . --config Release
- name: Get commit hash
diff --git a/BLIS.md b/BLIS.md
new file mode 100644
index 000000000..9b3c30605
--- /dev/null
+++ b/BLIS.md
@@ -0,0 +1,67 @@
+BLIS Installation Manual
+------------------------
+
+BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as an object-based API, a typed API, and BLAS and CBLAS compatibility layers.
+
+Project URL: https://github.com/flame/blis
+
+### Prepare:
+
+Compile BLIS:
+
+```bash
+git clone https://github.com/flame/blis
+cd blis
+./configure --enable-cblas -t openmp,pthreads auto
+# will install to /usr/local/ by default.
+make -j
+```
+
+Install BLIS:
+
+```bash
+sudo make install
+```
+
+We recommend using OpenMP, since it makes it easier to adjust the number of cores being used.
+
+### llama.cpp compilation
+
+Makefile:
+
+```bash
+make LLAMA_BLIS=1 -j
+# make LLAMA_BLIS=1 benchmark-matmult
+```
+
+CMake:
+
+```bash
+mkdir build
+cd build
+cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
+make -j
+```
+
+### llama.cpp execution
+
+According to the BLIS documentation, we can set the following
+environment variables to control the OpenMP behavior:
+
+```
+export GOMP_CPU_AFFINITY="0-19"
+export BLIS_NUM_THREADS=14
+```
+
+And then run the binaries as normal.
+
+
+### Intel-specific issue
+
+Some users may get an error message saying that `libimf.so` cannot be found.
+If so, please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
+
+### Reference:
+
+1. https://github.com/flame/blis#getting-started
+2. https://github.com/flame/blis/blob/master/docs/Multithreading.md
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 48e3238df..21f4ec9dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,40 +37,44 @@ endif()
#
# general
-option(LLAMA_STATIC "llama: static link libraries" OFF)
-option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
-option(LLAMA_LTO "llama: enable link time optimization" OFF)
+option(LLAMA_STATIC "llama: static link libraries" OFF)
+option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
+option(LLAMA_LTO "llama: enable link time optimization" OFF)
# debug
-option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
-option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
-option(LLAMA_GPROF "llama: enable gprof" OFF)
+option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
+option(LLAMA_GPROF "llama: enable gprof" OFF)
# sanitizers
-option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
-option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
-option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
+option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
+option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
+option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
# instruction set specific
-option(LLAMA_AVX "llama: enable AVX" ON)
-option(LLAMA_AVX2 "llama: enable AVX2" ON)
-option(LLAMA_AVX512 "llama: enable AVX512" OFF)
-option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
-option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
-option(LLAMA_FMA "llama: enable FMA" ON)
+option(LLAMA_AVX "llama: enable AVX" ON)
+option(LLAMA_AVX2 "llama: enable AVX2" ON)
+option(LLAMA_AVX512 "llama: enable AVX512" OFF)
+option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
+option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
+option(LLAMA_FMA "llama: enable FMA" ON)
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
- option(LLAMA_F16C "llama: enable F16C" ON)
+ option(LLAMA_F16C "llama: enable F16C" ON)
endif()
# 3rd party libs
-option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
-option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
-option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
+option(LLAMA_BLAS "llama: use BLAS" OFF)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
+option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
+set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
+set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
-option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
#
# Build info header
@@ -145,36 +149,28 @@ if (APPLE AND LLAMA_ACCELERATE)
endif()
endif()
-if (LLAMA_OPENBLAS)
+if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
-
- set(BLA_VENDOR OpenBLAS)
+ if (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.22)
+ set(BLA_SIZEOF_INTEGER 8)
+ endif()
+ set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
find_package(BLAS)
if (BLAS_FOUND)
- message(STATUS "OpenBLAS found")
+ message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
+ add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
- add_link_options(${BLAS_LIBRARIES})
- set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
+ set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
- # find header file
- set(OPENBLAS_INCLUDE_SEARCH_PATHS
- /usr/include
- /usr/include/openblas
- /usr/include/openblas-base
- /usr/local/include
- /usr/local/include/openblas
- /usr/local/include/openblas-base
- /opt/OpenBLAS/include
- $ENV{OpenBLAS_HOME}
- $ENV{OpenBLAS_HOME}/include
- )
- find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
- add_compile_options(-I${OPENBLAS_INC})
+ message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
+ include_directories(${BLAS_INCLUDE_DIRS})
else()
- message(WARNING "OpenBLAS not found")
+ message(WARNING "BLAS not found, please refer to "
+ "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+ " to set correct LLAMA_BLAS_VENDOR")
endif()
endif()
@@ -190,6 +186,8 @@ if (LLAMA_CUBLAS)
set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
add_compile_definitions(GGML_USE_CUBLAS)
+ add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+ add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -207,7 +205,7 @@ if (LLAMA_CLBLAST)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
- set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
+ set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
add_compile_definitions(GGML_USE_CLBLAST)
diff --git a/Makefile b/Makefile
index 0ddff9961..8e8d426c5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,11 @@
# Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+
+ifdef LLAMA_BUILD_SERVER
+ BUILD_TARGETS += server
+endif
+
+default: $(BUILD_TARGETS)
ifndef UNAME_S
UNAME_S := $(shell uname -s)
@@ -38,7 +44,11 @@ CFLAGS = -I. -O3 -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
LDFLAGS =
-ifndef LLAMA_DEBUG
+ifdef LLAMA_DEBUG
+ CFLAGS += -O0 -g
+ CXXFLAGS += -O0 -g
+ LDFLAGS += -g
+else
CFLAGS += -DNDEBUG
CXXFLAGS += -DNDEBUG
endif
@@ -74,6 +84,15 @@ ifeq ($(UNAME_S),Haiku)
CXXFLAGS += -pthread
endif
+ifdef LLAMA_GPROF
+ CFLAGS += -pg
+ CXXFLAGS += -pg
+endif
+ifdef LLAMA_PERF
+ CFLAGS += -DGGML_PERF
+ CXXFLAGS += -DGGML_PERF
+endif
+
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
@@ -106,13 +125,17 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif
ifdef LLAMA_OPENBLAS
- CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
+ CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
LDFLAGS += -lopenblas -lcblas
else
LDFLAGS += -lopenblas
endif
endif
+ifdef LLAMA_BLIS
+ CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+ LDFLAGS += -lblis -L/usr/local/lib
+endif
ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
@@ -120,11 +143,22 @@ ifdef LLAMA_CUBLAS
OBJS += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+ifdef LLAMA_CUDA_DMMV_X
+ NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+else
+ NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
+endif # LLAMA_CUDA_DMMV_X
+ifdef LLAMA_CUDA_DMMV_Y
+ NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
+else
+ NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
+endif # LLAMA_CUDA_DMMV_Y
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
-endif
+endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
+ CXXFLAGS += -DGGML_USE_CLBLAST
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
LDFLAGS += -lclblast -framework OpenCL
@@ -132,16 +166,8 @@ ifdef LLAMA_CLBLAST
LDFLAGS += -lclblast -lOpenCL
endif
OBJS += ggml-opencl.o
-ggml-opencl.o: ggml-opencl.c ggml-opencl.h
- $(CC) $(CFLAGS) -c $< -o $@
-endif
-ifdef LLAMA_GPROF
- CFLAGS += -pg
- CXXFLAGS += -pg
-endif
-ifdef LLAMA_PERF
- CFLAGS += -DGGML_PERF
- CXXFLAGS += -DGGML_PERF
+ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
# Apple M1, M2, etc.
@@ -194,7 +220,7 @@ libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean:
- rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+ rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
#
# Examples
@@ -221,6 +247,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \
@@ -240,6 +269,6 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-.PHONY: tests
+.PHONY: tests clean
tests:
bash ./tests/run-tests.sh
diff --git a/README.md b/README.md
index 1d84a5e6d..00571d8e1 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
**Hot topics:**
+- Quantization formats `Q4` and `Q8` have changed again (19 May) - [(info)](https://github.com/ggerganov/llama.cpp/pull/1508)
- Quantization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
- [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
@@ -55,7 +56,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
- Mixed F16 / F32 precision
- 4-bit, 5-bit and 8-bit integer quantization support
- Runs on the CPU
-- OpenBLAS support
+- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
- cuBLAS and CLBlast support
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
@@ -80,6 +81,7 @@ as the main playground for developing new features for the [ggml](https://github
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
+- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
**Bindings:**
@@ -238,11 +240,11 @@ In order to build llama.cpp you have three different options.
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
-- Accelerate Framework:
+- **Accelerate Framework**:
This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
-- OpenBLAS:
+- **OpenBLAS**:
This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
@@ -272,11 +274,26 @@ Building the program with BLAS support may lead to some performance improvements
```bash
mkdir build
cd build
- cmake .. -DLLAMA_OPENBLAS=ON
+ cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build . --config Release
```
-- cuBLAS
+- **BLIS**
+
+ Check [BLIS.md](BLIS.md) for more information.
+
+- **Intel MKL**
+
+  By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DLLAMA_BLAS=ON` to CMake, the MKL version of BLAS will be selected automatically. You may also specify it explicitly:
+
+ ```bash
+ mkdir build
+ cd build
+ cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+  cmake --build . --config Release
+ ```
+
+- **cuBLAS**
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
- Using `make`:
@@ -291,8 +308,81 @@ Building the program with BLAS support may lead to some performance improvements
cmake .. -DLLAMA_CUBLAS=ON
cmake --build . --config Release
```
+ Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
-Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
+- **CLBlast**
+
+ OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
+
+ You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
+ - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+
+  - Installing the OpenCL SDK from source:
+
+ ```sh
+ git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
+ mkdir OpenCL-SDK/build
+ cd OpenCL-SDK/build
+ cmake .. -DBUILD_DOCS=OFF \
+ -DBUILD_EXAMPLES=OFF \
+ -DBUILD_TESTING=OFF \
+ -DOPENCL_SDK_BUILD_SAMPLES=OFF \
+ -DOPENCL_SDK_TEST_SAMPLES=OFF
+ cmake --build . --config Release
+ cmake --install . --prefix /some/path
+ ```
+
+
+ Installing CLBlast: it may be found in your operating system's packages.
+
+  - If not, then install it from source:
+
+ ```sh
+ git clone https://github.com/CNugteren/CLBlast.git
+ mkdir CLBlast/build
+    cd CLBlast/build
+ cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
+ cmake --build . --config Release
+ cmake --install . --prefix /some/path
+ ```
+
+  Where `/some/path` is where the built library will be installed (default is `/usr/local`).
+
+
+ Building:
+
+ - Build with make:
+ ```sh
+ make LLAMA_CLBLAST=1
+ ```
+ - CMake:
+ ```sh
+ mkdir build
+ cd build
+ cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
+ cmake --build . --config Release
+ ```
+
+ Running:
+
+ The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
+
+ To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
+ The selection can be a number (starting from 0) or a text string to search:
+
+ ```sh
+ GGML_OPENCL_PLATFORM=1 ./main ...
+ GGML_OPENCL_DEVICE=2 ./main ...
+ GGML_OPENCL_PLATFORM=Intel ./main ...
+ GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
+ ```
+
+ The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
+ Using the variables it is possible to select a CPU-based driver as well, if so desired.
+
+ You can get a list of platforms and devices from the `clinfo -l` command, etc.
### Prepare Data & Run
@@ -333,16 +423,16 @@ Several quantization methods are supported. They differ in the resulting model d
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
-| 7B | perplexity | 5.9066 | 6.1565 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
-| 7B | file size | 13.0G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G |
-| 7B | ms/tok @ 4th | 128 | 50 | 54 | 75 | 83 | 75 |
-| 7B | ms/tok @ 8th | 123 | 44 | 52 | 53 | 58 | 72 |
-| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 |
-| 13B | perplexity | 5.2543 | 5.3860 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
-| 13B | file size | 25.0G | 7.6G | 9.1G | 8.4G | 9.1G | 14G |
-| 13B | ms/tok @ 4th | 239 | 93 | 101 | 150 | 164 | 141 |
-| 13B | ms/tok @ 8th | 240 | 81 | 96 | 96 | 104 | 136 |
-| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 |
+| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
+| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
+| 7B | ms/tok @ 4th | 127 | 55 | 54 | 76 | 83 | 72 |
+| 7B | ms/tok @ 8th | 122 | 43 | 45 | 52 | 56 | 67 |
+| 7B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
+| 13B | perplexity | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
+| 13B | file size | 25.0G | 6.8G | 7.6G | 8.3G | 9.1G | 13G |
+| 13B | ms/tok @ 4th | - | 103 | 105 | 148 | 160 | 131 |
+| 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
+| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
### Perplexity (measuring model quality)
@@ -374,6 +464,25 @@ Note the use of `--color` to distinguish between user input and generated text.

+### Persistent Interaction
+
+The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
+
+```bash
+# Start a new chat
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
+
+# Resume that chat
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
+
+# Start a different chat with the same prompt/model
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
+
+# Different prompt cache for different prompt/model
+PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
+ CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
+```
+
### Instruction mode with Alpaca
1. First, download the `ggml` Alpaca model into the `./models` folder
diff --git a/convert.py b/convert.py
index 8f4f0399e..ece5a0266 100644
--- a/convert.py
+++ b/convert.py
@@ -121,7 +121,6 @@ def make_tensors_list() -> List[str]:
f'layers.{i}.feed_forward.w1.weight',
f'layers.{i}.feed_forward.w2.weight',
f'layers.{i}.feed_forward.w3.weight',
- f'layers.{i}.atttention_norm.weight',
f'layers.{i}.ffn_norm.weight',
]
return ret
@@ -1055,7 +1054,7 @@ def load_some_model(path: Path) -> ModelPlus:
files = list(path.glob("model-00001-of-*.safetensors"))
if not files:
# Try the PyTorch patterns too, with lower priority
- globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"]
+ globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin" ]
files = [file for glob in globs for file in path.glob(glob)]
if not files:
# Try GGML too, but with lower priority, since if both a non-GGML
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 74d0350d8..e4ce5aca7 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -37,4 +37,7 @@ else()
add_subdirectory(save-load-state)
add_subdirectory(benchmark)
add_subdirectory(baby-llama)
+ if(LLAMA_BUILD_SERVER)
+ add_subdirectory(server)
+ endif()
endif()
diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp
index 03f93c749..aa3b23789 100644
--- a/examples/baby-llama/baby-llama-text.cpp
+++ b/examples/baby-llama/baby-llama-text.cpp
@@ -2104,7 +2104,7 @@ struct train_params {
int n_batch;
int n_examples;
int n_predict;
-
+
int print_info_interval;
int print_details_interval;
@@ -2148,7 +2148,7 @@ struct train_params get_default_train_params() {
params.n_batch = 8;
params.n_examples = 8;
params.n_predict = 1024;
-
+
params.print_info_interval = 1;
params.print_details_interval = 2;
@@ -2621,9 +2621,9 @@ int main(int argc, char ** argv) {
opt->params.adam.sched = (opt->iter < params.warmup)
? (float) opt->iter / (float) params.warmup
: cosine_decay_restart(
- params.cos_decay_steps,
- params.cos_decay_alpha,
- opt->iter - params.warmup,
+ params.cos_decay_steps,
+ params.cos_decay_alpha,
+ opt->iter - params.warmup,
params.cos_decay_restart);
printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched);
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 6117ae3ab..9f9ed9db0 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,6 +1,7 @@
-#include
#include "ggml.h"
#include "build-info.h"
+
+#include
#include
#include
#include
@@ -15,7 +16,7 @@
#include
#include
-float tensor_sum_elements(struct ggml_tensor * tensor) {
+float tensor_sum_elements(const ggml_tensor * tensor) {
float sum = 0;
if (tensor->type==GGML_TYPE_F32) {
for (int j = 0; j < tensor->ne[1]; j++) {
@@ -27,21 +28,15 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
return sum;
}
+void tensor_dump(const ggml_tensor * tensor, const char * name) {
+ printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", name,
+ tensor->type, ggml_type_name(tensor->type),
+ (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+ float sum = tensor_sum_elements(tensor);
+ printf("Sum of tensor %s is %6.2f\n", name, sum);
+}
-/*
- These are mapping to unknown
- GGML_TYPE_I8,
- GGML_TYPE_I16,
- GGML_TYPE_I32,
- GGML_TYPE_COUNT,
-*/
-
-#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
-
-#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
- TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
- (int) TENSOR->ne[0], (int) TENSOR->ne[1], (int) TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
- { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
+#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
struct benchmark_params_struct {
int32_t n_threads = 1;
@@ -59,8 +54,6 @@ void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct para
}
int main(int argc, char ** argv) {
-
-
struct benchmark_params_struct benchmark_params;
bool invalid_param = false;
@@ -84,11 +77,11 @@ int main(int argc, char ** argv) {
print_usage(argc, argv, benchmark_params);
exit(0);
}
- if (invalid_param) {
- fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
- print_usage(argc, argv, benchmark_params);
- exit(1);
- }
+ }
+ if (invalid_param) {
+ fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+ print_usage(argc, argv, benchmark_params);
+ exit(1);
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -216,10 +209,10 @@ int main(int argc, char ** argv) {
// Let's use the F32 result from above as a reference for the q4_0 multiplication
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
+ printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
+ printf("=====================================================================================\n");
- printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
- printf("==============================================================================================\n");
-
+ double gflops_sum = 0;
for (int i=0;i<benchmark_params.n_iterations;i++) {
diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh
new file mode 100755
--- /dev/null
+++ b/examples/chat-persistent.sh
+#!/bin/bash
+
+set -euo pipefail
+
+cd "$(dirname "$0")/.." || exit
+
+if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
+ echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
+ exit 1
+fi
+
+MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
+PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
+USER_NAME="${USER_NAME:-User}"
+AI_NAME="${AI_NAME:-ChatLLaMa}"
+DATE_TIME="$(date +%H:%M)"
+DATE_YEAR="$(date +%Y)"
+
+LOG="${CHAT_SAVE_DIR}/main.log"
+LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
+CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
+CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
+NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
+NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
+
+SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
+SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
+SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
+
+CTX_SIZE=2048
+CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
+OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
+
+# An unbuffered `tail -c+N`
+skip_bytes() {
+ LANG=C IFS= read -r -n "$1" -d '' c
+ while LANG=C IFS= read -r -n 1 -d '' c; do
+ printf '%s' "$c"
+ done
+}
+
+mkdir -p "$CHAT_SAVE_DIR"
+echo >"$LOG"
+trap "tail -n100 ${LOG}" EXIT
+
+if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
+ sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
+ -e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
+ -e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
+ -e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
+ "$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
+fi
+
+if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
+ sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
+fi
+
+if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
+ echo '...' >>"$NEXT_PROMPT_FILE"
+fi
+
+if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
+ echo 'Prompt cache does not exist, building...'
+ # Default batch_size to 8 here for better user feedback during initial prompt processing
+ ./main 2>>"$LOG" \
+ --batch_size 8 \
+ "${OPTS[@]}" \
+ --prompt-cache "$PROMPT_CACHE_FILE" \
+ --file "$CUR_PROMPT_FILE" \
+ --n_predict 1
+ echo
+ echo 'Done!'
+fi
+
+if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
+ cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
+fi
+if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
+ cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
+fi
+
+printf '%s ' "$(< "$CUR_PROMPT_FILE")"
+n_tokens=0
+
+while read -e line; do
+ # Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
+ n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
+
+ # Swap prompts when we're about to run out of context
+ if ((n_predict <= 0)); then
+ wait # for background main (below) to finish with next prompt
+ mv "$NEXT_PROMPT_FILE" "$CUR_PROMPT_FILE"
+ mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"
+
+ sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
+ echo '...' >>"$NEXT_PROMPT_FILE"
+ cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
+
+ n_tokens=0
+ n_predict=$((CTX_SIZE / 2))
+ fi
+
+ echo " ${line}" >>"$CUR_PROMPT_FILE"
+ if ((n_tokens > CTX_ROTATE_POINT)); then
+ echo " ${line}" >>"$NEXT_PROMPT_FILE"
+ fi
+
+ n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))
+
+ printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
+
+ ./main 2>>"$LOG" "${OPTS[@]}" \
+ --prompt-cache "$CUR_PROMPT_CACHE" \
+ --prompt-cache-all \
+ --file "$CUR_PROMPT_FILE" \
+ --reverse-prompt "${USER_NAME}:" \
+ --n_predict "$n_predict" |
+ skip_bytes 1 | # skip BOS token added by ./main
+ tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
+ skip_bytes "$n_prompt_len_pre" # print generation
+
+ mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"
+
+ # if we hit n_predict instead of reverse-prompt, we need to add the prompt
+ if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
+ printf '\n%s:' "$USER_NAME"
+ printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
+ fi
+
+ printf ' '
+
+ # HACK get num tokens from debug message
+ # TODO get both messages in one go
+ if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
+ ! sample_time_msg="$( tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
+ echo >&2 "Couldn't get number of tokens from ./main output!"
+ exit 1
+ fi
+
+ n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
+
+ if ((n_tokens > CTX_ROTATE_POINT)); then
+ tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
+ fi
+
+ # Update cache for next prompt in background, ideally during user input
+ ./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
+ --prompt-cache "$NEXT_PROMPT_CACHE" \
+ --file "$NEXT_PROMPT_FILE" \
+ --n_predict 1 &
+done
diff --git a/examples/common.cpp b/examples/common.cpp
index 86c1eef41..32247cef7 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -8,6 +8,7 @@
#include
#include
#include
+#include <unordered_set>
#if defined(__APPLE__) && defined(__MACH__)
#include
@@ -28,21 +29,21 @@
int32_t get_num_physical_cores() {
#ifdef __linux__
- std::ifstream cpuinfo("/proc/cpuinfo");
- std::string line;
- while (std::getline(cpuinfo, line)) {
- std::size_t pos = line.find("cpu cores");
- if (pos != std::string::npos) {
- pos = line.find(": ", pos);
- if (pos != std::string::npos) {
- try {
- // Extract the number and return it
- return static_cast<int32_t>(std::stoul(line.substr(pos + 2)));
- } catch (const std::invalid_argument &) {
- // Ignore if we could not parse
- }
- }
+ // enumerate the set of thread siblings, num entries is num cores
+ std::unordered_set<std::string> siblings;
+ for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
+ std::ifstream thread_siblings("/sys/devices/system/cpu"
+ + std::to_string(cpu) + "/topology/thread_siblings");
+ if (!thread_siblings.is_open()) {
+ break; // no more cpus
}
+ std::string line;
+ if (std::getline(thread_siblings, line)) {
+ siblings.insert(line);
+ }
+ }
+ if (siblings.size() > 0) {
+ return static_cast<int32_t>(siblings.size());
}
#elif defined(__APPLE__) && defined(__MACH__)
int32_t num_physical_cores;
@@ -250,6 +251,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.model = argv[i];
+ } else if (arg == "-a" || arg == "--alias") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.model_alias = argv[i];
} else if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
@@ -282,7 +289,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers = std::stoi(argv[i]);
+#else
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {
@@ -320,12 +332,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
- } else if (arg == "--n-parts") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_parts = std::stoi(argv[i]);
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, default_params);
exit(0);
@@ -356,7 +362,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
if (params.prompt_cache_all &&
(params.interactive || params.interactive_first ||
- params.instruct || params.antiprompt.size())) {
+ params.instruct)) {
fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
gpt_print_usage(argc, argv, default_params);
exit(1);
@@ -378,8 +384,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
fprintf(stderr, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
- fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
- fprintf(stderr, " specified more than once for multiple prompts).\n");
+ fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n");
+ fprintf(stderr, " (can be specified more than once for multiple prompts).\n");
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
@@ -415,9 +421,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
- fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value\n");
+ fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
+ fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
- fprintf(stderr, " --n-parts N number of model parts (default: -1 = determine from dimensions)\n");
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
@@ -427,8 +433,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
if (llama_mmap_supported()) {
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
fprintf(stderr, " number of layers to store in VRAM\n");
+#endif
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
@@ -472,7 +480,6 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
- lparams.n_parts = params.n_parts;
lparams.n_gpu_layers = params.n_gpu_layers;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
@@ -585,6 +592,37 @@ void console_set_color(console_state & con_st, console_color_t color) {
}
char32_t getchar32() {
+#if defined(_WIN32)
+ HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
+ wchar_t high_surrogate = 0;
+
+ while (true) {
+ INPUT_RECORD record;
+ DWORD count;
+ if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
+ return WEOF;
+ }
+
+ if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
+ wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
+ if (wc == 0) {
+ continue;
+ }
+
+ if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
+ high_surrogate = wc;
+ continue;
+ } else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
+ if (high_surrogate != 0) { // Check if we have a high surrogate
+ return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
+ }
+ }
+
+ high_surrogate = 0; // Reset the high surrogate
+ return static_cast<char32_t>(wc);
+ }
+ }
+#else
wchar_t wc = getwchar();
if (static_cast<wint_t>(wc) == WEOF) {
return WEOF;
@@ -603,6 +641,7 @@ char32_t getchar32() {
#endif
return static_cast<char32_t>(wc);
+#endif
}
void pop_cursor(console_state & con_st) {
@@ -756,7 +795,7 @@ bool console_readline(console_state & con_st, std::string & line) {
break;
}
- if (input_char == WEOF || input_char == 0x04 /* Ctrl+D*/) {
+ if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
end_of_stream = true;
break;
}
@@ -771,7 +810,7 @@ bool console_readline(console_state & con_st, std::string & line) {
char32_t code = getchar32();
if (code == '[' || code == 0x1B) {
// Discard the rest of the escape sequence
- while ((code = getchar32()) != WEOF) {
+ while ((code = getchar32()) != (char32_t) WEOF) {
if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
break;
}
diff --git a/examples/common.h b/examples/common.h
index 717838f06..fea9aa81a 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -24,7 +24,6 @@ struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
- int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -45,15 +44,16 @@ struct gpt_params {
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
- std::string model = "models/lamma-7B/ggml-model.bin"; // model path
- std::string prompt = "";
+ std::string model = "models/7B/ggml-model.bin"; // model path
+ std::string model_alias = "unknown"; // model alias
+ std::string prompt = "";
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string lora_adapter = ""; // lora adapter path
- std::string lora_base = ""; // base model path for the lora adapter
+ std::string lora_base = ""; // base model path for the lora adapter
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index bb3fd50a9..03603b10f 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -6,7 +6,6 @@
int main(int argc, char ** argv) {
gpt_params params;
- params.model = "models/llama-7B/ggml-model.bin";
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
@@ -32,6 +31,8 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}
+ llama_init_backend();
+
llama_context * ctx;
// load the model
diff --git a/examples/main/README.md b/examples/main/README.md
index 7c03f92c8..dd0874977 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -69,8 +69,8 @@ In this section, we cover the most commonly used options for running the `main`
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
-- `-n N, --n_predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-- `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
## Input Prompts
@@ -136,9 +136,9 @@ During text generation, LLaMA models have a limited context size, which means th
### Context Size
-The `--ctx_size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
+The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
-- `-c N, --ctx_size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
+- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
### Keep Prompt
@@ -146,7 +146,7 @@ The `--keep` option allows users to retain the original prompt when the model ru
- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
-By utilizing context management options like `--ctx_size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
+By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
## Generation Flags
@@ -154,11 +154,11 @@ The following options allow you to control the text generation process and fine-
### Number of Tokens to Predict
-- `-n N, --n_predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+- `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
-The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
### Temperature
@@ -170,33 +170,33 @@ Example usage: `--temp 0.5`
### Repeat Penalty
-- `--repeat_penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
-- `--repeat_last_n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx_size).
+- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
+- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
-The `repeat_penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
+The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
-The `repeat_last_n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx_size`).
+The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
-Example usage: `--repeat_penalty 1.15 --repeat_last_n 128 --no-penalize-nl`
+Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
### Top-K Sampling
-- `--top_k N`: Limit the next token selection to the K most probable tokens (default: 40).
+- `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40).
-Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
+Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
-Example usage: `--top_k 30`
+Example usage: `--top-k 30`
### Top-P Sampling
-- `--top_p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
+- `--top-p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
-Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
+Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
-Example usage: `--top_p 0.95`
+Example usage: `--top-p 0.95`
### Tail Free Sampling (TFS)
@@ -217,16 +217,16 @@ Example usage: `--typical 0.9`
### Mirostat Sampling
- `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
-- `--mirostat_lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
-- `--mirostat_ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
+- `--mirostat-lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
+- `--mirostat-ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps).
-The `--mirostat_lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
+The `--mirostat-lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
-The `--mirostat_ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
+The `--mirostat-ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
-Example usage: `--mirostat 2 --mirostat_lr 0.05 --mirostat_ent 3.0`
+Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
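
As a rough illustration of the feedback loop described above (not the implementation in this repository), the sketch below follows the Mirostat 2.0 idea: drop candidates whose surprise exceeds a running threshold `mu`, sample from the rest, then nudge `mu` toward the target entropy `tau` at rate `eta`. The struct and function names are invented for the example.

```cpp
#include <cmath>
#include <cstddef>
#include <random>
#include <vector>

struct MirostatState {
    float tau = 5.0f;   // target surprise in bits (--mirostat-ent)
    float eta = 0.1f;   // learning rate (--mirostat-lr)
    float mu  = 10.0f;  // running threshold, conventionally initialized to 2 * tau
};

std::size_t mirostat_v2_step(const std::vector<float> & probs, MirostatState & st, std::mt19937 & rng) {
    // keep only candidates whose surprise -log2(p) stays below the threshold mu
    std::vector<std::size_t> idx;
    std::vector<float> kept;
    for (std::size_t i = 0; i < probs.size(); ++i) {
        if (probs[i] > 0.0f && -std::log2(probs[i]) <= st.mu) {
            idx.push_back(i);
            kept.push_back(probs[i]);
        }
    }
    if (idx.empty()) {  // threshold excluded everything: fall back to the single most likely token
        std::size_t best = 0;
        for (std::size_t i = 1; i < probs.size(); ++i) if (probs[i] > probs[best]) best = i;
        idx.push_back(best);
        kept.push_back(probs[best]);
    }
    std::discrete_distribution<std::size_t> dist(kept.begin(), kept.end());
    const std::size_t chosen = idx[dist(rng)];

    // feedback: move mu so that the observed surprise tracks the target tau
    const float surprise = -std::log2(probs[chosen]);
    st.mu -= st.eta * (surprise - st.tau);
    return chosen;
}
```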
### Logit Bias
@@ -264,15 +264,15 @@ These options help improve the performance and memory usage of the LLaMA models.
### Memory Float 32
-- `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage.
+- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
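
A back-of-the-envelope check of the "doubles the context memory" point, using an assumed LLaMA-7B-like shape (32 layers, 4096-dim embeddings) and a 2048-token context; these numbers are illustrative assumptions, not values read from a model file.

```cpp
#include <cstdio>

int main() {
    const long long n_layer = 32, n_embd = 4096, n_ctx = 2048;
    const long long f16_bytes = 2, f32_bytes = 4;
    // K and V each hold n_layer * n_ctx * n_embd elements, so in total:
    const long long elems = 2 * n_layer * n_ctx * n_embd;
    std::printf("f16 KV cache: %lld MiB\n", elems * f16_bytes / (1024 * 1024)); // ~1024 MiB
    std::printf("f32 KV cache: %lld MiB\n", elems * f32_bytes / (1024 * 1024)); // ~2048 MiB
    return 0;
}
```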
### Batch Size
-- `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+- `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
### Prompt Caching
-- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.
+- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
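
The note above corresponds to how the restore path behaves in `main`: the cached tokens are loaded back, but the sampler's random state is simply re-seeded rather than restored, as the diff below shows. Here is a rough sketch of that pattern, assuming the session API declared in `llama.h` at this point in time (`llama_load_session_file`, `llama_set_rng_seed`); the wrapper function itself is hypothetical.

```cpp
#include "llama.h"

#include <string>
#include <vector>

// Try to restore cached prompt tokens from `path`; returns an empty vector if
// there is no usable cache, in which case the caller re-evaluates the prompt.
static std::vector<llama_token> try_restore_session(llama_context * ctx, const std::string & path,
                                                    int n_ctx, int seed) {
    std::vector<llama_token> tokens(n_ctx);
    size_t n_loaded = 0;
    if (!llama_load_session_file(ctx, path.c_str(), tokens.data(), tokens.size(), &n_loaded)) {
        return {};
    }
    tokens.resize(n_loaded);
    // fresh RNG state: generations may still diverge from the original run
    llama_set_rng_seed(ctx, seed);
    return tokens;
}
```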
### Quantization
@@ -285,5 +285,6 @@ These options provide extra functionality and customization when running the LLa
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text.
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
+- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 8543414dd..6131f5b46 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -50,7 +50,6 @@ void sigint_handler(int signo) {
int main(int argc, char ** argv) {
gpt_params params;
- params.model = "models/llama-7B/ggml-model.bin";
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
@@ -97,8 +96,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}
-// params.prompt = R"(// this function checks if the number n is prime
-//bool is_prime(int n) {)";
+ llama_init_backend();
llama_context * ctx;
g_ctx = &ctx;
@@ -136,8 +134,6 @@ int main(int argc, char ** argv) {
return 0;
}
- // Add a space in front of the first character to match OG llama tokenizer behavior
- params.prompt.insert(0, 1, ' ');
std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;
@@ -157,6 +153,7 @@ int main(int argc, char ** argv) {
return 1;
}
session_tokens.resize(n_token_count_out);
+ llama_set_rng_seed(ctx, params.seed);
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
} else {
@@ -165,7 +162,16 @@ int main(int argc, char ** argv) {
}
// tokenize the prompt
- auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+ std::vector<llama_token> embd_inp;
+
+ if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
+ // Add a space in front of the first character to match OG llama tokenizer behavior
+ params.prompt.insert(0, 1, ' ');
+
+ embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+ } else {
+ embd_inp = session_tokens;
+ }
const int n_ctx = llama_n_ctx(ctx);
@@ -183,7 +189,9 @@ int main(int argc, char ** argv) {
}
n_matching_session_tokens++;
}
- if (n_matching_session_tokens >= embd_inp.size()) {
+ if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
+ fprintf(stderr, "%s: using full prompt from session file\n", __func__);
+ } else if (n_matching_session_tokens >= embd_inp.size()) {
fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
@@ -209,8 +217,8 @@ int main(int argc, char ** argv) {
params.antiprompt.push_back("### Instruction:\n\n");
}
- // enable interactive mode if reverse prompt or interactive start is specified
- if (params.antiprompt.size() != 0 || params.interactive_first) {
+ // enable interactive mode if interactive start is specified
+ if (params.interactive_first) {
params.interactive = true;
}
@@ -242,7 +250,7 @@ int main(int argc, char ** argv) {
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
- auto console_ctrl_handler = [](DWORD ctrl_type) -> BOOL {
+ auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
@@ -306,7 +314,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
- while (n_remain != 0 || params.interactive) {
+ while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (embd.size() > 0) {
// infinite text generation via context swapping
@@ -352,6 +360,12 @@ int main(int argc, char ** argv) {
}
}
if (i > 0) {
+ // check if we've used up all the prompt but not all cached tokens
+ if (embd.size() == i && n_session_consumed < (int) session_tokens.size()) {
+ // force re-evaluation of the last token to recalculate logits
+ i--;
+ n_past--;
+ }
embd.erase(embd.begin(), embd.begin() + i);
}
}
@@ -504,9 +518,8 @@ int main(int argc, char ** argv) {
console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
}
- // in interactive mode, and not currently processing queued inputs;
- // check if we should prompt the user for more
- if (params.interactive && (int) embd_inp.size() <= n_consumed) {
+ // if not currently processing queued inputs;
+ if ((int) embd_inp.size() <= n_consumed) {
// check for reverse prompt
if (params.antiprompt.size()) {
@@ -517,10 +530,21 @@ int main(int argc, char ** argv) {
is_antiprompt = false;
// Check if each of the reverse prompts appears at the end of the output.
+ // If we're not running interactively, the reverse prompt might be tokenized with some following characters
+ // so we'll compensate for that by widening the search window a bit.
for (std::string & antiprompt : params.antiprompt) {
- if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
- is_interacting = true;
+ size_t extra_padding = params.interactive ? 0 : 2;
+ size_t search_start_pos = last_output.length() > static_cast<size_t>(antiprompt.length() + extra_padding)
+ ? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
+ : 0;
+
+ if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
+ if (params.interactive) {
+ is_interacting = true;
+ console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
+ }
is_antiprompt = true;
+ fflush(stdout);
break;
}
}
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 9212dee5c..e19c6825f 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -116,7 +116,6 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) {
gpt_params params;
- params.model = "models/llama-7B/ggml-model.bin";
params.n_batch = 512;
if (gpt_params_parse(argc, argv, params) == false) {
@@ -144,6 +143,8 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}
+ llama_init_backend();
+
llama_context * ctx;
// load the model and apply lora adapter, if any
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 9a2aa7c64..085fdde3c 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -321,7 +321,6 @@ int main(int argc, char ** argv) {
auto lparams = llama_context_default_params();
lparams.n_ctx = 256;
- lparams.n_parts = 1;
lparams.seed = 1;
lparams.f16_kv = false;
lparams.use_mlock = false;
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 115d8fb1b..769dd36a4 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,7 +1,7 @@
-#include "ggml.h"
-#include "llama.h"
#include "build-info.h"
+#include "llama.h"
+
#include <cstdio>
#include <map>