diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 945df42f8..0e7643bba 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -15,14 +15,133 @@ on:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_N_THREADS: 1
jobs:
+ macOS-latest-cmake-arm64:
+ runs-on: macos-14
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+
+ - name: Dependencies
+ id: depends
+ continue-on-error: true
+ run: |
+ brew update
+
+ - name: Build
+ id: cmake_build
+ run: |
+ sysctl -a
+ mkdir build
+ cd build
+ cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+ cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L main --verbose --timeout 900
+
+ - name: Determine tag name
+ id: tag
+ shell: bash
+ run: |
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+ else
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+ fi
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ cp LICENSE ./build/bin/
+ zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v3
+ with:
+ path: |
+ llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+
+ macOS-latest-cmake-x64:
+ runs-on: macos-latest
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+
+ - name: Dependencies
+ id: depends
+ continue-on-error: true
+ run: |
+ brew update
+
+ - name: Build
+ id: cmake_build
+ run: |
+ sysctl -a
+ mkdir build
+ cd build
+ cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+ cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L main --verbose --timeout 900
+
+ - name: Determine tag name
+ id: tag
+ shell: bash
+ run: |
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+ else
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+ fi
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ cp LICENSE ./build/bin/
+ zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v3
+ with:
+ path: |
+ llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+
ubuntu-focal-make:
runs-on: ubuntu-20.04
+ env:
+ LLAMA_NODE_AVAILABLE: true
+ LLAMA_PYTHON_AVAILABLE: true
steps:
- name: Clone
@@ -35,6 +154,14 @@ jobs:
sudo apt-get update
sudo apt-get install build-essential gcc-8
+ - uses: actions/setup-node@v4
+ with:
+ node-version: "20"
+
+ - uses: actions/setup-python@v4
+ with:
+ python-version: "3.11"
+
- name: Build
id: make_build
env:
@@ -98,40 +225,51 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
- ubuntu-latest-cmake-sanitizer:
- runs-on: ubuntu-latest
-
- continue-on-error: true
-
- strategy:
- matrix:
- sanitizer: [ADDRESS, THREAD, UNDEFINED]
- build_type: [Debug, Release]
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
-
- - name: Dependencies
- id: depends
- run: |
- sudo apt-get update
- sudo apt-get install build-essential
-
- - name: Build
- id: cmake_build
- run: |
- mkdir build
- cd build
- cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
- cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
- - name: Test
- id: cmake_test
+ - name: Test llama2c conversion
+ id: llama2c_test
run: |
cd build
- ctest -L main --verbose --timeout 900
+ echo "Fetch tokenizer"
+ wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+ echo "Fetch llama2c model"
+ wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+ ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+ ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+# ubuntu-latest-cmake-sanitizer:
+# runs-on: ubuntu-latest
+#
+# continue-on-error: true
+#
+# strategy:
+# matrix:
+# sanitizer: [ADDRESS, THREAD, UNDEFINED]
+# build_type: [Debug, Release]
+#
+# steps:
+# - name: Clone
+# id: checkout
+# uses: actions/checkout@v3
+#
+# - name: Dependencies
+# id: depends
+# run: |
+# sudo apt-get update
+# sudo apt-get install build-essential
+#
+# - name: Build
+# id: cmake_build
+# run: |
+# mkdir build
+# cd build
+# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+# cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+#
+# - name: Test
+# id: cmake_test
+# run: |
+# cd build
+# ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-mpi:
runs-on: ubuntu-latest
@@ -662,6 +800,7 @@ jobs:
windows-latest-cmake-sycl:
runs-on: windows-latest
+
defaults:
run:
shell: bash
@@ -670,7 +809,6 @@ jobs:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
-
steps:
- name: Clone
id: checkout
@@ -685,6 +823,32 @@ jobs:
id: cmake_build
run: examples/sycl/win-build-sycl.bat
+ - name: Determine tag name
+ id: tag
+ shell: bash
+ run: |
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+ else
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+ fi
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip .\build\bin\*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v3
+ with:
+ path: |
+ llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+
ios-xcode-build:
runs-on: macos-latest
@@ -748,6 +912,8 @@ jobs:
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-cublas
+ - macOS-latest-cmake-arm64
+ - macOS-latest-cmake-x64
steps:
- name: Clone
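For reference, the "Determine tag name" step added to the new macOS and SYCL jobs above derives the artifact name purely from git metadata. A minimal local sketch of the same logic (assuming a checkout with full history, since a shallow clone would make the commit count meaningless; the branch name below is hypothetical):

```sh
# Illustrative only: how the workflow's tag name is composed
BUILD_NUMBER="$(git rev-list --count HEAD)"     # commits reachable from HEAD
SHORT_HASH="$(git rev-parse --short=7 HEAD)"    # 7-character commit hash
BRANCH_NAME="feature/foo"                       # hypothetical branch name
if [[ "${BRANCH_NAME}" == "master" ]]; then
    echo "b${BUILD_NUMBER}"                                                      # e.g. b2450
else
    echo "$(echo "${BRANCH_NAME}" | tr '/' '-')-b${BUILD_NUMBER}-${SHORT_HASH}"  # e.g. feature-foo-b2450-0e7643b
fi
```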
diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml
index 2682f308c..7f21daec0 100644
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -12,12 +12,12 @@ jobs:
steps:
- uses: actions/stale@v5
with:
+ exempt-issue-labels: "refactor,help wanted,good first issue,research"
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"
- stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
- operations-per-run: 1000
+ operations-per-run: 10000
repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml
index 392db8a08..4112518bb 100644
--- a/.github/workflows/code-coverage.yml
+++ b/.github/workflows/code-coverage.yml
@@ -5,6 +5,10 @@ env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
run:
runs-on: ubuntu-20.04
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 94f9161fc..9591bfc2a 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -15,6 +15,10 @@ on:
branches:
- master
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
push_to_registry:
name: Push Docker image to Docker Hub
diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml
index 0e0993cd4..7b2a00c90 100644
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -14,6 +14,10 @@ on:
branches:
- master
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
editorconfig:
runs-on: ubuntu-latest
diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml
index 8d0a3fd7f..109a793ea 100644
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -17,6 +17,10 @@ on:
types: [opened, synchronize, reopened]
paths: ['**/*.nix', 'flake.lock']
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
nix-build-aarch64:
runs-on: ubuntu-latest
diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
index 01c5a9d5a..8b5b99c8f 100644
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -8,6 +8,10 @@ on:
pull_request:
types: [opened, synchronize, reopened]
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
nix-eval:
strategy:
diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml
index b82205992..4092b12fa 100644
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -16,6 +16,10 @@ on:
- 'requirements.txt'
- 'requirements/*.txt'
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
python-check-requirements:
runs-on: ubuntu-latest
diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml
index ea0a05ea1..4bdd79c4a 100644
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -2,6 +2,10 @@ name: flake8 Lint
on: [push, pull_request]
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
flake8-lint:
runs-on: ubuntu-latest
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 4ea09115a..f07d25536 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -18,20 +18,23 @@ on:
schedule:
- cron: '0 0 * * *'
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
server:
runs-on: ubuntu-latest
strategy:
matrix:
- sanitizer: [ADDRESS, THREAD, UNDEFINED]
+ # TODO: temporarily disabled due to Linux kernel issues
+ #sanitizer: [ADDRESS, THREAD, UNDEFINED]
+ sanitizer: [UNDEFINED]
build_type: [Debug]
include:
- build_type: Release
sanitizer: ""
- - build_type: Debug
- sanitizer: THREAD
- disabled_on_pr: true
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
container:
diff --git a/.github/workflows/zig-build.yml b/.github/workflows/zig-build.yml
index 68a698ab9..cb43954eb 100644
--- a/.github/workflows/zig-build.yml
+++ b/.github/workflows/zig-build.yml
@@ -6,6 +6,10 @@ on:
branches:
- master
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
build:
strategy:
diff --git a/.gitignore b/.gitignore
index 1ad8d929b..9fb5b80c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,10 @@
*.gcda
*.dot
*.bat
+*.tmp
*.metallib
+*.etag
+*.lastModified
.DS_Store
.build/
.cache/
@@ -47,6 +50,7 @@ models-mnt
/embedding
/gguf
/gguf-llama-simple
+/gguf-split
/gritlm
/imatrix
/infill
@@ -55,6 +59,9 @@ models-mnt
/llava-cli
/lookahead
/lookup
+/lookup-create
+/lookup-merge
+/lookup-stats
/main
/metal
/passkey
@@ -70,6 +77,7 @@ models-mnt
/batched-bench
/export-lora
/finetune
+/retrieval
/speculative
/parallel
/train-text-from-scratch
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc4cff28f..3333ee1c9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
+option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
@@ -387,6 +388,9 @@ if (LLAMA_CUBLAS)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
+ if (LLAMA_CUDA_NO_PEER_COPY)
+ add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+ endif()
if (LLAMA_STATIC)
if (WIN32)
@@ -531,6 +535,10 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
+ if (LLAMA_CUDA_NO_PEER_COPY)
+ add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+ endif()
+
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
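The new `LLAMA_CUDA_NO_PEER_COPY` option above simply forwards the `GGML_CUDA_NO_PEER_COPY` compile definition for both the CUDA and HIP paths. A minimal sketch of enabling it in an out-of-tree build (the option names come from the diff; the rest of the command line is illustrative):

```sh
mkdir build && cd build
cmake .. -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_NO_PEER_COPY=ON   # skip peer-to-peer copies between GPUs
cmake --build . --config Release -j "$(nproc)"
```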
diff --git a/Makefile b/Makefile
index 838daf5c0..130fde838 100644
--- a/Makefile
+++ b/Makefile
@@ -1,15 +1,16 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
- speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+ simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = \
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
- tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
+ tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
+ tests/test-json-schema-to-grammar
# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -451,9 +452,9 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
else
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-#ifdef LLAMA_CUDA_CUBLAS
-# MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
-#endif # LLAMA_CUDA_CUBLAS
+ifdef LLAMA_CUDA_NO_PEER_COPY
+ MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
ifdef LLAMA_CUDA_CCBIN
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
@@ -534,6 +535,9 @@ endif # LLAMA_HIP_UMA
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_NO_PEER_COPY
+ HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
OBJS += ggml-cuda.o
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
@@ -666,9 +670,15 @@ console.o: common/console.cpp common/console.h
grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
$(CXX) $(CXXFLAGS) -c $< -o $@
+json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
train.o: common/train.cpp common/train.h
$(CXX) $(CXXFLAGS) -c $< -o $@
+ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
@@ -676,7 +686,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
find examples pocs -type f -name "*.o" -delete
#
@@ -745,7 +755,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -753,6 +763,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -790,6 +804,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -802,9 +820,15 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -861,6 +885,10 @@ tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
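With the Makefile changes above, the newly added binaries and the JSON-schema test target can be built directly. A small sketch, assuming a plain CPU build:

```sh
make gguf-split retrieval                 # new example binaries
make lookup                               # also produces lookup-create, lookup-merge, lookup-stats
make tests/test-json-schema-to-grammar    # new entry in TEST_TARGETS
```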
diff --git a/README-sycl.md b/README-sycl.md
index 9359a9490..cbf14f2da 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -29,6 +29,8 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
## News
- 2024.3
+ - A blog post has been published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
+ - A new baseline is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
- Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
- Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
- Support detecting all GPUs with level-zero and same top **Max compute units**.
@@ -81,7 +83,7 @@ For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, rec
|-|-|-|
|Ampere Series| Support| A100|
-### oneMKL
+### oneMKL for CUDA
The current oneMKL release does not contain the oneMKL cuBlas backend.
As a result for Nvidia GPU's oneMKL must be built from source.
@@ -114,7 +116,7 @@ You can choose between **F16** and **F32** build. F16 is faster for long-prompt
# Or, for F32:
docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
-# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
+# Note: you can also use the ".devops/server-intel.Dockerfile", which compiles the "server" example
```
### Run
@@ -254,16 +256,16 @@ Run without parameter:
Check the ID in startup log, like:
```
-found 4 SYCL devices:
- Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
- max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
- Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
- max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
- Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
- max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
- Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
- max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
-
+found 6 SYCL devices:
+| | | |Compute |Max compute|Max work|Max sub| |
+|ID| Device Type| Name|capability|units |group |group |Global mem size|
+|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
+| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
+| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
+| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
+| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
+| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
+| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
|Attribute|Note|
@@ -271,12 +273,35 @@ found 4 SYCL devices:
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
-4. Set device ID and execute llama.cpp
+4. Device selection and execution of llama.cpp
-Set device ID = 0 by **GGML_SYCL_DEVICE=0**
+There are two device selection modes:
+
+- Single device: use one device, assigned by the user.
+- Multiple devices: automatically choose all devices that share the largest Max compute units value.
+
+|Device selection|Parameter|
+|-|-|
+|Single device|--split-mode none --main-gpu DEVICE_ID |
+|Multiple devices|--split-mode layer (default)|
+
+Examples:
+
+- Use device 0:
```sh
-GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+```
+or run by script:
+
+```sh
+./examples/sycl/run_llama2.sh 0
+```
+
+- Use multiple devices:
+
+```sh
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
or run by script:
@@ -289,12 +314,18 @@ Note:
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
-5. Check the device ID in output
+5. Verify the device ID in the output
+
+Verify that the selected GPU is shown in the output, for example:
-Like:
```
-Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
+detect 1 SYCL GPUs: [0] with top Max compute units:512
```
+Or
+```
+use 1 SYCL GPUs: [0] with Max compute units:512
+```
+
## Windows
@@ -355,7 +386,7 @@ a. Download & install cmake for Windows: https://cmake.org/download/
b. Download & install mingw-w64 make for Windows provided by w64devkit
-- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
+- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
- Extract `w64devkit` on your pc.
@@ -430,15 +461,16 @@ build\bin\main.exe
Check the ID in startup log, like:
```
-found 4 SYCL devices:
- Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
- max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
- Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
- max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
- Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
- max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
- Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
- max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+found 6 SYCL devices:
+| | | |Compute |Max compute|Max work|Max sub| |
+|ID| Device Type| Name|capability|units |group |group |Global mem size|
+|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
+| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
+| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
+| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
+| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
+| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
+| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
@@ -447,13 +479,31 @@ found 4 SYCL devices:
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
-4. Set device ID and execute llama.cpp
-Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
+4. Device selection and execution of llama.cpp
+
+There are two device selection modes:
+
+- Single device: use one device, assigned by the user.
+- Multiple devices: automatically choose all devices that share the largest Max compute units value.
+
+|Device selection|Parameter|
+|-|-|
+|Single device|--split-mode none --main-gpu DEVICE_ID |
+|Multiple devices|--split-mode layer (default)|
+
+Examples:
+
+- Use device 0:
```
-set GGML_SYCL_DEVICE=0
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
+build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+```
+
+- Use multiple devices:
+
+```
+build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
or run by script:
@@ -466,11 +516,17 @@ Note:
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
-5. Check the device ID in output
-Like:
+5. Verify the device ID in the output
+
+Verify that the selected GPU is shown in the output, for example:
+
```
-Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
+detect 1 SYCL GPUs: [0] with top Max compute units:512
+```
+Or
+```
+use 1 SYCL GPUs: [0] with Max compute units:512
```
## Environment Variable
@@ -489,7 +545,6 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|Name|Value|Function|
|-|-|-|
-|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer|
@@ -507,6 +562,9 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
## Q&A
+Note: please add the prefix **[SYCL]** to the issue title, so that we can check it as soon as possible.
+
+
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
Miss to enable oneAPI running environment.
@@ -538,4 +596,4 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
## Todo
-- Support multiple cards.
+- Support the row split mode for multiple card runs.
diff --git a/README.md b/README.md
index c2f3342f0..f9cf19616 100644
--- a/README.md
+++ b/README.md
@@ -17,10 +17,12 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics
+- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
+- Support for loading sharded models, using the `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187
----
@@ -165,6 +167,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
diff --git a/build.zig b/build.zig
index 90609359b..a1de7083a 100644
--- a/build.zig
+++ b/build.zig
@@ -122,6 +122,7 @@ pub fn build(b: *std.build.Builder) !void {
const console = make.obj("console", "common/console.cpp");
const sampling = make.obj("sampling", "common/sampling.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
+ const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
const train = make.obj("train", "common/train.cpp");
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");
@@ -133,7 +134,7 @@ pub fn build(b: *std.build.Builder) !void {
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
- const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, clip, llava });
+ const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index af2629a46..1d840e5f7 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -47,6 +47,8 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
+set(TARGET json-schema-to-grammar)
+add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h)
set(TARGET common)
@@ -60,8 +62,11 @@ add_library(${TARGET} STATIC
console.cpp
grammar-parser.h
grammar-parser.cpp
+ json.hpp
train.h
train.cpp
+ ngram-cache.h
+ ngram-cache.cpp
)
if (BUILD_SHARED_LIBS)
diff --git a/common/common.cpp b/common/common.cpp
index 2f5d965d6..9dec08430 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -39,6 +39,9 @@
#endif
#if defined(LLAMA_USE_CURL)
#include
+#include
+#include
+#include
#endif
#if defined(_MSC_VER)
@@ -61,7 +64,7 @@
#else
#include
#endif
-#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX
+#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
#endif // LLAMA_USE_CURL
@@ -101,7 +104,7 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
-void process_escapes(std::string& input) {
+void process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
@@ -154,6 +157,1072 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return result;
}
+bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
+ llama_sampling_params& sparams = params.sparams;
+
+ if (arg == "-s" || arg == "--seed") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.seed = std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "-t" || arg == "--threads") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_threads = std::stoi(argv[i]);
+ if (params.n_threads <= 0) {
+ params.n_threads = std::thread::hardware_concurrency();
+ }
+ return true;
+ }
+ if (arg == "-tb" || arg == "--threads-batch") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_threads_batch = std::stoi(argv[i]);
+ if (params.n_threads_batch <= 0) {
+ params.n_threads_batch = std::thread::hardware_concurrency();
+ }
+ return true;
+ }
+ if (arg == "-td" || arg == "--threads-draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_threads_draft = std::stoi(argv[i]);
+ if (params.n_threads_draft <= 0) {
+ params.n_threads_draft = std::thread::hardware_concurrency();
+ }
+ return true;
+ }
+ if (arg == "-tbd" || arg == "--threads-batch-draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_threads_batch_draft = std::stoi(argv[i]);
+ if (params.n_threads_batch_draft <= 0) {
+ params.n_threads_batch_draft = std::thread::hardware_concurrency();
+ }
+ return true;
+ }
+ if (arg == "-p" || arg == "--prompt") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.prompt = argv[i];
+ return true;
+ }
+ if (arg == "-e" || arg == "--escape") {
+ params.escape = true;
+ return true;
+ }
+ if (arg == "--prompt-cache") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.path_prompt_cache = argv[i];
+ return true;
+ }
+ if (arg == "--prompt-cache-all") {
+ params.prompt_cache_all = true;
+ return true;
+ }
+ if (arg == "--prompt-cache-ro") {
+ params.prompt_cache_ro = true;
+ return true;
+ }
+ if (arg == "-bf" || arg == "--binary-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i], std::ios::binary);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ // store the external file name in params
+ params.prompt_file = argv[i];
+ std::ostringstream ss;
+ ss << file.rdbuf();
+ params.prompt = ss.str();
+ fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
+ return true;
+ }
+ if (arg == "-f" || arg == "--file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i]);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ // store the external file name in params
+ params.prompt_file = argv[i];
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+ if (!params.prompt.empty() && params.prompt.back() == '\n') {
+ params.prompt.pop_back();
+ }
+ return true;
+ }
+ if (arg == "-n" || arg == "--n-predict") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_predict = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--top-k") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.top_k = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-c" || arg == "--ctx-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_ctx = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--grp-attn-n" || arg == "-gan") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.grp_attn_n = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--grp-attn-w" || arg == "-gaw") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.grp_attn_w = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--rope-freq-base") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.rope_freq_base = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--rope-freq-scale") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.rope_freq_scale = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--rope-scaling") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::string value(argv[i]);
+ /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
+ else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
+ else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
+ else { invalid_param = true; }
+ return true;
+ }
+ if (arg == "--rope-scale") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.rope_freq_scale = 1.0f / std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--yarn-orig-ctx") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.yarn_orig_ctx = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--yarn-ext-factor") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.yarn_ext_factor = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--yarn-attn-factor") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.yarn_attn_factor = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--yarn-beta-fast") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.yarn_beta_fast = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--yarn-beta-slow") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.yarn_beta_slow = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--pooling") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::string value(argv[i]);
+ /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+ else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+ else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+ else { invalid_param = true; }
+ return true;
+ }
+ if (arg == "--defrag-thold" || arg == "-dt") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.defrag_thold = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--samplers") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ const auto sampler_names = string_split(argv[i], ';');
+ sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
+ return true;
+ }
+ if (arg == "--sampling-seq") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
+ return true;
+ }
+ if (arg == "--top-p") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.top_p = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--min-p") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.min_p = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--temp") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.temp = std::stof(argv[i]);
+ sparams.temp = std::max(sparams.temp, 0.0f);
+ return true;
+ }
+ if (arg == "--tfs") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.tfs_z = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--typical") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.typical_p = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--repeat-last-n") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.penalty_last_n = std::stoi(argv[i]);
+ sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
+ return true;
+ }
+ if (arg == "--repeat-penalty") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.penalty_repeat = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--frequency-penalty") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.penalty_freq = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--presence-penalty") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.penalty_present = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--dynatemp-range") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.dynatemp_range = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--dynatemp-exp") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.dynatemp_exponent = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--mirostat") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.mirostat = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--mirostat-lr") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.mirostat_eta = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--mirostat-ent") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.mirostat_tau = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--cfg-negative-prompt") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.cfg_negative_prompt = argv[i];
+ return true;
+ }
+ if (arg == "--cfg-negative-prompt-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i]);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
+ if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
+ sparams.cfg_negative_prompt.pop_back();
+ }
+ return true;
+ }
+ if (arg == "--cfg-scale") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.cfg_scale = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "-b" || arg == "--batch-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_batch = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-ub" || arg == "--ubatch-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_ubatch = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--keep") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_keep = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_draft = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--chunks") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_chunks = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-np" || arg == "--parallel") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_parallel = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-ns" || arg == "--sequences") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_sequences = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--p-split" || arg == "-ps") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.p_split = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "-m" || arg == "--model") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.model = argv[i];
+ return true;
+ }
+ if (arg == "-md" || arg == "--model-draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.model_draft = argv[i];
+ return true;
+ }
+ if (arg == "-a" || arg == "--alias") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.model_alias = argv[i];
+ return true;
+ }
+ if (arg == "-mu" || arg == "--model-url") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.model_url = argv[i];
+ return true;
+ }
+ if (arg == "-hfr" || arg == "--hf-repo") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.hf_repo = argv[i];
+ return true;
+ }
+ if (arg == "-hff" || arg == "--hf-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.hf_file = argv[i];
+ return true;
+ }
+ if (arg == "--lora") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.lora_adapter.emplace_back(argv[i], 1.0f);
+ params.use_mmap = false;
+ return true;
+ }
+ if (arg == "--lora-scaled") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ const char* lora_adapter = argv[i];
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+ params.use_mmap = false;
+ return true;
+ }
+ if (arg == "--lora-base") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.lora_base = argv[i];
+ return true;
+ }
+ if (arg == "--control-vector") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.control_vectors.push_back({ 1.0f, argv[i], });
+ return true;
+ }
+ if (arg == "--control-vector-scaled") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ const char* fname = argv[i];
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.control_vectors.push_back({ std::stof(argv[i]), fname, });
+ return true;
+ }
+ if (arg == "--control-vector-layer-range") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.control_vector_layer_start = std::stoi(argv[i]);
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.control_vector_layer_end = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--mmproj") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.mmproj = argv[i];
+ return true;
+ }
+ if (arg == "--image") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.image = argv[i];
+ return true;
+ }
+ if (arg == "-i" || arg == "--interactive") {
+ params.interactive = true;
+ return true;
+ }
+ if (arg == "--embedding") {
+ params.embedding = true;
+ return true;
+ }
+ if (arg == "--interactive-first") {
+ params.interactive_first = true;
+ return true;
+ }
+ if (arg == "-ins" || arg == "--instruct") {
+ params.instruct = true;
+ return true;
+ }
+ if (arg == "-cml" || arg == "--chatml") {
+ params.chatml = true;
+ return true;
+ }
+ if (arg == "--infill") {
+ params.infill = true;
+ return true;
+ }
+ if (arg == "-dkvc" || arg == "--dump-kv-cache") {
+ params.dump_kv_cache = true;
+ return true;
+ }
+ if (arg == "-nkvo" || arg == "--no-kv-offload") {
+ params.no_kv_offload = true;
+ return true;
+ }
+ if (arg == "-ctk" || arg == "--cache-type-k") {
+ params.cache_type_k = argv[++i];
+ return true;
+ }
+ if (arg == "-ctv" || arg == "--cache-type-v") {
+ params.cache_type_v = argv[++i];
+ return true;
+ }
+ if (arg == "--multiline-input") {
+ params.multiline_input = true;
+ return true;
+ }
+ if (arg == "--simple-io") {
+ params.simple_io = true;
+ return true;
+ }
+ if (arg == "-cb" || arg == "--cont-batching") {
+ params.cont_batching = true;
+ return true;
+ }
+ if (arg == "--color") {
+ params.use_color = true;
+ return true;
+ }
+ if (arg == "--mlock") {
+ params.use_mlock = true;
+ return true;
+ }
+ if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_gpu_layers = std::stoi(argv[i]);
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+ }
+ return true;
+ }
+ if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_gpu_layers_draft = std::stoi(argv[i]);
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+ }
+ return true;
+ }
+ if (arg == "--main-gpu" || arg == "-mg") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.main_gpu = std::stoi(argv[i]);
+#ifndef GGML_USE_CUBLAS_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
+ return true;
+ }
+ if (arg == "--split-mode" || arg == "-sm") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::string arg_next = argv[i];
+ if (arg_next == "none") {
+ params.split_mode = LLAMA_SPLIT_MODE_NONE;
+ }
+ else if (arg_next == "layer") {
+ params.split_mode = LLAMA_SPLIT_MODE_LAYER;
+ }
+ else if (arg_next == "row") {
+#ifdef GGML_USE_SYCL
+ fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
+ exit(1);
+#endif // GGML_USE_SYCL
+ params.split_mode = LLAMA_SPLIT_MODE_ROW;
+ }
+ else {
+ invalid_param = true;
+ return true;
+ }
+#ifndef GGML_USE_CUBLAS_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
+ return true;
+ }
+ if (arg == "--tensor-split" || arg == "-ts") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::string arg_next = argv[i];
+
+ // split string by , and /
+ const std::regex regex{ R"([,/]+)" };
+ std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+ std::vector<std::string> split_arg{ it, {} };
+ if (split_arg.size() >= llama_max_devices()) {
+ invalid_param = true;
+ return true;
+ }
+ for (size_t i = 0; i < llama_max_devices(); ++i) {
+ if (i < split_arg.size()) {
+ params.tensor_split[i] = std::stof(split_arg[i]);
+ }
+ else {
+ params.tensor_split[i] = 0.0f;
+ }
+ }
+#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
+ return true;
+ }
+ if (arg == "--no-mmap") {
+ params.use_mmap = false;
+ return true;
+ }
+ if (arg == "--numa") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::string value(argv[i]);
+ /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+ else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+ else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+ else { invalid_param = true; }
+ return true;
+ }
+ if (arg == "--verbose-prompt") {
+ params.verbose_prompt = true;
+ return true;
+ }
+ if (arg == "--no-display-prompt") {
+ params.display_prompt = false;
+ return true;
+ }
+ if (arg == "-r" || arg == "--reverse-prompt") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.antiprompt.emplace_back(argv[i]);
+ return true;
+ }
+ if (arg == "-ld" || arg == "--logdir") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.logdir = argv[i];
+
+ if (params.logdir.back() != DIRECTORY_SEPARATOR) {
+ params.logdir += DIRECTORY_SEPARATOR;
+ }
+ return true;
+ }
+ if (arg == "-lcs" || arg == "--lookup-cache-static") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.lookup_cache_static = argv[i];
+ return true;
+ }
+ if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.lookup_cache_dynamic = argv[i];
+ return true;
+ }
+ if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.logits_file = argv[i];
+ return true;
+ }
+ if (arg == "--perplexity" || arg == "--all-logits") {
+ params.logits_all = true;
+ return true;
+ }
+ if (arg == "--ppl-stride") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.ppl_stride = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-ptc" || arg == "--print-token-count") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_print = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--ppl-output-type") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.ppl_output_type = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--hellaswag") {
+ params.hellaswag = true;
+ return true;
+ }
+ if (arg == "--hellaswag-tasks") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.hellaswag_tasks = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--winogrande") {
+ params.winogrande = true;
+ return true;
+ }
+ if (arg == "--winogrande-tasks") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.winogrande_tasks = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--multiple-choice") {
+ params.multiple_choice = true;
+ return true;
+ }
+ if (arg == "--multiple-choice-tasks") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.multiple_choice_tasks = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--kl-divergence") {
+ params.kl_divergence = true;
+ return true;
+ }
+ if (arg == "--ignore-eos") {
+ params.ignore_eos = true;
+ return true;
+ }
+ if (arg == "--no-penalize-nl") {
+ sparams.penalize_nl = false;
+ return true;
+ }
+ if (arg == "-l" || arg == "--logit-bias") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::stringstream ss(argv[i]);
+ llama_token key;
+ char sign;
+ std::string value_str;
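+ // expected format: TOKEN_ID followed by '+' or '-' and the bias, e.g. "15043+1" to increase or "15043-1" to decrease the likelihood of token 15043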
+ try {
+ if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
+ sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+ }
+ else {
+ throw std::exception();
+ }
+ }
+ catch (const std::exception&) {
+ invalid_param = true;
+ return true;
+ }
+ return true;
+ }
+ if (arg == "-h" || arg == "--help") {
+ gpt_print_usage(argc, argv, gpt_params());
+ exit(0);
+ }
+ if (arg == "--version") {
+ fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+ fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+ exit(0);
+ }
+ if (arg == "--random-prompt") {
+ params.random_prompt = true;
+ return true;
+ }
+ if (arg == "--in-prefix-bos") {
+ params.input_prefix_bos = true;
+ return true;
+ }
+ if (arg == "--in-prefix") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.input_prefix = argv[i];
+ return true;
+ }
+ if (arg == "--in-suffix") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.input_suffix = argv[i];
+ return true;
+ }
+ if (arg == "--grammar") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.grammar = argv[i];
+ return true;
+ }
+ if (arg == "--grammar-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i]);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ std::copy(
+ std::istreambuf_iterator<char>(file),
+ std::istreambuf_iterator<char>(),
+ std::back_inserter(sparams.grammar)
+ );
+ return true;
+ }
+ if (arg == "--override-kv") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ char* sep = strchr(argv[i], '=');
+ if (sep == nullptr || sep - argv[i] >= 128) {
+ fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ struct llama_model_kv_override kvo;
+ std::strncpy(kvo.key, argv[i], sep - argv[i]);
+ kvo.key[sep - argv[i]] = 0;
+ sep++;
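+ // the value must be prefixed with its type: "int:", "float:" or "bool:"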
+ if (strncmp(sep, "int:", 4) == 0) {
+ sep += 4;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+ kvo.int_value = std::atol(sep);
+ }
+ else if (strncmp(sep, "float:", 6) == 0) {
+ sep += 6;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+ kvo.float_value = std::atof(sep);
+ }
+ else if (strncmp(sep, "bool:", 5) == 0) {
+ sep += 5;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+ if (std::strcmp(sep, "true") == 0) {
+ kvo.bool_value = true;
+ }
+ else if (std::strcmp(sep, "false") == 0) {
+ kvo.bool_value = false;
+ }
+ else {
+ fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ }
+ else {
+ fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ params.kv_overrides.push_back(kvo);
+ return true;
+ }
+#ifndef LOG_DISABLE_LOGS
+ // Parse args for logging parameters
+ if (log_param_single_parse(argv[i])) {
+ // Do nothing, log_param_single_parse automatically does its thing
+ // and returns true if a match was found and parsed.
+ return true;
+ }
+ if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) {
+ // We have a matching known parameter requiring an argument,
+ // now we need to check if there is anything after this argv
+ // and flag invalid_param or parse it.
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) {
+ invalid_param = true;
+ return true;
+ }
+ return true;
+ }
+ // End of Parse args for logging parameters
+#endif // LOG_DISABLE_LOGS
+
+ return false;
+}
+
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
std::string arg;
@@ -166,1034 +1235,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
- bool arg_found = false;
- if (arg == "-s" || arg == "--seed") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.seed = std::stoul(argv[i]);
- }
- if (arg == "-t" || arg == "--threads") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_threads = std::stoi(argv[i]);
- if (params.n_threads <= 0) {
- params.n_threads = std::thread::hardware_concurrency();
- }
- }
- if (arg == "-tb" || arg == "--threads-batch") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_threads_batch = std::stoi(argv[i]);
- if (params.n_threads_batch <= 0) {
- params.n_threads_batch = std::thread::hardware_concurrency();
- }
- }
- if (arg == "-td" || arg == "--threads-draft") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_threads_draft = std::stoi(argv[i]);
- if (params.n_threads_draft <= 0) {
- params.n_threads_draft = std::thread::hardware_concurrency();
- }
- }
- if (arg == "-tbd" || arg == "--threads-batch-draft") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_threads_batch_draft = std::stoi(argv[i]);
- if (params.n_threads_batch_draft <= 0) {
- params.n_threads_batch_draft = std::thread::hardware_concurrency();
- }
- }
- if (arg == "-p" || arg == "--prompt") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.prompt = argv[i];
- }
- if (arg == "-e" || arg == "--escape") {
- arg_found = true;
- params.escape = true;
- }
- if (arg == "--prompt-cache") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.path_prompt_cache = argv[i];
- }
- if (arg == "--prompt-cache-all") {
- arg_found = true;
- params.prompt_cache_all = true;
- }
- if (arg == "--prompt-cache-ro") {
- arg_found = true;
- params.prompt_cache_ro = true;
- }
- if (arg == "-bf" || arg == "--binary-file") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::ifstream file(argv[i], std::ios::binary);
- if (!file) {
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
- invalid_param = true;
- break;
- }
- // store the external file name in params
- params.prompt_file = argv[i];
- std::ostringstream ss;
- ss << file.rdbuf();
- params.prompt = ss.str();
- fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
- }
- if (arg == "-f" || arg == "--file") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::ifstream file(argv[i]);
- if (!file) {
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
- invalid_param = true;
- break;
- }
- // store the external file name in params
- params.prompt_file = argv[i];
- std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
- if (!params.prompt.empty() && params.prompt.back() == '\n') {
- params.prompt.pop_back();
- }
- }
- if (arg == "-n" || arg == "--n-predict") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_predict = std::stoi(argv[i]);
- }
- if (arg == "--top-k") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.top_k = std::stoi(argv[i]);
- }
- if (arg == "-c" || arg == "--ctx-size") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_ctx = std::stoi(argv[i]);
- }
- if (arg == "--grp-attn-n" || arg == "-gan") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
-
- params.grp_attn_n = std::stoi(argv[i]);
- }
- if (arg == "--grp-attn-w" || arg == "-gaw") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
-
- params.grp_attn_w = std::stoi(argv[i]);
- }
- if (arg == "--rope-freq-base") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rope_freq_base = std::stof(argv[i]);
- }
- if (arg == "--rope-freq-scale") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rope_freq_scale = std::stof(argv[i]);
- }
- if (arg == "--rope-scaling") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string value(argv[i]);
- /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
- else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
- else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
- else { invalid_param = true; break; }
- }
- if (arg == "--rope-scale") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.rope_freq_scale = 1.0f/std::stof(argv[i]);
- }
- if (arg == "--yarn-orig-ctx") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_orig_ctx = std::stoi(argv[i]);
- }
- if (arg == "--yarn-ext-factor") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_ext_factor = std::stof(argv[i]);
- }
- if (arg == "--yarn-attn-factor") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_attn_factor = std::stof(argv[i]);
- }
- if (arg == "--yarn-beta-fast") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_beta_fast = std::stof(argv[i]);
- }
- if (arg == "--yarn-beta-slow") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.yarn_beta_slow = std::stof(argv[i]);
- }
- if (arg == "--pooling") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string value(argv[i]);
- /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
- else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
- else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
- else { invalid_param = true; break; }
- }
- if (arg == "--defrag-thold" || arg == "-dt") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.defrag_thold = std::stof(argv[i]);
- }
- if (arg == "--samplers") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- const auto sampler_names = string_split(argv[i], ';');
- sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
- }
- if (arg == "--sampling-seq") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
- }
- if (arg == "--top-p") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.top_p = std::stof(argv[i]);
- }
- if (arg == "--min-p") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.min_p = std::stof(argv[i]);
- }
- if (arg == "--temp") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.temp = std::stof(argv[i]);
- sparams.temp = std::max(sparams.temp, 0.0f);
- }
- if (arg == "--tfs") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.tfs_z = std::stof(argv[i]);
- }
- if (arg == "--typical") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.typical_p = std::stof(argv[i]);
- }
- if (arg == "--repeat-last-n") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.penalty_last_n = std::stoi(argv[i]);
- sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
- }
- if (arg == "--repeat-penalty") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.penalty_repeat = std::stof(argv[i]);
- }
- if (arg == "--frequency-penalty") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.penalty_freq = std::stof(argv[i]);
- }
- if (arg == "--presence-penalty") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.penalty_present = std::stof(argv[i]);
- }
- if (arg == "--dynatemp-range") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.dynatemp_range = std::stof(argv[i]);
- }
- if (arg == "--dynatemp-exp") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.dynatemp_exponent = std::stof(argv[i]);
- }
- if (arg == "--mirostat") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.mirostat = std::stoi(argv[i]);
- }
- if (arg == "--mirostat-lr") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.mirostat_eta = std::stof(argv[i]);
- }
- if (arg == "--mirostat-ent") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.mirostat_tau = std::stof(argv[i]);
- }
- if (arg == "--cfg-negative-prompt") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.cfg_negative_prompt = argv[i];
- }
- if (arg == "--cfg-negative-prompt-file") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::ifstream file(argv[i]);
- if (!file) {
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
- invalid_param = true;
- break;
- }
- std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
- if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
- sparams.cfg_negative_prompt.pop_back();
- }
- }
- if (arg == "--cfg-scale") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.cfg_scale = std::stof(argv[i]);
- }
- if (arg == "-b" || arg == "--batch-size") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_batch = std::stoi(argv[i]);
- }
- if (arg == "-ub" || arg == "--ubatch-size") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_ubatch = std::stoi(argv[i]);
- }
- if (arg == "--keep") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_keep = std::stoi(argv[i]);
- }
- if (arg == "--draft") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_draft = std::stoi(argv[i]);
- }
- if (arg == "--chunks") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_chunks = std::stoi(argv[i]);
- }
- if (arg == "-np" || arg == "--parallel") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_parallel = std::stoi(argv[i]);
- }
- if (arg == "-ns" || arg == "--sequences") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_sequences = std::stoi(argv[i]);
- }
- if (arg == "--p-split" || arg == "-ps") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.p_split = std::stof(argv[i]);
- }
- if (arg == "-m" || arg == "--model") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model = argv[i];
- }
- if (arg == "-mu" || arg == "--model-url") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model_url = argv[i];
- }
- if (arg == "-md" || arg == "--model-draft") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model_draft = argv[i];
- }
- if (arg == "-a" || arg == "--alias") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model_alias = argv[i];
- }
- if (arg == "--lora") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.lora_adapter.emplace_back(argv[i], 1.0f);
- params.use_mmap = false;
- }
- if (arg == "--lora-scaled") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- const char * lora_adapter = argv[i];
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
- params.use_mmap = false;
- }
- if (arg == "--lora-base") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.lora_base = argv[i];
- }
- if (arg == "--control-vector") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.control_vectors.push_back({ 1.0f, argv[i], });
- }
- if (arg == "--control-vector-scaled") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- const char * fname = argv[i];
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.control_vectors.push_back({ std::stof(argv[i]), fname, });
- }
- if (arg == "--control-vector-layer-range") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.control_vector_layer_start = std::stoi(argv[i]);
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.control_vector_layer_end = std::stoi(argv[i]);
- }
- if (arg == "--mmproj") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.mmproj = argv[i];
- }
- if (arg == "--image") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.image = argv[i];
- }
- if (arg == "-i" || arg == "--interactive") {
- arg_found = true;
- params.interactive = true;
- }
- if (arg == "--embedding") {
- arg_found = true;
- params.embedding = true;
- }
- if (arg == "--interactive-first") {
- arg_found = true;
- params.interactive_first = true;
- }
- if (arg == "-ins" || arg == "--instruct") {
- arg_found = true;
- params.instruct = true;
- }
- if (arg == "-cml" || arg == "--chatml") {
- arg_found = true;
- params.chatml = true;
- }
- if (arg == "--infill") {
- arg_found = true;
- params.infill = true;
- }
- if (arg == "-dkvc" || arg == "--dump-kv-cache") {
- arg_found = true;
- params.dump_kv_cache = true;
- }
- if (arg == "-nkvo" || arg == "--no-kv-offload") {
- arg_found = true;
- params.no_kv_offload = true;
- }
- if (arg == "-ctk" || arg == "--cache-type-k") {
- arg_found = true;
- params.cache_type_k = argv[++i];
- }
- if (arg == "-ctv" || arg == "--cache-type-v") {
- arg_found = true;
- params.cache_type_v = argv[++i];
- }
- if (arg == "--multiline-input") {
- arg_found = true;
- params.multiline_input = true;
- }
- if (arg == "--simple-io") {
- arg_found = true;
- params.simple_io = true;
- }
- if (arg == "-cb" || arg == "--cont-batching") {
- arg_found = true;
- params.cont_batching = true;
- }
- if (arg == "--color") {
- arg_found = true;
- params.use_color = true;
- }
- if (arg == "--mlock") {
- arg_found = true;
- params.use_mlock = true;
- }
- if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_gpu_layers = std::stoi(argv[i]);
- if (!llama_supports_gpu_offload()) {
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
- fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
- }
- }
- if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_gpu_layers_draft = std::stoi(argv[i]);
- if (!llama_supports_gpu_offload()) {
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
- fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
- }
- }
- if (arg == "--main-gpu" || arg == "-mg") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS_SYCL
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
- }
- if (arg == "--split-mode" || arg == "-sm") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string arg_next = argv[i];
- if (arg_next == "none") {
- params.split_mode = LLAMA_SPLIT_MODE_NONE;
- } else if (arg_next == "layer") {
- params.split_mode = LLAMA_SPLIT_MODE_LAYER;
- } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
- fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
- exit(1);
-#endif // GGML_USE_SYCL
- params.split_mode = LLAMA_SPLIT_MODE_ROW;
- } else {
- invalid_param = true;
- break;
- }
-#ifndef GGML_USE_CUBLAS_SYCL
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
-
- }
- if (arg == "--tensor-split" || arg == "-ts") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string arg_next = argv[i];
-
- // split string by , and /
- const std::regex regex{R"([,/]+)"};
- std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
- std::vector<std::string> split_arg{it, {}};
- if (split_arg.size() >= llama_max_devices()) {
- invalid_param = true;
- break;
- }
- for (size_t i = 0; i < llama_max_devices(); ++i) {
- if (i < split_arg.size()) {
- params.tensor_split[i] = std::stof(split_arg[i]);
- } else {
- params.tensor_split[i] = 0.0f;
- }
- }
-#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
- }
- if (arg == "--no-mmap") {
- arg_found = true;
- params.use_mmap = false;
- }
- if (arg == "--numa") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::string value(argv[i]);
- /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
- else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
- else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
- else { invalid_param = true; break; }
- }
- if (arg == "--verbose-prompt") {
- arg_found = true;
- params.verbose_prompt = true;
- }
- if (arg == "--no-display-prompt") {
- arg_found = true;
- params.display_prompt = false;
- }
- if (arg == "-r" || arg == "--reverse-prompt") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.antiprompt.emplace_back(argv[i]);
- }
- if (arg == "-ld" || arg == "--logdir") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.logdir = argv[i];
-
- if (params.logdir.back() != DIRECTORY_SEPARATOR) {
- params.logdir += DIRECTORY_SEPARATOR;
- }
- }
- if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.logits_file = argv[i];
- }
- if (arg == "--perplexity" || arg == "--all-logits") {
- arg_found = true;
- params.logits_all = true;
- }
- if (arg == "--ppl-stride") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.ppl_stride = std::stoi(argv[i]);
- }
- if (arg == "-ptc" || arg == "--print-token-count") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.n_print = std::stoi(argv[i]);
- }
- if (arg == "--ppl-output-type") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.ppl_output_type = std::stoi(argv[i]);
- }
- if (arg == "--hellaswag") {
- arg_found = true;
- params.hellaswag = true;
- }
- if (arg == "--hellaswag-tasks") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.hellaswag_tasks = std::stoi(argv[i]);
- }
- if (arg == "--winogrande") {
- arg_found = true;
- params.winogrande = true;
- }
- if (arg == "--winogrande-tasks") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.winogrande_tasks = std::stoi(argv[i]);
- }
- if (arg == "--multiple-choice") {
- arg_found = true;
- params.multiple_choice = true;
- }
- if (arg == "--multiple-choice-tasks") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.multiple_choice_tasks = std::stoi(argv[i]);
- }
- if (arg == "--kl-divergence") {
- arg_found = true;
- params.kl_divergence = true;
- }
- if (arg == "--ignore-eos") {
- arg_found = true;
- params.ignore_eos = true;
- }
- if (arg == "--no-penalize-nl") {
- arg_found = true;
- sparams.penalize_nl = false;
- }
- if (arg == "-l" || arg == "--logit-bias") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::stringstream ss(argv[i]);
- llama_token key;
- char sign;
- std::string value_str;
- try {
- if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
- sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
- } else {
- throw std::exception();
- }
- } catch (const std::exception&) {
- invalid_param = true;
- break;
- }
- }
- if (arg == "-h" || arg == "--help") {
- arg_found = true;
- return false;
- }
- if (arg == "--version") {
- arg_found = true;
- fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
- fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
- exit(0);
- }
- if (arg == "--random-prompt") {
- arg_found = true;
- params.random_prompt = true;
- }
- if (arg == "--in-prefix-bos") {
- arg_found = true;
- params.input_prefix_bos = true;
- }
- if (arg == "--in-prefix") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.input_prefix = argv[i];
- }
- if (arg == "--in-suffix") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.input_suffix = argv[i];
- }
- if (arg == "--grammar") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- sparams.grammar = argv[i];
- }
- if (arg == "--grammar-file") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- std::ifstream file(argv[i]);
- if (!file) {
- fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
- invalid_param = true;
- break;
- }
- std::copy(
- std::istreambuf_iterator<char>(file),
- std::istreambuf_iterator<char>(),
- std::back_inserter(sparams.grammar)
- );
- }
- if (arg == "--override-kv") {
- arg_found = true;
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- char * sep = strchr(argv[i], '=');
- if (sep == nullptr || sep - argv[i] >= 128) {
- fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
- invalid_param = true;
- break;
- }
- struct llama_model_kv_override kvo;
- std::strncpy(kvo.key, argv[i], sep - argv[i]);
- kvo.key[sep - argv[i]] = 0;
- sep++;
- if (strncmp(sep, "int:", 4) == 0) {
- sep += 4;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
- kvo.int_value = std::atol(sep);
- } else if (strncmp(sep, "float:", 6) == 0) {
- sep += 6;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
- kvo.float_value = std::atof(sep);
- } else if (strncmp(sep, "bool:", 5) == 0) {
- sep += 5;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
- if (std::strcmp(sep, "true") == 0) {
- kvo.bool_value = true;
- } else if (std::strcmp(sep, "false") == 0) {
- kvo.bool_value = false;
- } else {
- fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
- invalid_param = true;
- break;
- }
- } else {
- fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
- invalid_param = true;
- break;
- }
- params.kv_overrides.push_back(kvo);
-#ifndef LOG_DISABLE_LOGS
- // Parse args for logging parameters
- }
- if ( log_param_single_parse( argv[i] ) ) {
- arg_found = true;
- // Do nothing, log_param_single_parse automatically does it's thing
- // and returns if a match was found and parsed.
- }
- if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) {
- arg_found = true;
- // We have a matching known parameter requiring an argument,
- // now we need to check if there is anything after this argv
- // and flag invalid_param or parse it.
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) {
- invalid_param = true;
- break;
- }
- // End of Parse args for logging parameters
-#endif // LOG_DISABLE_LOGS
- }
-
- if (!arg_found) {
+ if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}
+
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
}
+
if (params.prompt_cache_all &&
(params.interactive || params.interactive_first ||
params.instruct)) {
@@ -1201,6 +1251,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
+ // short-hand to avoid specifying --hf-file -> default it to --model
+ if (!params.hf_repo.empty() && params.hf_file.empty()) {
+ params.hf_file = params.model;
+ }
+
if (params.escape) {
process_escapes(params.prompt);
process_escapes(params.input_prefix);
@@ -1390,12 +1445,20 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
- printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
- printf(" model download url (default: %s)\n", params.model_url.c_str());
printf(" -md FNAME, --model-draft FNAME\n");
- printf(" draft model for speculative decoding\n");
+ printf(" draft model for speculative decoding (default: unused)\n");
+ printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
+ printf(" model download url (default: unused)\n");
+ printf(" -hfr REPO, --hf-repo REPO\n");
+ printf(" Hugging Face model repository (default: unused)\n");
+ printf(" -hff FILE, --hf-file FILE\n");
+ printf(" Hugging Face model file (default: unused)\n");
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
+ printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
+ printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
+ printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
+ printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
@@ -1576,6 +1639,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "q4_1") {
return GGML_TYPE_Q4_1;
}
+ if (s == "iq4_nl") {
+ return GGML_TYPE_IQ4_NL;
+ }
if (s == "q5_0") {
return GGML_TYPE_Q5_0;
}
@@ -1639,25 +1705,13 @@ void llama_batch_add(
#ifdef LLAMA_USE_CURL
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
- struct llama_model_params params) {
- // Basic validation of the model_url
- if (!model_url || strlen(model_url) == 0) {
- fprintf(stderr, "%s: invalid model_url\n", __func__);
- return NULL;
- }
-
- // Initialize libcurl globally
- auto curl = curl_easy_init();
-
- if (!curl) {
- fprintf(stderr, "%s: error initializing libcurl\n", __func__);
- return NULL;
- }
+static bool llama_download_file(CURL * curl, const char * url, const char * path) {
+ bool force_download = false;
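+ // set to true below when the HEAD request fails, so the cached file cannot be validated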
// Set the URL, allow to follow http redirection
- curl_easy_setopt(curl, CURLOPT_URL, model_url);
+ curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+
#if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
// operating system. Currently implemented under MS-Windows.
@@ -1666,16 +1720,16 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
// Check if the file already exists locally
struct stat model_file_info;
- auto file_exists = (stat(path_model, &model_file_info) == 0);
+ auto file_exists = (stat(path, &model_file_info) == 0);
// If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
- char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
- snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model);
+ char etag_path[PATH_MAX] = {0};
+ snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
- char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
- snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model);
+ char last_modified_path[PATH_MAX] = {0};
+ snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
if (file_exists) {
auto * f_etag = fopen(etag_path, "r");
@@ -1683,7 +1737,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (!fgets(etag, sizeof(etag), f_etag)) {
fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
} else {
- fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag);
+ fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
}
fclose(f_etag);
}
@@ -1693,7 +1747,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
} else {
- fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path,
+ fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
last_modified);
}
fclose(f_last_modified);
@@ -1711,6 +1765,11 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
+ // Convert header field name to lowercase
+ for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
+ buffer[i] = tolower(buffer[i]);
+ }
+
const char * etag_prefix = "etag: ";
if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
@@ -1733,7 +1792,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (res != CURLE_OK) {
curl_easy_cleanup(curl);
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
- return NULL;
+ return false;
}
long http_code = 0;
@@ -1741,30 +1800,34 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (http_code != 200) {
// HEAD not supported, we don't know if the file has changed
// force trigger downloading
- file_exists = false;
+ force_download = true;
fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
}
}
// If the ETag or the Last-Modified headers are different: trigger a new download
- if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) {
- char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
- snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model);
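+ // download when no local copy exists, when validation was impossible, or when the server reports a different ETag/Last-Modified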
+ bool should_download = !file_exists
+ || force_download
+ || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
+ || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
+ if (should_download) {
+ char path_temporary[PATH_MAX] = {0};
+ snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
if (file_exists) {
- fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model);
- if (remove(path_model) != 0) {
+ fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
+ if (remove(path) != 0) {
curl_easy_cleanup(curl);
- fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model);
- return NULL;
+ fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
+ return false;
}
}
// Set the output file
- auto * outfile = fopen(path_model_temporary, "wb");
+ auto * outfile = fopen(path_temporary, "wb");
if (!outfile) {
curl_easy_cleanup(curl);
- fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model);
- return NULL;
+ fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
+ return false;
}
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
@@ -1778,15 +1841,30 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
// display download progress
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+ // helper function to hide password in URL
+ auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
+ std::size_t protocol_pos = url.find("://");
+ if (protocol_pos == std::string::npos) {
+ return url; // Malformed URL
+ }
+
+ std::size_t at_pos = url.find('@', protocol_pos + 3);
+ if (at_pos == std::string::npos) {
+ return url; // No password in URL
+ }
+
+ return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
+ };
+
// start the download
- fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
- model_url, path_model, headers.etag, headers.last_modified);
+ fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+ llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
auto res = curl_easy_perform(curl);
if (res != CURLE_OK) {
fclose(outfile);
curl_easy_cleanup(curl);
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
- return NULL;
+ return false;
}
long http_code = 0;
@@ -1795,7 +1873,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
fclose(outfile);
curl_easy_cleanup(curl);
fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
- return NULL;
+ return false;
}
// Clean up
@@ -1807,7 +1885,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (etag_file) {
fputs(headers.etag, etag_file);
fclose(etag_file);
- fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag);
+ fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
}
}
@@ -1817,42 +1895,177 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (last_modified_file) {
fputs(headers.last_modified, last_modified_file);
fclose(last_modified_file);
- fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path,
+ fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
headers.last_modified);
}
}
- if (rename(path_model_temporary, path_model) != 0) {
+ if (rename(path_temporary, path) != 0) {
+ curl_easy_cleanup(curl);
+ fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+struct llama_model * llama_load_model_from_url(
+ const char * model_url,
+ const char * path_model,
+ const struct llama_model_params & params) {
+ // Basic validation of the model_url
+ if (!model_url || strlen(model_url) == 0) {
+ fprintf(stderr, "%s: invalid model_url\n", __func__);
+ return NULL;
+ }
+
+ // Initialize libcurl
+ auto * curl = curl_easy_init();
+
+ if (!curl) {
+ fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+ return NULL;
+ }
+
+ if (!llama_download_file(curl, model_url, path_model)) {
+ return NULL;
+ }
+
+ // check for additional GGUFs split to download
+ int n_split = 0;
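+ // n_split > 1 means the model is sharded and the remaining GGUF splits must be downloaded as well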
+ {
+ struct gguf_init_params gguf_params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ NULL,
+ };
+ auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
+ if (!ctx_gguf) {
+ fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
curl_easy_cleanup(curl);
- fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model);
return NULL;
}
+
+ auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+ if (key_n_split >= 0) {
+ n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+ }
+
+ gguf_free(ctx_gguf);
}
curl_easy_cleanup(curl);
+ if (n_split > 1) {
+ char split_prefix[PATH_MAX] = {0};
+ char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+
+ // Verify the first split file format
+ // and extract split URL and PATH prefixes
+ {
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
+ fprintf(stderr, "\n%s: unexpected model file name: %s"
+ " n_split=%d\n", __func__, path_model, n_split);
+ return NULL;
+ }
+
+ if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
+ fprintf(stderr, "\n%s: unexpected model url: %s"
+ " n_split=%d\n", __func__, model_url, n_split);
+ return NULL;
+ }
+ }
+
+ // Prepare download in parallel
+ std::vector<std::future<bool>> futures_download;
+ for (int idx = 1; idx < n_split; idx++) {
+ futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
+ char split_path[PATH_MAX] = {0};
+ llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
+
+ char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+ llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
+
+ auto * curl = curl_easy_init();
+ bool res = llama_download_file(curl, split_url, split_path);
+ curl_easy_cleanup(curl);
+
+ return res;
+ }, idx));
+ }
+
+ // Wait for all downloads to complete
+ for (auto & f : futures_download) {
+ if (!f.get()) {
+ return NULL;
+ }
+ }
+ }
+
return llama_load_model_from_file(path_model, params);
}
+struct llama_model * llama_load_model_from_hf(
+ const char * repo,
+ const char * model,
+ const char * path_model,
+ const struct llama_model_params & params) {
+ // construct hugging face model url:
+ //
+ // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
+ // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
+ //
+ // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
+ // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
+ //
+
+ std::string model_url = "https://huggingface.co/";
+ model_url += repo;
+ model_url += "/resolve/main/";
+ model_url += model;
+
+ return llama_load_model_from_url(model_url.c_str(), path_model, params);
+}
+
#else
-struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/,
- struct llama_model_params /*params*/) {
+struct llama_model * llama_load_model_from_url(
+ const char * /*model_url*/,
+ const char * /*path_model*/,
+ const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}
+struct llama_model * llama_load_model_from_hf(
+ const char * /*repo*/,
+ const char * /*model*/,
+ const char * /*path_model*/,
+ const struct llama_model_params & /*params*/) {
+ fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+ return nullptr;
+}
+
#endif // LLAMA_USE_CURL
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
auto mparams = llama_model_params_from_gpt_params(params);
llama_model * model = nullptr;
- if (!params.model_url.empty()) {
+
+ if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+ model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+ } else if (!params.model_url.empty()) {
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
+
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr);
@@ -1892,7 +2105,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
}
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
- const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
+ const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
int err = llama_model_apply_lora_from_file(model,
lora_adapter.c_str(),
diff --git a/common/common.h b/common/common.h
index 8dd8a3edc..99ee90bc3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -88,18 +88,22 @@ struct gpt_params {
// // sampling parameters
struct llama_sampling_params sparams;
- std::string model = "models/7B/ggml-model-f16.gguf"; // model path
- std::string model_url = ""; // model url to download
- std::string model_draft = ""; // draft model for speculative decoding
- std::string model_alias = "unknown"; // model alias
- std::string prompt = "";
- std::string prompt_file = ""; // store the external prompt file name
- std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
- std::string input_prefix = ""; // string to prefix user inputs with
- std::string input_suffix = ""; // string to suffix user inputs with
+ std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+ std::string model_draft = ""; // draft model for speculative decoding
+ std::string model_alias = "unknown"; // model alias
+ std::string model_url = ""; // model url to download
+ std::string hf_repo = ""; // HF repo
+ std::string hf_file = ""; // HF file
+ std::string prompt = "";
+ std::string prompt_file = ""; // store the external prompt file name
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+ std::string input_prefix = ""; // string to prefix user inputs with
+ std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
- std::string logdir = ""; // directory in which to save YAML log files
- std::string logits_file = ""; // file for saving *all* logits
+ std::string logdir = ""; // directory in which to save YAML log files
+ std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
+ std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
+ std::string logits_file = ""; // file for saving *all* logits
std::vector<llama_model_kv_override> kv_overrides;
@@ -139,7 +143,7 @@ struct gpt_params {
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
- bool cont_batching = false; // insert new sequences for decoding on-the-fly
+ bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
@@ -167,6 +171,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+
std::string get_system_info(const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
@@ -192,8 +198,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
- struct llama_model_params params);
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
// Batch utils
@@ -302,3 +308,10 @@ struct llama_control_vector_load_info {
// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+
+//
+// Split utils
+//
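+// GGUF metadata keys written by gguf-split to link the shards of a split model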
+static const char * const LLM_KV_SPLIT_NO = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
new file mode 100644
index 000000000..0e4680346
--- /dev/null
+++ b/common/json-schema-to-grammar.cpp
@@ -0,0 +1,721 @@
+#include "json-schema-to-grammar.h"
+#include <algorithm>
+#include <fstream>
+#include <map>