diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 992c34a03..0e7643bba 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -15,14 +15,133 @@ on:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
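+# Cancel any in-progress run of this workflow for the same ref when a new run starts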
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_N_THREADS: 1
jobs:
+ macOS-latest-cmake-arm64:
+ runs-on: macos-14
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+
+ - name: Dependencies
+ id: depends
+ continue-on-error: true
+ run: |
+ brew update
+
+ - name: Build
+ id: cmake_build
+ run: |
+ sysctl -a
+ mkdir build
+ cd build
+ cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+ cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L main --verbose --timeout 900
+
+ - name: Determine tag name
+ id: tag
+ shell: bash
+ run: |
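+ # tag is b<commit count> on master, otherwise <branch>-b<commit count>-<short sha>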
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+ else
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+ fi
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ cp LICENSE ./build/bin/
+ zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v3
+ with:
+ path: |
+ llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+
+ macOS-latest-cmake-x64:
+ runs-on: macos-latest
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+
+ - name: Dependencies
+ id: depends
+ continue-on-error: true
+ run: |
+ brew update
+
+ - name: Build
+ id: cmake_build
+ run: |
+ sysctl -a
+ mkdir build
+ cd build
+ cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+ cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L main --verbose --timeout 900
+
+ - name: Determine tag name
+ id: tag
+ shell: bash
+ run: |
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+ else
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+ fi
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ cp LICENSE ./build/bin/
+ zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v3
+ with:
+ path: |
+ llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+
ubuntu-focal-make:
runs-on: ubuntu-20.04
+ env:
+ LLAMA_NODE_AVAILABLE: true
+ LLAMA_PYTHON_AVAILABLE: true
steps:
- name: Clone
@@ -35,6 +154,14 @@ jobs:
sudo apt-get update
sudo apt-get install build-essential gcc-8
+ - uses: actions/setup-node@v4
+ with:
+ node-version: "20"
+
+ - uses: actions/setup-python@v4
+ with:
+ python-version: "3.11"
+
- name: Build
id: make_build
env:
@@ -98,6 +225,17 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
+ - name: Test llama2c conversion
+ id: llama2c_test
+ run: |
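+ # smoke test: convert a tiny llama2.c checkpoint to GGUF and run a short generation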
+ cd build
+ echo "Fetch tokenizer"
+ wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+ echo "Fetch llama2c model"
+ wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+ ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+ ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
# ubuntu-latest-cmake-sanitizer:
# runs-on: ubuntu-latest
#
@@ -662,6 +800,7 @@ jobs:
windows-latest-cmake-sycl:
runs-on: windows-latest
+
defaults:
run:
shell: bash
@@ -670,7 +809,6 @@ jobs:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
-
steps:
- name: Clone
id: checkout
@@ -685,6 +823,32 @@ jobs:
id: cmake_build
run: examples/sycl/win-build-sycl.bat
+ - name: Determine tag name
+ id: tag
+ shell: bash
+ run: |
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+ else
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+ fi
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip .\build\bin\*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v3
+ with:
+ path: |
+ llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+
ios-xcode-build:
runs-on: macos-latest
@@ -748,6 +912,8 @@ jobs:
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-cublas
+ - macOS-latest-cmake-arm64
+ - macOS-latest-cmake-x64
steps:
- name: Clone
diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml
index a151c6780..7f21daec0 100644
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -19,5 +19,5 @@ jobs:
close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
- operations-per-run: 1000
+ operations-per-run: 10000
repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml
index 392db8a08..4112518bb 100644
--- a/.github/workflows/code-coverage.yml
+++ b/.github/workflows/code-coverage.yml
@@ -5,6 +5,10 @@ env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
run:
runs-on: ubuntu-20.04
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 94f9161fc..9591bfc2a 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -15,6 +15,10 @@ on:
branches:
- master
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
push_to_registry:
name: Push Docker image to Docker Hub
diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml
index 0e0993cd4..7b2a00c90 100644
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -14,6 +14,10 @@ on:
branches:
- master
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
editorconfig:
runs-on: ubuntu-latest
diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml
index 8d0a3fd7f..109a793ea 100644
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -17,6 +17,10 @@ on:
types: [opened, synchronize, reopened]
paths: ['**/*.nix', 'flake.lock']
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
nix-build-aarch64:
runs-on: ubuntu-latest
diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
index 01c5a9d5a..8b5b99c8f 100644
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -8,6 +8,10 @@ on:
pull_request:
types: [opened, synchronize, reopened]
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
nix-eval:
strategy:
diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml
index b82205992..4092b12fa 100644
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -16,6 +16,10 @@ on:
- 'requirements.txt'
- 'requirements/*.txt'
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
python-check-requirements:
runs-on: ubuntu-latest
diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml
index ea0a05ea1..4bdd79c4a 100644
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -2,6 +2,10 @@ name: flake8 Lint
on: [push, pull_request]
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
flake8-lint:
runs-on: ubuntu-latest
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 65ca7d9ca..f07d25536 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -18,6 +18,10 @@ on:
schedule:
- cron: '0 0 * * *'
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
server:
runs-on: ubuntu-latest
@@ -31,7 +35,6 @@ jobs:
include:
- build_type: Release
sanitizer: ""
- disabled_on_pr: true
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
container:
diff --git a/.github/workflows/zig-build.yml b/.github/workflows/zig-build.yml
index 68a698ab9..cb43954eb 100644
--- a/.github/workflows/zig-build.yml
+++ b/.github/workflows/zig-build.yml
@@ -6,6 +6,10 @@ on:
branches:
- master
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
build:
strategy:
diff --git a/.gitignore b/.gitignore
index 1ad8d929b..9fb5b80c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,10 @@
*.gcda
*.dot
*.bat
+*.tmp
*.metallib
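+# ETag / Last-Modified sidecar files written by the model download helper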
+*.etag
+*.lastModified
.DS_Store
.build/
.cache/
@@ -47,6 +50,7 @@ models-mnt
/embedding
/gguf
/gguf-llama-simple
+/gguf-split
/gritlm
/imatrix
/infill
@@ -55,6 +59,9 @@ models-mnt
/llava-cli
/lookahead
/lookup
+/lookup-create
+/lookup-merge
+/lookup-stats
/main
/metal
/passkey
@@ -70,6 +77,7 @@ models-mnt
/batched-bench
/export-lora
/finetune
+/retrieval
/speculative
/parallel
/train-text-from-scratch
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc4cff28f..3333ee1c9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
+option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
@@ -387,6 +388,9 @@ if (LLAMA_CUBLAS)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
+ if (LLAMA_CUDA_NO_PEER_COPY)
+ add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+ endif()
if (LLAMA_STATIC)
if (WIN32)
@@ -531,6 +535,10 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
+ if (LLAMA_CUDA_NO_PEER_COPY)
+ add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+ endif()
+
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
diff --git a/Makefile b/Makefile
index 838daf5c0..130fde838 100644
--- a/Makefile
+++ b/Makefile
@@ -1,15 +1,16 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
- speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+ simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = \
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
- tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
+ tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
+ tests/test-json-schema-to-grammar
# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -451,9 +452,9 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
else
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-#ifdef LLAMA_CUDA_CUBLAS
-# MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
-#endif # LLAMA_CUDA_CUBLAS
+ifdef LLAMA_CUDA_NO_PEER_COPY
+ MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
ifdef LLAMA_CUDA_CCBIN
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
@@ -534,6 +535,9 @@ endif # LLAMA_HIP_UMA
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_NO_PEER_COPY
+ HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
OBJS += ggml-cuda.o
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
@@ -666,9 +670,15 @@ console.o: common/console.cpp common/console.h
grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
$(CXX) $(CXXFLAGS) -c $< -o $@
+json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
train.o: common/train.cpp common/train.h
$(CXX) $(CXXFLAGS) -c $< -o $@
+ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
@@ -676,7 +686,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
find examples pocs -type f -name "*.o" -delete
#
@@ -745,7 +755,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -753,6 +763,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -790,6 +804,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -802,9 +820,15 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -861,6 +885,10 @@ tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/README-sycl.md b/README-sycl.md
index 9359a9490..cbf14f2da 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -29,6 +29,8 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
## News
- 2024.3
+ - A blog post has been published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
+ - A new baseline is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
- Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
- Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
- Support detecting all GPUs with level-zero and same top **Max compute units**.
@@ -81,7 +83,7 @@ For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, rec
|-|-|-|
|Ampere Series| Support| A100|
-### oneMKL
+### oneMKL for CUDA
The current oneMKL release does not contain the oneMKL cuBlas backend.
As a result for Nvidia GPU's oneMKL must be built from source.
@@ -114,7 +116,7 @@ You can choose between **F16** and **F32** build. F16 is faster for long-prompt
# Or, for F32:
docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
-# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
+# Note: you can also use the ".devops/server-intel.Dockerfile", which compiles the "server" example
```
### Run
@@ -254,16 +256,16 @@ Run without parameter:
Check the ID in startup log, like:
```
-found 4 SYCL devices:
- Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
- max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
- Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
- max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
- Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
- max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
- Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
- max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
-
+found 6 SYCL devices:
+| | | |Compute |Max compute|Max work|Max sub| |
+|ID| Device Type| Name|capability|units |group |group |Global mem size|
+|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
+| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
+| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
+| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
+| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
+| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
+| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
|Attribute|Note|
@@ -271,12 +273,35 @@ found 4 SYCL devices:
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
-4. Set device ID and execute llama.cpp
+4. Device selection and execution of llama.cpp
-Set device ID = 0 by **GGML_SYCL_DEVICE=0**
+There are two device selection modes:
+
+- Single device: use one device assigned by the user.
+- Multiple devices: automatically choose all devices that share the largest Max compute units value.
+
+|Device selection|Parameter|
+|-|-|
+|Single device|--split-mode none --main-gpu DEVICE_ID |
+|Multiple devices|--split-mode layer (default)|
+
+Examples:
+
+- Use device 0:
```sh
-GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+```
+or run by script:
+
+```sh
+./examples/sycl/run_llama2.sh 0
+```
+
+- Use multiple devices:
+
+```sh
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
or run by script:
@@ -289,12 +314,18 @@ Note:
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
-5. Check the device ID in output
+5. Verify the device ID in the output
+
+Check that the selected GPU is shown in the output, for example:
-Like:
```
-Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
+detect 1 SYCL GPUs: [0] with top Max compute units:512
```
+Or
+```
+use 1 SYCL GPUs: [0] with Max compute units:512
+```
+
## Windows
@@ -355,7 +386,7 @@ a. Download & install cmake for Windows: https://cmake.org/download/
b. Download & install mingw-w64 make for Windows provided by w64devkit
-- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
+- Download version 1.19.0 of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
- Extract `w64devkit` on your pc.
@@ -430,15 +461,16 @@ build\bin\main.exe
Check the ID in startup log, like:
```
-found 4 SYCL devices:
- Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
- max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
- Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
- max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
- Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
- max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
- Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
- max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+found 6 SYCL devices:
+| | | |Compute |Max compute|Max work|Max sub| |
+|ID| Device Type| Name|capability|units |group |group |Global mem size|
+|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
+| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
+| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
+| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
+| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
+| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
+| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
@@ -447,13 +479,31 @@ found 4 SYCL devices:
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
-4. Set device ID and execute llama.cpp
-Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
+4. Device selection and execution of llama.cpp
+
+There are two device selection modes:
+
+- Single device: use one device assigned by the user.
+- Multiple devices: automatically choose all devices that share the largest Max compute units value.
+
+|Device selection|Parameter|
+|-|-|
+|Single device|--split-mode none --main-gpu DEVICE_ID |
+|Multiple devices|--split-mode layer (default)|
+
+Examples:
+
+- Use device 0:
```
-set GGML_SYCL_DEVICE=0
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
+build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+```
+
+- Use multiple devices:
+
+```
+build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
or run by script:
@@ -466,11 +516,17 @@ Note:
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
-5. Check the device ID in output
-Like:
+5. Verify the device ID in the output
+
+Check that the selected GPU is shown in the output, for example:
+
```
-Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
+detect 1 SYCL GPUs: [0] with top Max compute units:512
+```
+Or
+```
+use 1 SYCL GPUs: [0] with Max compute units:512
```
## Environment Variable
@@ -489,7 +545,6 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|Name|Value|Function|
|-|-|-|
-|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer|
@@ -507,6 +562,9 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
## Q&A
+Note: please add the prefix **[SYCL]** to the issue title so that we can look into it as soon as possible.
+
+
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
Miss to enable oneAPI running environment.
@@ -538,4 +596,4 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
## Todo
-- Support multiple cards.
+- Support the row split mode for runs on multiple cards.
diff --git a/README.md b/README.md
index c2f3342f0..f9cf19616 100644
--- a/README.md
+++ b/README.md
@@ -17,10 +17,12 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics
+- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelizm support https://github.com/ggerganov/llama.cpp/pull/6017
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
+- Support for loading sharded models using the `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187
----
@@ -165,6 +167,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
diff --git a/build.zig b/build.zig
index 90609359b..a1de7083a 100644
--- a/build.zig
+++ b/build.zig
@@ -122,6 +122,7 @@ pub fn build(b: *std.build.Builder) !void {
const console = make.obj("console", "common/console.cpp");
const sampling = make.obj("sampling", "common/sampling.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
+ const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
const train = make.obj("train", "common/train.cpp");
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");
@@ -133,7 +134,7 @@ pub fn build(b: *std.build.Builder) !void {
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
- const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, clip, llava });
+ const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index af2629a46..1d840e5f7 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -47,6 +47,8 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
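+# object library that converts JSON schemas to GBNF grammars (used by the server example)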
+set(TARGET json-schema-to-grammar)
+add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h)
set(TARGET common)
@@ -60,8 +62,11 @@ add_library(${TARGET} STATIC
console.cpp
grammar-parser.h
grammar-parser.cpp
+ json.hpp
train.h
train.cpp
+ ngram-cache.h
+ ngram-cache.cpp
)
if (BUILD_SHARED_LIBS)
diff --git a/common/common.cpp b/common/common.cpp
index 5f10718ec..9dec08430 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -39,6 +39,9 @@
#endif
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
+#include <curl/easy.h>
+#include <thread>
+#include <future>
#endif
#if defined(_MSC_VER)
@@ -61,7 +64,7 @@
#else
#include <sys/syslimits.h>
#endif
-#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX
+#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
#endif // LLAMA_USE_CURL
@@ -101,7 +104,7 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
-void process_escapes(std::string& input) {
+void process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
@@ -154,8 +157,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return result;
}
-static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int & i, bool & invalid_param) {
- std::string arg = argv[i];
+bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
llama_sampling_params& sparams = params.sparams;
if (arg == "-s" || arg == "--seed") {
@@ -648,14 +650,6 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
params.model = argv[i];
return true;
}
- if (arg == "-mu" || arg == "--model-url") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
- params.model_url = argv[i];
- return true;
- }
if (arg == "-md" || arg == "--model-draft") {
if (++i >= argc) {
invalid_param = true;
@@ -672,6 +666,30 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
params.model_alias = argv[i];
return true;
}
+ if (arg == "-mu" || arg == "--model-url") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.model_url = argv[i];
+ return true;
+ }
+ if (arg == "-hfr" || arg == "--hf-repo") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.hf_repo = argv[i];
+ return true;
+ }
+ if (arg == "-hff" || arg == "--hf-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.hf_file = argv[i];
+ return true;
+ }
if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
@@ -948,6 +966,22 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
}
return true;
}
+ if (arg == "-lcs" || arg == "--lookup-cache-static") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.lookup_cache_static = argv[i];
+ return true;
+ }
+ if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.lookup_cache_dynamic = argv[i];
+ return true;
+ }
if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
if (++i >= argc) {
invalid_param = true;
@@ -1201,13 +1235,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
- if (!gpt_params_find_arg(argc, argv, params, i, invalid_param)) {
+ if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}
+
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
}
+
if (params.prompt_cache_all &&
(params.interactive || params.interactive_first ||
params.instruct)) {
@@ -1215,6 +1251,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
+ // short-hand to avoid specifying --hf-file -> default it to --model
+ if (!params.hf_repo.empty() && params.hf_file.empty()) {
+ params.hf_file = params.model;
+ }
+
if (params.escape) {
process_escapes(params.prompt);
process_escapes(params.input_prefix);
@@ -1404,12 +1445,20 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
- printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
- printf(" model download url (default: %s)\n", params.model_url.c_str());
printf(" -md FNAME, --model-draft FNAME\n");
- printf(" draft model for speculative decoding\n");
+ printf(" draft model for speculative decoding (default: unused)\n");
+ printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
+ printf(" model download url (default: unused)\n");
+ printf(" -hfr REPO, --hf-repo REPO\n");
+ printf(" Hugging Face model repository (default: unused)\n");
+ printf(" -hff FILE, --hf-file FILE\n");
+ printf(" Hugging Face model file (default: unused)\n");
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
+ printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
+ printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
+ printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
+ printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
@@ -1590,6 +1639,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "q4_1") {
return GGML_TYPE_Q4_1;
}
+ if (s == "iq4_nl") {
+ return GGML_TYPE_IQ4_NL;
+ }
if (s == "q5_0") {
return GGML_TYPE_Q5_0;
}
@@ -1653,25 +1705,13 @@ void llama_batch_add(
#ifdef LLAMA_USE_CURL
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
- struct llama_model_params params) {
- // Basic validation of the model_url
- if (!model_url || strlen(model_url) == 0) {
- fprintf(stderr, "%s: invalid model_url\n", __func__);
- return NULL;
- }
-
- // Initialize libcurl globally
- auto curl = curl_easy_init();
-
- if (!curl) {
- fprintf(stderr, "%s: error initializing libcurl\n", __func__);
- return NULL;
- }
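+// Download a file from url to path; the transfer is skipped when the cached .etag/.lastModified
+// sidecar files still match the server's headers. Returns false on error.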
+static bool llama_download_file(CURL * curl, const char * url, const char * path) {
+ bool force_download = false;
// Set the URL, allow to follow http redirection
- curl_easy_setopt(curl, CURLOPT_URL, model_url);
+ curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+
#if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
// operating system. Currently implemented under MS-Windows.
@@ -1680,16 +1720,16 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
// Check if the file already exists locally
struct stat model_file_info;
- auto file_exists = (stat(path_model, &model_file_info) == 0);
+ auto file_exists = (stat(path, &model_file_info) == 0);
// If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
- char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
- snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model);
+ char etag_path[PATH_MAX] = {0};
+ snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
- char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
- snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model);
+ char last_modified_path[PATH_MAX] = {0};
+ snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
if (file_exists) {
auto * f_etag = fopen(etag_path, "r");
@@ -1697,7 +1737,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (!fgets(etag, sizeof(etag), f_etag)) {
fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
} else {
- fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag);
+ fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
}
fclose(f_etag);
}
@@ -1707,7 +1747,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
} else {
- fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path,
+ fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
last_modified);
}
fclose(f_last_modified);
@@ -1725,6 +1765,11 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
+ // Convert header field name to lowercase
+ for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
+ buffer[i] = tolower(buffer[i]);
+ }
+
const char * etag_prefix = "etag: ";
if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
@@ -1747,7 +1792,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (res != CURLE_OK) {
curl_easy_cleanup(curl);
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
- return NULL;
+ return false;
}
long http_code = 0;
@@ -1755,30 +1800,34 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (http_code != 200) {
// HEAD not supported, we don't know if the file has changed
// force trigger downloading
- file_exists = false;
+ force_download = true;
fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
}
}
// If the ETag or the Last-Modified headers are different: trigger a new download
- if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) {
- char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
- snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model);
+ bool should_download = !file_exists
+ || force_download
+ || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
+ || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
+ if (should_download) {
+ char path_temporary[PATH_MAX] = {0};
+ snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
if (file_exists) {
- fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model);
- if (remove(path_model) != 0) {
+ fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
+ if (remove(path) != 0) {
curl_easy_cleanup(curl);
- fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model);
- return NULL;
+ fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
+ return false;
}
}
// Set the output file
- auto * outfile = fopen(path_model_temporary, "wb");
+ auto * outfile = fopen(path_temporary, "wb");
if (!outfile) {
curl_easy_cleanup(curl);
- fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model);
- return NULL;
+ fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
+ return false;
}
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
@@ -1792,15 +1841,30 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
// display download progress
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+ // helper function to hide password in URL
+ auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
+ std::size_t protocol_pos = url.find("://");
+ if (protocol_pos == std::string::npos) {
+ return url; // Malformed URL
+ }
+
+ std::size_t at_pos = url.find('@', protocol_pos + 3);
+ if (at_pos == std::string::npos) {
+ return url; // No password in URL
+ }
+
+ return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
+ };
+
// start the download
- fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
- model_url, path_model, headers.etag, headers.last_modified);
+ fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+ llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
auto res = curl_easy_perform(curl);
if (res != CURLE_OK) {
fclose(outfile);
curl_easy_cleanup(curl);
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
- return NULL;
+ return false;
}
long http_code = 0;
@@ -1809,7 +1873,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
fclose(outfile);
curl_easy_cleanup(curl);
fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
- return NULL;
+ return false;
}
// Clean up
@@ -1821,7 +1885,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (etag_file) {
fputs(headers.etag, etag_file);
fclose(etag_file);
- fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag);
+ fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
}
}
@@ -1831,42 +1895,177 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (last_modified_file) {
fputs(headers.last_modified, last_modified_file);
fclose(last_modified_file);
- fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path,
+ fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
headers.last_modified);
}
}
- if (rename(path_model_temporary, path_model) != 0) {
+ if (rename(path_temporary, path) != 0) {
+ curl_easy_cleanup(curl);
+ fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+struct llama_model * llama_load_model_from_url(
+ const char * model_url,
+ const char * path_model,
+ const struct llama_model_params & params) {
+ // Basic validation of the model_url
+ if (!model_url || strlen(model_url) == 0) {
+ fprintf(stderr, "%s: invalid model_url\n", __func__);
+ return NULL;
+ }
+
+ // Initialize libcurl
+ auto * curl = curl_easy_init();
+
+ if (!curl) {
+ fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+ return NULL;
+ }
+
+ if (!llama_download_file(curl, model_url, path_model)) {
+ return NULL;
+ }
+
+ // check for additional GGUFs split to download
+ int n_split = 0;
+ {
+ struct gguf_init_params gguf_params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ NULL,
+ };
+ auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
+ if (!ctx_gguf) {
+ fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
curl_easy_cleanup(curl);
- fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model);
return NULL;
}
+
+ auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+ if (key_n_split >= 0) {
+ n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+ }
+
+ gguf_free(ctx_gguf);
}
curl_easy_cleanup(curl);
+ if (n_split > 1) {
+ char split_prefix[PATH_MAX] = {0};
+ char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+
+ // Verify the first split file format
+ // and extract split URL and PATH prefixes
+ {
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
+ fprintf(stderr, "\n%s: unexpected model file name: %s"
+ " n_split=%d\n", __func__, path_model, n_split);
+ return NULL;
+ }
+
+ if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
+ fprintf(stderr, "\n%s: unexpected model url: %s"
+ " n_split=%d\n", __func__, model_url, n_split);
+ return NULL;
+ }
+ }
+
+ // Prepare download in parallel
+ std::vector<std::future<bool>> futures_download;
+ for (int idx = 1; idx < n_split; idx++) {
+ futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
+ char split_path[PATH_MAX] = {0};
+ llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
+
+ char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+ llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
+
+ auto * curl = curl_easy_init();
+ bool res = llama_download_file(curl, split_url, split_path);
+ curl_easy_cleanup(curl);
+
+ return res;
+ }, idx));
+ }
+
+ // Wait for all downloads to complete
+ for (auto & f : futures_download) {
+ if (!f.get()) {
+ return NULL;
+ }
+ }
+ }
+
return llama_load_model_from_file(path_model, params);
}
+struct llama_model * llama_load_model_from_hf(
+ const char * repo,
+ const char * model,
+ const char * path_model,
+ const struct llama_model_params & params) {
+ // construct hugging face model url:
+ //
+ // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
+ // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
+ //
+ // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
+ // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
+ //
+
+ std::string model_url = "https://huggingface.co/";
+ model_url += repo;
+ model_url += "/resolve/main/";
+ model_url += model;
+
+ return llama_load_model_from_url(model_url.c_str(), path_model, params);
+}
+
#else
-struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/,
- struct llama_model_params /*params*/) {
+struct llama_model * llama_load_model_from_url(
+ const char * /*model_url*/,
+ const char * /*path_model*/,
+ const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}
+struct llama_model * llama_load_model_from_hf(
+ const char * /*repo*/,
+ const char * /*model*/,
+ const char * /*path_model*/,
+ const struct llama_model_params & /*params*/) {
+ fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+ return nullptr;
+}
+
#endif // LLAMA_USE_CURL
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
auto mparams = llama_model_params_from_gpt_params(params);
llama_model * model = nullptr;
- if (!params.model_url.empty()) {
+
+ if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+ model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+ } else if (!params.model_url.empty()) {
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
+
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr);
@@ -1906,7 +2105,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
}
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
- const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
+ const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
int err = llama_model_apply_lora_from_file(model,
lora_adapter.c_str(),
diff --git a/common/common.h b/common/common.h
index 8dd8a3edc..99ee90bc3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -88,18 +88,22 @@ struct gpt_params {
// // sampling parameters
struct llama_sampling_params sparams;
- std::string model = "models/7B/ggml-model-f16.gguf"; // model path
- std::string model_url = ""; // model url to download
- std::string model_draft = ""; // draft model for speculative decoding
- std::string model_alias = "unknown"; // model alias
- std::string prompt = "";
- std::string prompt_file = ""; // store the external prompt file name
- std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
- std::string input_prefix = ""; // string to prefix user inputs with
- std::string input_suffix = ""; // string to suffix user inputs with
+ std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+ std::string model_draft = ""; // draft model for speculative decoding
+ std::string model_alias = "unknown"; // model alias
+ std::string model_url = ""; // model url to download
+ std::string hf_repo = ""; // HF repo
+ std::string hf_file = ""; // HF file
+ std::string prompt = "";
+ std::string prompt_file = ""; // store the external prompt file name
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+ std::string input_prefix = ""; // string to prefix user inputs with
+ std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
- std::string logdir = ""; // directory in which to save YAML log files
- std::string logits_file = ""; // file for saving *all* logits
+ std::string logdir = ""; // directory in which to save YAML log files
+ std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
+ std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
+ std::string logits_file = ""; // file for saving *all* logits
std::vector<llama_model_kv_override> kv_overrides;
@@ -139,7 +143,7 @@ struct gpt_params {
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
- bool cont_batching = false; // insert new sequences for decoding on-the-fly
+ bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
@@ -167,6 +171,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
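+// Parse a single command-line argument; returns false if the argument is not recognized
+// (invalid_param is set when a required value is missing).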
+bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+
std::string get_system_info(const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
@@ -192,8 +198,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
- struct llama_model_params params);
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
// Batch utils
@@ -302,3 +308,10 @@ struct llama_control_vector_load_info {
// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+
+//
+// Split utils
+//
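+// GGUF metadata keys used by gguf-split to describe sharded (split) model files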
+static const char * const LLM_KV_SPLIT_NO = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
new file mode 100644
index 000000000..0e4680346
--- /dev/null
+++ b/common/json-schema-to-grammar.cpp
@@ -0,0 +1,721 @@
+#include "json-schema-to-grammar.h"
+#include
+#include
+#include