diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 758796632..d50af0b70 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -32,7 +32,7 @@ on:
- cron: '04 2 * * *'
concurrency:
- group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
+ group: ${{ github.workflow }}-${{ github.ref || github.run_id }}-${{ github.event.inputs.sha }}
cancel-in-progress: true
jobs:
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ff7238aba..50f76db3c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -32,6 +32,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
- name: Dependencies
id: depends
@@ -52,7 +54,7 @@ jobs:
id: cmake_test
run: |
cd build
- ctest -L main --verbose --timeout 900
+ ctest -L 'main|curl' --verbose --timeout 900
- name: Determine tag name
id: tag
@@ -88,6 +90,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
- name: Dependencies
id: depends
@@ -101,7 +105,9 @@ jobs:
sysctl -a
mkdir build
cd build
- cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+ # Metal is disabled due to intermittent failures with GitHub runners not having a GPU:
+ # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+ cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -204,26 +210,28 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
- name: Dependencies
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential
+ sudo apt-get install build-essential libcurl4-openssl-dev
- name: Build
id: cmake_build
run: |
mkdir build
cd build
- cmake .. -DLLAMA_FATAL_WARNINGS=ON
+ cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
- ctest -L main --verbose --timeout 900
+ ctest -L 'main|curl' --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
@@ -236,6 +244,33 @@ jobs:
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+ - name: Determine tag name
+ id: tag
+ shell: bash
+ run: |
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+ else
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+ fi
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ cp LICENSE ./build/bin/
+ zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v4
+ with:
+ path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
+ name: llama-bin-ubuntu-x64.zip
+
# ubuntu-latest-cmake-sanitizer:
# runs-on: ubuntu-latest
#
@@ -938,6 +973,12 @@ jobs:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
+ with:
+ path: ./artifact
+
+ - name: Move artifacts
+ id: move_artifacts
+ run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
- name: Create release
id: create_release
@@ -956,7 +997,7 @@ jobs:
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
- for (let file of await fs.readdirSync('./artifact')) {
+ for (let file of await fs.readdirSync('./artifact/release')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
@@ -964,7 +1005,7 @@ jobs:
repo: context.repo.repo,
release_id: release_id,
name: file,
- data: await fs.readFileSync(`./artifact/${file}`)
+ data: await fs.readFileSync(`./artifact/release/${file}`)
});
}
}
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index eefd87878..9b03d19bc 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -91,6 +91,12 @@ jobs:
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
+ - name: Downcase github.repository_owner
+ run: |
+ echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
+ env:
+ GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
- name: Build and push Docker image (versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
@@ -98,7 +104,7 @@ jobs:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
- tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+ tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
@@ -107,5 +113,5 @@ jobs:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }}
- tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+ tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
file: ${{ matrix.config.dockerfile }}
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 521cc29ae..3e68a3c8c 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -23,7 +23,7 @@ on:
- cron: '2 4 * * *'
concurrency:
- group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+ group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
cancel-in-progress: true
jobs:
diff --git a/.gitignore b/.gitignore
index 9fb5b80c3..5c1490084 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,7 @@ lcov-report/
gcovr-report/
build*
+!build.zig
cmake-build-*
out/
tmp/
@@ -48,6 +49,7 @@ models-mnt
/convert-llama2c-to-ggml
/embd-input-test
/embedding
+/eval-callback
/gguf
/gguf-llama-simple
/gguf-split
@@ -99,6 +101,9 @@ qnt-*.txt
perf-*.txt
examples/jeopardy/results.txt
+examples/server/*.html.hpp
+examples/server/*.js.hpp
+examples/server/*.mjs.hpp
poetry.lock
poetry.toml
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 19fdfa46c..f134a153b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,18 @@ else()
set(LLAMA_METAL_DEFAULT OFF)
endif()
+# TODO: fix this for Android CI
+# https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191
+#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
+# set(LLAMA_LLAMAFILE_DEFAULT OFF)
+#else()
+# set(LLAMA_LLAMAFILE_DEFAULT ON)
+#endif()
+
+# TODO: temporary disable until MoE is fixed
+# https://github.com/ggerganov/llama.cpp/pull/6716
+set(LLAMA_LLAMAFILE_DEFAULT OFF)
+
# general
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
option(LLAMA_STATIC "llama: static link libraries" OFF)
@@ -88,6 +100,7 @@ endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
@@ -286,6 +299,7 @@ if (LLAMA_METAL)
${METALKIT_FRAMEWORK}
)
endif()
+
if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
@@ -368,6 +382,13 @@ if (LLAMA_BLAS)
endif()
endif()
+if (LLAMA_LLAMAFILE)
+ add_compile_definitions(GGML_USE_LLAMAFILE)
+
+ set(GGML_HEADERS_LLAMAFILE sgemm.h)
+ set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
+endif()
+
if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
endif()
@@ -1151,15 +1172,16 @@ add_library(ggml OBJECT
ggml-backend.h
ggml-quants.c
ggml-quants.h
- ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
- ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
- ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
- ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
- ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
- ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
- ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
- ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
- ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
+ ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
+ ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
+ ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
+ ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+ ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
+ ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
+ ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
+ ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
+ ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
+ ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
)
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
diff --git a/Makefile b/Makefile
index 11b31c5c8..b0b2ea997 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
@@ -384,6 +384,15 @@ ifdef LLAMA_OPENBLAS
MK_LDFLAGS += $(shell pkg-config --libs openblas)
endif # LLAMA_OPENBLAS
+# TODO: temporary disable until MoE is fixed
+# https://github.com/ggerganov/llama.cpp/pull/6716
+LLAMA_NO_LLAMAFILE := 1
+
+ifndef LLAMA_NO_LLAMAFILE
+ MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+ OBJS += sgemm.o
+endif
+
ifdef LLAMA_BLIS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
@@ -480,11 +489,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE)
-
endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST
-
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -603,6 +610,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
+ifndef LLAMA_NO_LLAMAFILE
+sgemm.o: sgemm.cpp sgemm.h ggml.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
GF_CC := $(CC)
include scripts/get-flags.mk
@@ -646,7 +658,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH
ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -687,8 +699,8 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -756,7 +768,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -788,10 +800,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% Makefile
+ @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+ echo "unsigned char $${NAME}[] = {" && \
+ cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+ echo "};" && \
+ echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+ ) > $@
+
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
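For reference, the `examples/server/%.hpp` rule above produces headers with the same shape as `xxd -i` output. A minimal sketch of the generated file for a hypothetical two-byte asset named `completion.js` containing the text `hi` (the byte values and length are illustrative, not taken from the real asset):

```cpp
// Illustrative only: what the od/sed rule above would emit for a hypothetical
// two-byte "completion.js" asset; '.' and '-' in the file name become '_'.
unsigned char completion_js[] = {
0x68, 0x69,
};
unsigned int completion_js_len = 2;
```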
@@ -800,6 +821,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/Package.swift b/Package.swift
index 8b7195869..183e64757 100644
--- a/Package.swift
+++ b/Package.swift
@@ -2,6 +2,45 @@
import PackageDescription
+var sources = [
+ "ggml.c",
+ "sgemm.cpp",
+ "llama.cpp",
+ "unicode.cpp",
+ "unicode-data.cpp",
+ "ggml-alloc.c",
+ "ggml-backend.c",
+ "ggml-quants.c",
+]
+
+var resources: [Resource] = []
+var linkerSettings: [LinkerSetting] = []
+var cSettings: [CSetting] = [
+ .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
+ .unsafeFlags(["-fno-objc-arc"]),
+ // NOTE: NEW_LAPACK will require iOS version 16.4+
+ // We should consider adding this in the future when we drop support for iOS 14
+ // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+ // .define("ACCELERATE_NEW_LAPACK"),
+ // .define("ACCELERATE_LAPACK_ILP64")
+]
+
+#if canImport(Darwin)
+sources.append("ggml-metal.m")
+resources.append(.process("ggml-metal.metal"))
+linkerSettings.append(.linkedFramework("Accelerate"))
+cSettings.append(
+ contentsOf: [
+ .define("GGML_USE_ACCELERATE"),
+ .define("GGML_USE_METAL")
+ ]
+)
+#endif
+
+#if os(Linux)
+ cSettings.append(.define("_GNU_SOURCE"))
+#endif
+
let package = Package(
name: "llama",
platforms: [
@@ -28,34 +67,11 @@ let package = Package(
"ggml-cuda.h",
"Makefile"
],
- sources: [
- "ggml.c",
- "llama.cpp",
- "unicode.cpp",
- "unicode-data.cpp",
- "ggml-alloc.c",
- "ggml-backend.c",
- "ggml-quants.c",
- "ggml-metal.m",
- ],
- resources: [
- .process("ggml-metal.metal")
- ],
+ sources: sources,
+ resources: resources,
publicHeadersPath: "spm-headers",
- cSettings: [
- .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
- .define("GGML_USE_ACCELERATE"),
- .unsafeFlags(["-fno-objc-arc"]),
- .define("GGML_USE_METAL"),
- // NOTE: NEW_LAPACK will required iOS version 16.4+
- // We should consider add this in the future when we drop support for iOS 14
- // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
- // .define("ACCELERATE_NEW_LAPACK"),
- // .define("ACCELERATE_LAPACK_ILP64")
- ],
- linkerSettings: [
- .linkedFramework("Accelerate")
- ]
+ cSettings: cSettings,
+ linkerSettings: linkerSettings
)
],
cxxLanguageStandard: .cxx11
diff --git a/README-sycl.md b/README-sycl.md
index 4372a32e3..2aa465070 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -8,9 +8,9 @@
- [Linux](#linux)
- [Windows](#windows)
- [Environment Variable](#environment-variable)
-- [Known Issue](#known-issue)
-- [Q&A](#q&a)
-- [Todo](#todo)
+- [Known Issue](#known-issues)
+- [Q&A](#qa)
+- [TODO](#todo)
## Background
@@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
## OS
-|OS|Status|Verified|
-|-|-|-|
-|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
-|Windows|Support|Windows 11|
+| OS | Status | Verified |
+|---------|---------|------------------------------------|
+| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+| Windows | Support | Windows 11 |
## Hardware
@@ -66,13 +66,13 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
**Verified devices**
-|Intel GPU| Status | Verified Model|
-|-|-|-|
-|Intel Data Center Max Series| Support| Max 1550|
-|Intel Data Center Flex Series| Support| Flex 170|
-|Intel Arc Series| Support| Arc 770, 730M|
-|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
-|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
+| Intel GPU | Status | Verified Model |
+|-------------------------------|---------|---------------------------------------|
+| Intel Data Center Max Series | Support | Max 1550, 1100 |
+| Intel Data Center Flex Series | Support | Flex 170 |
+| Intel Arc Series | Support | Arc 770, 730M |
+| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
+| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
*Notes:*
@@ -84,24 +84,18 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
- **Execution Unit (EU)**
- If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
-### Nvidia GPU
-The BLAS acceleration on Nvidia GPU through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)
+### Other Vendor GPU
**Verified devices**
-|Nvidia GPU| Status | Verified Model|
-|-|-|-|
-|Ampere Series| Support| A100, A4000|
-|Ampere Series *(Mobile)*| Support| RTX 40 Series|
-
-*Notes:*
- - Support for Nvidia targets through oneAPI is currently limited to Linux platforms.
-
- - Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.
-
+| Nvidia GPU | Status | Verified Model |
+|--------------------------|---------|----------------|
+| Ampere Series | Support | A100, A4000 |
+| Ampere Series *(Mobile)* | Support | RTX 40 Series |
## Docker
The docker build option is currently limited to *intel GPU* targets.
+
### Build image
```sh
# Using FP16
@@ -167,30 +161,11 @@ Platform #0: Intel(R) OpenCL HD Graphics
- **Nvidia GPU**
-In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
-Installation can be verified by running the following:
-```sh
-nvidia-smi
-```
-Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
-```
-+---------------------------------------------------------------------------------------+
-| NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 |
-|-----------------------------------------+----------------------+----------------------+
-| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
-| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
-| | | MIG M. |
-|=========================================+======================+======================|
-| 0 NVIDIA A100-PCIE-40GB On | 00000000:8D:00.0 Off | 0 |
-| N/A 36C P0 57W / 250W | 4MiB / 40960MiB | 0% Default |
-| | | Disabled |
-+-----------------------------------------+----------------------+----------------------+
-```
-
+In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
2. **Install Intel® oneAPI Base toolkit**
-- **Base installation**
+- **For Intel GPU**
The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
@@ -202,10 +177,10 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
- **Adding support to Nvidia GPUs**
-**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
+**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). Users should also make sure the plugin version matches that of the installed base toolkit *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
-**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
+**oneMKL for cuBLAS**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
```sh
git clone https://github.com/oneapi-src/oneMKL
@@ -237,7 +212,7 @@ When targeting an intel GPU, the user should expect one or more level-zero devic
- **Nvidia GPU**
-Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
+Similarly, users targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as below:
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
@@ -254,12 +229,14 @@ source /opt/intel/oneapi/setvars.sh
# Build LLAMA with MKL BLAS acceleration for intel GPU
mkdir -p build && cd build
-# Option 1: Use FP16 for better performance in long-prompt inference
-cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-# Or without "--build", run "make" next
+# Option 1: Use FP16 for better performance in long-prompt inference
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 2: Use FP32 by default
-cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+#build all binaries
+cmake --build . --config Release -j -v
```
#### Nvidia GPU
@@ -274,10 +251,14 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
mkdir -p build && cd build
# Option 1: Use FP16 for better performance in long-prompt inference
-cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 2: Use FP32 by default
-cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+#build all binaries
+cmake --build . --config Release -j -v
+
```
### III. Run the inference
@@ -313,10 +294,10 @@ found 6 SYCL devices:
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero driver/runtime, recommended |
-|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
+| Attribute | Note |
+|------------------------|-------------------------------------------------------------|
+| compute capability 1.3 | Level-zero driver/runtime, recommended |
+| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
4. Launch inference
@@ -325,10 +306,10 @@ There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
-|Device selection|Parameter|
-|-|-|
-|Single device|--split-mode none --main-gpu DEVICE_ID |
-|Multiple devices|--split-mode layer (default)|
+| Device selection | Parameter |
+|------------------|----------------------------------------|
+| Single device | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default) |
Examples:
@@ -357,7 +338,6 @@ Otherwise, you can run the script:
*Notes:*
-- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
```sh
@@ -438,7 +418,7 @@ cd build
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
-make
+make -j
```
Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
@@ -486,10 +466,10 @@ found 6 SYCL devices:
```
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
+| Attribute | Note |
+|------------------------|-----------------------------------------------------------|
+| compute capability 1.3 | Level-zero running time, recommended |
+| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
4. Launch inference
@@ -499,10 +479,10 @@ There are two device selection modes:
- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
-|Device selection|Parameter|
-|-|-|
-|Single device|--split-mode none --main-gpu DEVICE_ID |
-|Multiple devices|--split-mode layer (default)|
+| Device selection | Parameter |
+|------------------|----------------------------------------|
+| Single device | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default) |
Examples:
@@ -525,7 +505,6 @@ Otherwise, run the following wrapper script:
Note:
-- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
```sh
@@ -540,29 +519,23 @@ use 1 SYCL GPUs: [0] with Max compute units:512
#### Build
-|Name|Value|Function|
-|-|-|-|
-|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
-|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
-|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
-|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
-|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
+| Name | Value | Function |
+|--------------------|-----------------------------------|---------------------------------------------|
+| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. |
+| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
+| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
+| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
+| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
#### Runtime
-|Name|Value|Function|
-|-|-|-|
-|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
-|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer|
+| Name | Value | Function |
+|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
+| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
+| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer |
## Known Issues
-- Hanging during startup
-
- llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.
-
- - **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.
-
- `Split-mode:[row]` is not supported.
## Q&A
@@ -574,7 +547,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
- General compiler error:
- - Remove build folder or try a clean-build.
+ - Remove the **build** folder or try a clean build.
- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.
@@ -591,6 +564,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
-## Todo
+## TODO
- Support row layer split for multiple card runs.
diff --git a/README.md b/README.md
index a4897fc36..1d4e9d417 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Recent API changes
+- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
@@ -94,7 +95,8 @@ Typically finetunes of the base models below are supported as well.
- [x] LLaMA 2 🦙🦙
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
-- [X] Falcon
+- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
+- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@@ -121,6 +123,9 @@ Typically finetunes of the base models below are supported as well.
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
+- [x] [OLMo](https://allenai.org/olmo)
+
+(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
**Multimodal models:**
@@ -185,6 +190,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [MindMac](https://mindmac.app) (proprietary)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
+- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
+- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
@@ -483,20 +490,20 @@ Building the program with BLAS support may lead to some performance improvements
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
- | Option | Legal values | Default | Description |
- |--------------------------------|------------------------|---------|-------------|
- | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
- | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
- | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
- | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
+ | Option | Legal values | Default | Description |
+ |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+ | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
+ | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+ | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
+ | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
+ | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+ | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
- #### hipBLAS
This provides BLAS acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed.
- You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
+ You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
- Using `make`:
```bash
@@ -513,7 +520,7 @@ Building the program with BLAS support may lead to some performance improvements
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
- make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030
+ make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
@@ -532,18 +539,18 @@ Building the program with BLAS support may lead to some performance improvements
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
- | Option | Legal values | Default | Description |
- |-------------------------|------------------------|---------|-------------|
- | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
- | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+ | Option | Legal values | Default | Description |
+ |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+ | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+ | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+ | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### CLBlast
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+ - For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
@@ -568,6 +575,12 @@ Building the program with BLAS support may lead to some performance improvements
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
+ Linux packaging:
+ Fedora Linux:
+ ```bash
+ sudo dnf install clblast
+ ```
+
Alternatively, they may be built from source.
-
@@ -744,11 +757,11 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
| Model | Original size | Quantized size (Q4_0) |
-|------:|--------------:|-----------------------:|
-| 7B | 13 GB | 3.9 GB |
-| 13B | 24 GB | 7.8 GB |
-| 30B | 60 GB | 19.5 GB |
-| 65B | 120 GB | 38.5 GB |
+|------:|--------------:|----------------------:|
+| 7B | 13 GB | 3.9 GB |
+| 13B | 24 GB | 7.8 GB |
+| 30B | 60 GB | 19.5 GB |
+| 65B | 120 GB | 38.5 GB |
### Quantization
@@ -756,7 +769,7 @@ Several quantization methods are supported. They differ in the resulting model d
*(outdated)*
-| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
+| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
diff --git a/SECURITY.md b/SECURITY.md
index 14504b1bf..f4322c6ee 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -49,11 +49,11 @@ If you intend to run multiple models in parallel with shared memory, it is your
1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.
-1. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
+2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
-1. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
+3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
-1. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
+4. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side-channel attacks on GPUs are possible, which can leak data from other models or processes running on the same system at the same time.
## Reporting a vulnerability
diff --git a/build.zig b/build.zig
index 7f36e5968..96783574f 100644
--- a/build.zig
+++ b/build.zig
@@ -112,6 +112,7 @@ pub fn build(b: *std.build.Builder) !void {
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
const ggml = make.obj("ggml", "ggml.c");
+ const sgemm = make.obj("sgemm", "sgemm.cpp");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
@@ -128,15 +129,44 @@ pub fn build(b: *std.build.Builder) !void {
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");
- _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
- _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
- _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
- _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
- _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
- _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
+ _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
+ _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+ _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+ _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+ _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
+ _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
- const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
+ const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
+
+ const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
+ for (server_assets) |asset| {
+ const input_path = b.fmt("examples/server/public/{s}", .{asset});
+ const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
+
+ // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
+
+ const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
+ defer b.allocator.free(input);
+
+ var buf = std.ArrayList(u8).init(b.allocator);
+ defer buf.deinit();
+
+ for (input) |byte| {
+ try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
+ }
+
+ var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
+ defer b.allocator.free(name);
+ std.mem.replaceScalar(u8, name, '.', '_');
+
+ try std.fs.cwd().writeFile(output_path, b.fmt(
+ "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
+ .{ name, buf.items, name, input.len },
+ ));
+
+ std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
+ }
}
diff --git a/ci/run.sh b/ci/run.sh
index 19776b5f7..085dfd42f 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -153,6 +153,52 @@ function gg_sum_ctest_release {
gg_printf '```\n'
}
+# test_scripts_debug
+
+function gg_run_test_scripts_debug {
+ cd ${SRC}
+
+ set -e
+
+ (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+
+ set +e
+}
+
+function gg_sum_test_scripts_debug {
+ gg_printf '### %s\n\n' "${ci}"
+
+ gg_printf 'Runs test scripts in debug mode\n'
+ gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+ gg_printf '```\n'
+ gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
+ gg_printf '```\n'
+ gg_printf '\n'
+}
+
+# test_scripts_release
+
+function gg_run_test_scripts_release {
+ cd ${SRC}
+
+ set -e
+
+ (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+
+ set +e
+}
+
+function gg_sum_test_scripts_release {
+ gg_printf '### %s\n\n' "${ci}"
+
+ gg_printf 'Runs test scripts in release mode\n'
+ gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+ gg_printf '```\n'
+ gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
+ gg_printf '```\n'
+ gg_printf '\n'
+}
+
function gg_get_model {
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
@@ -642,6 +688,9 @@ test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
+ test $ret -eq 0 && gg_run test_scripts_debug
+ test $ret -eq 0 && gg_run test_scripts_release
+
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 1d840e5f7..0ec8d6d8d 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -47,9 +47,6 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
-set(TARGET json-schema-to-grammar)
-add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h)
-
set(TARGET common)
add_library(${TARGET} STATIC
@@ -63,6 +60,7 @@ add_library(${TARGET} STATIC
grammar-parser.h
grammar-parser.cpp
json.hpp
+ json-schema-to-grammar.cpp
train.h
train.cpp
ngram-cache.h
diff --git a/common/common.cpp b/common/common.cpp
index 7d983a453..06f252ea6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,4 +1,6 @@
#include "common.h"
+#include "json.hpp"
+#include "json-schema-to-grammar.h"
#include "llama.h"
#include <algorithm>
@@ -68,6 +70,8 @@
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
#endif // LLAMA_USE_CURL
+using json = nlohmann::ordered_json;
+
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
@@ -104,6 +108,79 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#include <pthread.h>
+
+static void cpuid(unsigned leaf, unsigned subleaf,
+ unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
+ __asm__("movq\t%%rbx,%%rsi\n\t"
+ "cpuid\n\t"
+ "xchgq\t%%rbx,%%rsi"
+ : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
+ : "0"(leaf), "2"(subleaf));
+}
+
+static int pin_cpu(int cpu) {
+ cpu_set_t mask;
+ CPU_ZERO(&mask);
+ CPU_SET(cpu, &mask);
+ return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
+}
+
+static bool is_hybrid_cpu(void) {
+ unsigned eax, ebx, ecx, edx;
+ cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+ return !!(edx & (1u << 15));
+}
+
+static bool is_running_on_efficiency_core(void) {
+ unsigned eax, ebx, ecx, edx;
+ cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
+ int intel_atom = 0x20;
+ int core_type = (eax & 0xff000000u) >> 24;
+ return core_type == intel_atom;
+}
+
+static int count_math_cpus(int cpu_count) {
+ int result = 0;
+ for (int cpu = 0; cpu < cpu_count; ++cpu) {
+ if (pin_cpu(cpu)) {
+ return -1;
+ }
+ if (is_running_on_efficiency_core()) {
+ continue; // efficiency cores harm lockstep threading
+ }
+ ++cpu; // hyperthreading isn't useful for linear algebra
+ ++result;
+ }
+ return result;
+}
+
+#endif // __x86_64__ && __linux__
+
+/**
+ * Returns the number of CPUs on the system that are useful for math.
+ */
+int get_math_cpu_count() {
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+ int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
+ if (cpu_count < 1) {
+ return get_num_physical_cores();
+ }
+ if (is_hybrid_cpu()) {
+ cpu_set_t affinity;
+ if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
+ int result = count_math_cpus(cpu_count);
+ pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
+ if (result > 0) {
+ return result;
+ }
+ }
+ }
+#endif
+ return get_num_physical_cores();
+}
+
void process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
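The intent of the new helper, in rough terms: on hybrid Intel CPUs (hybrid flag in CPUID leaf 7, EDX bit 15), only performance cores are counted and SMT siblings are skipped, so the default thread count matches the cores that are actually useful for matrix math. A minimal usage sketch, assuming an x86-64 Linux host; the core counts mentioned in the comments are illustrative:

```cpp
// Sketch: how the new default interacts with the existing helper.
// On e.g. an i7-1260P (4 P-cores + 8 E-cores), get_math_cpu_count() is
// expected to return roughly the number of P-cores, while
// get_num_physical_cores() still reports all physical cores.
#include "common.h"

#include <cstdio>

int main() {
    gpt_params params; // n_threads now defaults to get_math_cpu_count()
    std::printf("default n_threads : %d\n", params.n_threads);
    std::printf("physical cores    : %d\n", (int) get_num_physical_cores());
    return 0;
}
```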
@@ -1148,6 +1225,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
);
return true;
}
+ if (arg == "-j" || arg == "--json-schema") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
+ return true;
+ }
if (arg == "--override-kv") {
if (++i >= argc) {
invalid_param = true;
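What the new `-j`/`--json-schema` flag does internally is a one-liner over the existing converter; a standalone sketch, assuming the `json_schema_to_grammar(const nlohmann::ordered_json &)` signature from `common/json-schema-to-grammar.h`:

```cpp
// Sketch of the -j / --json-schema code path: parse the schema, convert it to
// a GBNF grammar string, and hand that string to the sampler as sparams.grammar.
#include "json-schema-to-grammar.h"
#include "json.hpp"

#include <iostream>

int main() {
    const auto schema = nlohmann::ordered_json::parse(
        R"({"type": "object", "properties": {"name": {"type": "string"}}, "required": ["name"]})");
    std::cout << json_schema_to_grammar(schema) << std::endl;
    return 0;
}
```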
@@ -1353,6 +1438,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
printf(" --grammar-file FNAME file to read grammar from\n");
+ printf(" -j SCHEMA, --json-schema SCHEMA\n");
+ printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
+ printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
printf(" --cfg-negative-prompt PROMPT\n");
printf(" negative prompt to use for guidance. (default: empty)\n");
printf(" --cfg-negative-prompt-file FNAME\n");
@@ -1745,6 +1833,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
cparams.defrag_thold = params.defrag_thold;
+ cparams.cb_eval = params.cb_eval;
+ cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@@ -2192,7 +2282,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
}
- {
+ if (params.warmup) {
LOG("warming up the model with an empty run\n");
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
@@ -2212,23 +2302,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
- bool add_bos,
- bool special) {
- return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
+ bool add_special,
+ bool parse_special) {
+ return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
}
std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
- bool add_bos,
- bool special) {
+ bool add_special,
+ bool parse_special) {
// upper limit for the number of tokens
- int n_tokens = text.length() + add_bos;
+ int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
- n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+ n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens < 0) {
result.resize(-n_tokens);
- int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+ int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
@@ -2238,10 +2328,10 @@ std::vector<llama_token> llama_tokenize(
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
if (n_tokens < 0) {
result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
diff --git a/common/common.h b/common/common.h
index 4635e05d6..cca44268e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -39,6 +39,7 @@ extern char const *LLAMA_BUILD_TARGET;
struct llama_control_vector_load_info;
+int get_math_cpu_count();
int32_t get_num_physical_cores();
//
@@ -48,7 +49,7 @@ int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
- int32_t n_threads = get_num_physical_cores();
+ int32_t n_threads = get_math_cpu_count();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
@@ -80,6 +81,9 @@ struct gpt_params {
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
+ ggml_backend_sched_eval_callback cb_eval = nullptr;
+ void * cb_eval_user_data = nullptr;
+
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
@@ -156,6 +160,7 @@ struct gpt_params {
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
+ bool warmup = true; // warmup run
std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V
@@ -223,14 +228,14 @@ void llama_batch_add(
std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
- bool add_bos,
- bool special = false);
+ bool add_special,
+ bool parse_special = false);
std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
- bool add_bos,
- bool special = false);
+ bool add_special,
+ bool parse_special = false);
// tokenizes a token into a piece
// should work similar to Python's `tokenizer.id_to_piece`
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 0e4680346..0f8f1b1d4 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -11,35 +11,101 @@
using json = nlohmann::ordered_json;
+template <typename Iterator>
+static std::string join(Iterator begin, Iterator end, const std::string & separator);
+
+static std::string repeat(const std::string & str, size_t n);
+
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
+ if (separator_rule.empty()) {
+ if (min_items == 0 && max_items == 1) {
+ return item_rule + "?";
+ } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
+ return item_rule + "+";
+ }
+ }
+
+ std::string result;
+ if (min_items > 0) {
+ if (item_rule_is_literal && separator_rule.empty()) {
+ result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
+ } else {
+ std::vector<std::string> items(min_items, item_rule);
+ result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
+ }
+ }
+
+ std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
+ auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
+
+ if (up_to_n == 0) {
+ return "";
+ } else if (up_to_n == 1) {
+ return "(" + content + ")?";
+ } else if (!separator_rule.empty() && !prefix_with_sep) {
+ return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
+ } else {
+ std::string res = repeat("(" + content + " ", up_to_n);
+ // strip trailing space
+ res = res.substr(0, res.length() - 1);
+ res += repeat(")?", up_to_n);
+ return res;
+ }
+ };
+
+ if (min_items > 0 && max_items != min_items) {
+ result += " ";
+ }
+
+ if (max_items != std::numeric_limits<int>::max()) {
+ result += opt_repetitions(max_items - min_items, min_items > 0);
+ } else {
+ std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
+ if (min_items == 0 && !separator_rule.empty()) {
+ result = "(" + item_rule + " " + item_operator + "*)?";
+ } else {
+ result += item_operator + "*";
+ }
+ }
+
+ return result;
+}
+
const std::string SPACE_RULE = "\" \"?";
-std::unordered_map<std::string, std::string> PRIMITIVE_RULES = {
- {"boolean", "(\"true\" | \"false\") space"},
- {"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"},
- {"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
- {"value", "object | array | string | number | boolean"},
- {"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
- {"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
- {"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
- {"string", " \"\\\"\" (\n"
- " [^\"\\\\] |\n"
- " \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
- " )* \"\\\"\" space"},
- {"null", "\"null\" space"}
+struct BuiltinRule {
+ std::string content;
+ std::vector<std::string> deps;
};
-std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};
-std::unordered_map<std::string, std::string> DATE_RULES = {
- {"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
- {"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"},
- {"date-time", "date \"T\" time"},
- {"date-string", "\"\\\"\" date \"\\\"\" space"},
- {"time-string", "\"\\\"\" time \"\\\"\" space"},
- {"date-time-string", "\"\\\"\" date-time \"\\\"\" space"}
+const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
+
+std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
+ {"boolean", {"(\"true\" | \"false\") space", {}}},
+ {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
+ {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
+ {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
+ {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
+ {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
+ {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
+ {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
+ {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+ "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+ "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+ "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+ "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
+ {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
+ {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
+ {"null", {"\"null\" space", {}}},
+};
+
+std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
+ {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+ {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+ {"date-time", {"date \"T\" time", {"date", "time"}}},
+ {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
+ {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
+ {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
};
static bool is_reserved_name(const std::string & name) {
@@ -47,7 +113,7 @@ static bool is_reserved_name(const std::string & name) {
if (RESERVED_NAMES.empty()) {
RESERVED_NAMES.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
- for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first);
+ for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
}
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
}
@@ -192,7 +258,7 @@ private:
if (_dotall) {
rule = "[\\U00000000-\\U0010FFFF]";
} else {
- rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]";
+ rule = "[^\\x0A\\x0D]";
}
return _add_rule("dot", rule);
};
@@ -308,47 +374,21 @@ private:
auto &sub = last.first;
auto sub_is_literal = last.second;
- if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
- sub += "*";
- } else if (min_times == 0 && max_times == 1) {
- sub += "?";
- } else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
- sub += "+";
- } else {
- if (!sub_is_literal) {
- std::string & sub_id = sub_rule_ids[sub];
- if (sub_id.empty()) {
- sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
- }
- sub = sub_id;
+ if (!sub_is_literal) {
+ std::string & sub_id = sub_rule_ids[sub];
+ if (sub_id.empty()) {
+ sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
}
- std::string result;
- if (sub_is_literal && min_times > 0) {
- result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\"";
- } else {
- for (int j = 0; j < min_times; j++) {
- if (j > 0) {
- result += " ";
- }
- result += sub;
- }
- }
- if (min_times > 0 && min_times < max_times) {
- result += " ";
- }
- if (max_times == std::numeric_limits<int>::max()) {
- result += sub + "*";
- } else {
- for (int j = min_times; j < max_times; j++) {
- if (j > min_times) {
- result += " ";
- }
- result += sub + "?";
- }
- }
- seq.back().first = result;
- seq.back().second = false;
+ sub = sub_id;
}
+ seq.back().first = build_repetition(
+ sub_is_literal ? "\"" + sub + "\"" : sub,
+ min_times,
+ max_times,
+ "",
+ sub_is_literal
+ );
+ seq.back().second = false;
} else {
std::string literal;
auto is_non_literal = [&](char c) {
@@ -424,7 +464,7 @@ private:
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
- std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
+ std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
prop_kv_rule_names["*"] = kv_rule;
optional_props.push_back("*");
}
@@ -486,6 +526,25 @@ private:
return rule;
}
+ std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
+ auto n = _add_rule(name, rule.content);
+ for (const auto & dep : rule.deps) {
+ BuiltinRule dep_rule;
+ auto it = PRIMITIVE_RULES.find(dep);
+ if (it == PRIMITIVE_RULES.end()) {
+ it = STRING_FORMAT_RULES.find(dep);
+ if (it == STRING_FORMAT_RULES.end()) {
+ _errors.push_back("Rule " + dep + " not known");
+ continue;
+ }
+ }
+ if (_rules.find(dep) == _rules.end()) {
+ _add_primitive(dep, it->second);
+ }
+ }
+ return n;
+ }
+
public:
SchemaConverter(
const std::function & fetch_json,
@@ -647,49 +706,33 @@ public:
return _add_rule(rule_name, rule);
} else {
std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
- std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
- std::string successive_items;
int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
- int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1;
- if (min_items > 0) {
- successive_items += repeat(list_item_operator, min_items - 1);
- min_items--;
- }
- if (max_items >= 0 && max_items > min_items) {
- successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
- } else {
- successive_items += list_item_operator + "*";
- }
- std::string rule;
- if (min_items == 0) {
- rule = "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
- } else {
- rule = "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
- }
- return _add_rule(rule_name, rule);
+ int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
+
+ return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
}
} else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
return _visit_pattern(schema["pattern"], rule_name);
} else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
- return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
- } else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) {
- for (const auto & kv : DATE_RULES) {
- _add_rule(kv.first, kv.second);
- }
- return schema_format + "-string";
+ return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
+ } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
+ auto prim_name = schema_format + "-string";
+ return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
+ } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
+ std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
+ int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
+ int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
+ return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
} else if (schema.empty() || schema_type == "object") {
- for (const auto & n : OBJECT_RULE_NAMES) {
- _add_rule(n, PRIMITIVE_RULES.at(n));
- }
- return _add_rule(rule_name, "object");
+ return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
} else {
if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
_errors.push_back("Unrecognized schema: " + schema.dump());
return "";
}
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
- return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
+ return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
}
}
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 2cce4c2de..f4a758aaa 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -43,18 +43,18 @@ AnyModel = TypeVar("AnyModel", bound="type[Model]")
class Model(ABC):
_model_classes: dict[str, type[Model]] = {}
- def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
+ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
self.dir_model = dir_model
self.ftype = ftype
self.fname_out = fname_out
self.is_big_endian = is_big_endian
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
+ self.use_temp_file = use_temp_file
self.is_safetensors = self._is_model_safetensors()
self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
self.part_names = self._get_part_names()
self.hparams = Model.load_hparams(self.dir_model)
- self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess,
- use_temp_file=False)
+ self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
@property
@@ -232,15 +232,14 @@ class Model(ABC):
return ("pytorch_model.bin",)
return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
- def _set_vocab_gpt2(self):
- dir_model = self.dir_model
- hparams = self.hparams
+ # used for GPT-2 BPE and WordPiece vocabs
+ def get_basic_vocab(self) -> tuple[list[str], list[int]]:
tokens: list[str] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
- tokenizer = AutoTokenizer.from_pretrained(dir_model)
- vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+ tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+ vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
@@ -260,11 +259,15 @@ class Model(ABC):
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
+ return tokens, toktypes
+
+ def _set_vocab_gpt2(self) -> None:
+ tokens, toktypes = self.get_basic_vocab()
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
- special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_qwen(self):
@@ -1209,10 +1212,91 @@ class StableLMModel(Model):
self.gguf_writer.add_rope_dimension_count(
int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
- self.gguf_writer.add_parallel_residual(
- hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
+ self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+ self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
+ def write_tensors(self):
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+ n_head = self.hparams.get("num_attention_heads")
+ n_kv_head = self.hparams.get("num_key_value_heads")
+ q_norms = dict()
+ k_norms = dict()
+ for name, data_torch in self.get_tensors():
+ # we don't need these
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+ continue
+
+ old_dtype = data_torch.dtype
+
+ # convert any unsupported data types to float32
+ if data_torch.dtype not in (torch.float16, torch.float32):
+ data_torch = data_torch.to(torch.float32)
+
+ data = data_torch.squeeze().numpy()
+ n_dims = len(data.shape)
+ if name.find("q_layernorm.norms") != -1:
+ q_norms[name] = data
+ if len(q_norms) >= (block_count * n_head):
+ self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
+ continue
+ if name.find("k_layernorm.norms") != -1:
+ k_norms[name] = data
+ if len(k_norms) >= (block_count * n_kv_head):
+ self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
+ continue
+
+ # map tensor names
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # if f32 desired, convert any float16 to float32
+ if self.ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+ if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+
+ def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
+ for bid in range(block_count):
+ datas = []
+ for xid in range(n_head):
+ ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
+ datas.append(norms[ename])
+ del norms[ename]
+ data = np.stack(datas, axis=0)
+ data_dtype = data.dtype
+ merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
+ new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+ if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+
@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
class LlamaModel(Model):
@@ -1222,7 +1306,23 @@ class LlamaModel(Model):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
- self._set_vocab_llama_hf()
+ try:
+ self._set_vocab_llama_hf()
+ except (FileNotFoundError, TypeError):
+ # Llama 3
+ self._set_vocab_gpt2()
+
+ # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
+ if self.hparams.get("vocab_size", 32000) == 32016:
+ special_vocab = gguf.SpecialVocab(
+ self.dir_model, load_merges=False,
+ special_token_types = ['prefix', 'suffix', 'middle', 'eot']
+ )
+ special_vocab._set_special_token("prefix", 32007)
+ special_vocab._set_special_token("suffix", 32008)
+ special_vocab._set_special_token("middle", 32009)
+ special_vocab._set_special_token("eot", 32010)
+ special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
super().set_gguf_parameters()
@@ -1431,6 +1531,102 @@ class GrokModel(Model):
self.gguf_writer.add_tensor(new_name, data)
+@Model.register("DbrxForCausalLM")
+class DbrxModel(Model):
+ model_arch = gguf.MODEL_ARCH.DBRX
+
+ def set_gguf_parameters(self):
+ ffn_config = self.hparams["ffn_config"]
+ attn_config = self.hparams["attn_config"]
+ self.gguf_writer.add_name(self.hparams["model_type"])
+ self.gguf_writer.add_block_count(self.hparams["n_layers"])
+
+ self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+ self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
+
+ self.gguf_writer.add_head_count(self.hparams["n_heads"])
+ self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
+
+ self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
+
+ self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
+ self.gguf_writer.add_file_type(self.ftype)
+
+ self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
+ self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
+
+ self.gguf_writer.add_layer_norm_eps(1e-5)
+
+ self.gguf_writer.add_file_type(self.ftype)
+ print(f"gguf: file type = {self.ftype}")
+
+ def write_tensors(self):
+ block_count = self.hparams.get("n_layers")
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+ for name, data_torch in self.get_tensors():
+ n_expert = self.hparams["ffn_config"]["moe_num_experts"]
+ n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
+ n_embd = self.hparams["d_model"]
+
+ # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
+ # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
+ # But llama.cpp moe graph works differently
+ # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
+ # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
+ exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
+ "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
+ "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
+ experts = False
+ for exp_tensor_name in exp_tensor_names.keys():
+ if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
+ experts = True
+ data_torch = data_torch.view(n_expert, n_ff, n_embd)
+ if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
+ data_torch = data_torch.permute(*permute_tensor)
+ break
+
+ old_dtype = data_torch.dtype
+
+ # convert any unsupported data types to float32
+ if data_torch.dtype not in (torch.float16, torch.float32):
+ data_torch = data_torch.to(torch.float32)
+
+ data = data_torch.squeeze().numpy()
+
+ # map tensor names
+ # In MoE models the ffn tensors are typically most of the model weights,
+ # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
+ # Every other model has the weight names ending in .weight,
+ # let's assume that is the convention which is not the case for dbrx:
+ # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
+ new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # Most of the codebase that takes in 1D tensors only handles F32 tensors
+ # and most of the outputs tensors are F32.
+ if data_dtype != np.float32 and n_dims == 1:
+ print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
+ sys.exit()
+
+ # if f32 desired, convert any float16 to float32
+ if self.ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
+ data = data.astype(np.float16)
+
+ print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+
+
@Model.register("MiniCPMForCausalLM")
class MiniCPMModel(Model):
model_arch = gguf.MODEL_ARCH.MINICPM
@@ -1599,6 +1795,105 @@ class Qwen2Model(Model):
model_arch = gguf.MODEL_ARCH.QWEN2
+@Model.register("Qwen2MoeForCausalLM")
+class Qwen2MoeModel(Model):
+ model_arch = gguf.MODEL_ARCH.QWEN2MOE
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ if (n_experts := self.hparams.get("num_experts")) is not None:
+ self.gguf_writer.add_expert_count(n_experts)
+
+ def write_tensors(self):
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+ n_experts = self.hparams.get("num_experts")
+ experts = dict()
+ for name, data_torch in self.get_tensors():
+ # we don't need these
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+ continue
+
+ old_dtype = data_torch.dtype
+
+ # convert any unsupported data types to float32
+ if data_torch.dtype not in (torch.float16, torch.float32):
+ data_torch = data_torch.to(torch.float32)
+
+ data = data_torch.squeeze().numpy()
+
+ # process the experts separately
+ if name.find("experts") != -1:
+ experts[name] = data
+ if len(experts) >= n_experts * 3:
+ # merge the experts into a single 3d tensor
+ for bid in range(block_count):
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
+ full = True
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+ if ename not in experts:
+ full = False
+ break
+ if not full:
+ continue
+
+ datas = []
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+ datas.append(experts[ename])
+ del experts[ename]
+
+ data = np.stack(datas, axis=0)
+ data_dtype = data.dtype
+
+ if self.ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ if self.ftype == 1 and data_dtype == np.float32:
+ data = data.astype(np.float16)
+
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+ new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+
+ print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+ continue
+
+ # map tensor names
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # if f32 desired, convert any float16 to float32
+ if self.ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+ if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts.keys()}")
+
+
@Model.register("GPT2LMHeadModel")
class GPT2Model(Model):
model_arch = gguf.MODEL_ARCH.GPT2
@@ -1913,6 +2208,8 @@ class InternLM2Model(Model):
old_eos = special_vocab.special_token_ids["eos"]
if "chat" in os.path.basename(self.dir_model.absolute()):
# For the chat model, we replace the eos with '<|im_end|>'.
+ # TODO: this is a hack, should be fixed
+ # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
in chat mode so that the conversation can end normally.")
@@ -2052,35 +2349,25 @@ class BertModel(Model):
self.gguf_writer.add_pooling_type(pooling_type)
def set_vocab(self):
- # use huggingface vocab to get all tokens
- vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
- tokens, scores, toktypes = zip(*vocab.all_tokens())
- assert len(tokens) == vocab.vocab_size
- self.vocab_size = vocab.vocab_size
+ tokens, toktypes = self.get_basic_vocab()
+ self.vocab_size = len(tokens)
# we need this to validate the size of the token_type embeddings
# though currently we are passing all zeros to the token_type embeddings
- n_token_types = len(set(toktypes))
- self.gguf_writer.add_token_type_count(n_token_types)
+ self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
# convert to phantom space vocab
- def phantom(tok, typ):
- if tok.startswith(b"[") and tok.endswith(b"]"):
+ def phantom(tok):
+ if tok.startswith("[") and tok.endswith("]"):
return tok
- if tok.startswith(b"##"):
+ if tok.startswith("##"):
return tok[2:]
- return b"\xe2\x96\x81" + tok
-
- tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
-
- # set up bos and eos tokens (cls and sep)
- self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
- self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
+ return "\u2581" + tok
+ tokens = list(map(phantom, tokens))
# add vocab to gguf
self.gguf_writer.add_tokenizer_model("bert")
self.gguf_writer.add_token_list(tokens)
- self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
# handle special tokens
@@ -2153,16 +2440,6 @@ class NomicBertModel(BertModel):
super().set_gguf_parameters()
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
- def get_tensors(self):
- assert self.vocab_size is not None
- for name, data in super().get_tensors():
- # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
- if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
- rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
- assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
- data = data[:self.vocab_size, :]
- yield name, data
-
@Model.register("JinaBertModel")
class JinaBertModel(BertModel):
@@ -2196,6 +2473,16 @@ class GemmaModel(Model):
def set_vocab(self):
self._set_vocab_sentencepiece()
+ # TODO: these special tokens should be exported only for the CodeGemma family
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+ special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
+ special_vocab._set_special_token("prefix", 67)
+ special_vocab._set_special_token("suffix", 69)
+ special_vocab._set_special_token("middle", 68)
+ special_vocab._set_special_token("fsep", 70)
+ special_vocab._set_special_token("eot", 107)
+ special_vocab.add_to_gguf(self.gguf_writer)
+
def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
@@ -2218,6 +2505,12 @@ class GemmaModel(Model):
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors():
+ # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
+ # To prevent errors, skip loading lm_head.weight.
+ if name == "lm_head.weight":
+ print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+ continue
+
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
@@ -2277,28 +2570,34 @@ class MambaModel(Model):
field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
def set_gguf_parameters(self):
- d_model = self.find_hparam(["hidden_size", "d_model"])
- d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+ d_model = self.find_hparam(["hidden_size", "d_model"])
+ d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
- d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
+ d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
# ceiling division
# ref: https://stackoverflow.com/a/17511341/22827863
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
- dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
+ dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
# Fail early for models which don't have a block expansion factor of 2
@@ -2364,8 +2663,8 @@ class MambaModel(Model):
data = data.astype(np.float32)
# if f16 desired, convert big float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith(
- (".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+ new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
+ if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
@@ -2390,6 +2689,66 @@ class CommandR2Model(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+@Model.register("OlmoForCausalLM")
+@Model.register("OLMoForCausalLM")
+class OlmoModel(Model):
+ model_arch = gguf.MODEL_ARCH.OLMO
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self.gguf_writer.add_layer_norm_eps(1e-5)
+ if "clip_qkv" in self.hparams is not None:
+ self.gguf_writer.add_clamp_kqv(self.hparams["clip_qkv"])
+
+ # Same as super class, but permuting q_proj, k_proj
+ # Copied from: LlamaModel
+ def write_tensors(self):
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+ n_head = self.hparams.get("num_attention_heads")
+ n_kv_head = self.hparams.get("num_key_value_heads")
+ for name, data_torch in self.get_tensors():
+ old_dtype = data_torch.dtype
+
+ # convert any unsupported data types to float32
+ if data_torch.dtype not in (torch.float16, torch.float32):
+ data_torch = data_torch.to(torch.float32)
+
+ data = data_torch.numpy()
+
+ if name.endswith("q_proj.weight"):
+ data = permute(data, n_head, n_head)
+ if name.endswith("k_proj.weight"):
+ data = permute(data, n_head, n_kv_head)
+
+ data = data.squeeze()
+
+ # map tensor names
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # if f32 desired, convert any float16 to float32
+ if self.ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # 1d tensors need to be converted to float32
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+
+
###### CONVERSION LOGIC ######
@@ -2416,6 +2775,7 @@ def parse_args() -> argparse.Namespace:
"model", type=Path,
help="directory containing model file",
)
+ parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
return parser.parse_args()
@@ -2458,7 +2818,7 @@ def main() -> None:
hparams = Model.load_hparams(dir_model)
with torch.inference_mode():
model_class = Model.from_model_architecture(hparams["architectures"][0])
- model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
+ model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
print("Set model parameters")
model_instance.set_gguf_parameters()
diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py
index ccb99279e..69be17f94 100755
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -1,4 +1,6 @@
#!/usr/bin/env python3
+from __future__ import annotations
+
import argparse
import os
import sys
diff --git a/convert.py b/convert.py
index a37aeb5e5..1c700cf6a 100755
--- a/convert.py
+++ b/convert.py
@@ -33,7 +33,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
import gguf
if TYPE_CHECKING:
- from typing import TypeAlias
+ from typing_extensions import Self, TypeAlias
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
faulthandler.register(signal.SIGUSR1)
@@ -517,7 +517,7 @@ class LlamaHfVocab(Vocab):
tokenizer_model = "llama"
name = "hfft"
- def __init__(self, base_path: Path, ignore_nonllama: bool = False):
+ def __init__(self, base_path: Path):
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding='utf-8') as f:
@@ -525,9 +525,14 @@ class LlamaHfVocab(Vocab):
# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
- if ignore_nonllama:
- pass # workaround incorrect use of this class for WordPiece
- elif (
+ is_llama3 = (
+ tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+ and not tokenizer_model.get('byte_fallback', True)
+ )
+ if is_llama3:
+ raise TypeError('Llama 3 must be converted with BpeVocab')
+
+ if not is_llama3 and (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
):
@@ -647,16 +652,17 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
class Tensor(ABC):
+ ndarray: NDArray
data_type: DataType
@abstractmethod
- def astype(self, data_type: DataType) -> Tensor: ...
+ def astype(self, data_type: DataType) -> Self: ...
@abstractmethod
- def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
+ def permute(self, n_head: int, n_head_kv: int) -> Self: ...
@abstractmethod
- def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
+ def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
@abstractmethod
- def part(self, n_part: int) -> UnquantizedTensor: ...
+ def part(self, n_part: int) -> Self: ...
@abstractmethod
def to_ggml(self) -> GGMLCompatibleTensor: ...
@@ -673,13 +679,13 @@ class UnquantizedTensor(Tensor):
self.ndarray = ndarray
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
- def astype(self, data_type: DataType) -> Tensor:
+ def astype(self, data_type: DataType) -> UnquantizedTensor:
dtype = data_type.dtype
if self.data_type == DT_BF16:
self.ndarray = bf16_to_fp32(self.ndarray)
return UnquantizedTensor(self.ndarray.astype(dtype))
- def to_ggml(self) -> UnquantizedTensor:
+ def to_ggml(self) -> Self:
return self
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
@@ -1351,7 +1357,7 @@ def load_some_model(path: Path) -> ModelPlus:
# Be extra-friendly and accept either a file or a directory:
if path.is_dir():
# Check if it's a set of safetensors files first
- globs = ["model-00001-of-*.safetensors", "model.safetensors"]
+ globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
files = [file for glob in globs for file in path.glob(glob)]
if not files:
# Try the PyTorch patterns too, with lower priority
diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md
new file mode 100644
index 000000000..a56b78344
--- /dev/null
+++ b/docs/HOWTO-add-model.md
@@ -0,0 +1,119 @@
+## Add a new model architecture to `llama.cpp`
+
+Adding a model requires a few steps:
+
+1. Convert the model to GGUF
+2. Define the model architecture in `llama.cpp`
+3. Build the GGML graph implementation
+
+After following these steps, you can open a PR.
+
+Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
+- [main](../examples/main)
+- [imatrix](../examples/imatrix)
+- [quantize](../examples/quantize)
+- [server](../examples/server)
+
+### 1. Convert the model to GGUF
+
+This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
+Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
+
+The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
+
+The required steps to implement for an HF model are:
+
+1. Register the model with the `Model.register` decorator on a new `Model` subclass, for example:
+
+```python
+@Model.register("MyModelForCausalLM")
+class MyModel(Model):
+ model_arch = gguf.MODEL_ARCH.GROK
+```
+
+2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
+
+Add an enum entry in `MODEL_ARCH`, the model's human-friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
+
+Example for `falcon` model:
+```python
+ MODEL_ARCH.FALCON: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_NORM_2,
+ MODEL_TENSOR.ATTN_QKV,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ]
+```
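+
+The matching enum entry and human-friendly name live in the same file. Roughly (a sketch, not the literal upstream definitions, which carry many more architectures):
+
+```python
+class MODEL_ARCH(IntEnum):
+    # ... existing architectures ...
+    FALCON = auto()
+
+MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
+    # ... existing entries ...
+    MODEL_ARCH.FALCON: "falcon",
+}
+```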
+
+3. Map the original tensor names to the standardized equivalent in GGUF
+
+As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
+
+Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
+
+If the tensor name is part of a repeating layer/block, the `{bid}` placeholder substitutes the block id in the mapping.
+
+Example for the normalization tensor in attention layers:
+
+```python
+block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
+ # Attention norm
+ MODEL_TENSOR.ATTN_NORM: (
+ "gpt_neox.layers.{bid}.input_layernorm", # gptneox
+ "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen
+ "transformer.blocks.{bid}.norm_1", # mpt
+ ...
+ )
+}
+```
+
+`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
+
+Depending on the model configuration, tokenizer, code and tensor layout, you will have to override (see the sketch after this list):
+- `Model#set_gguf_parameters`
+- `Model#set_vocab`
+- `Model#write_tensors`
+
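+A minimal sketch of such a subclass, assuming a hypothetical `MyModelForCausalLM` architecture whose `config.json` uses the common Hugging Face key names (the exact keys, writer calls and `MODEL_ARCH` entry depend on the model):
+
+```python
+@Model.register("MyModelForCausalLM")
+class MyModel(Model):
+    model_arch = gguf.MODEL_ARCH.LLAMA  # replace with the new MODEL_ARCH entry
+
+    def set_vocab(self):
+        # pick the helper matching the tokenizer, e.g. _set_vocab_sentencepiece() for sentencepiece models
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_block_count(hparams["num_hidden_layers"])
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_file_type(self.ftype)
+```
+
+If the default `Model.write_tensors` mapping already covers the architecture, it does not need to be overridden.
+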
+NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools like `quantize` expect this suffix when processing the weights.
+
+### 2. Define the model architecture in `llama.cpp`
+
+The model params and tensors layout must be defined in `llama.cpp`:
+1. Define a new `llm_arch`
+2. Define the tensors layout in `LLM_TENSOR_NAMES`
+3. Add any non-standard metadata in `llm_load_hparams`
+4. Create the tensors for inference in `llm_load_tensors`
+5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+
+NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
+
+### 3. Build the GGML graph implementation
+
+This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
+
+Have a look at existing implementations such as `build_llama`, `build_dbrx` or `build_bert`.
+
+When implementing a new graph, please note that the underlying `ggml` backends might not support all of the required operations; support for missing backend operations can be added in another PR.
+
+Note: to debug the inference graph, you can use [eval-callback](../examples/eval-callback).
+
+## GGUF specification
+
+https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
+
+## Resources
+
+- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
+- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
+- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
+- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
+- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
+- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
+- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
+- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
+- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 76496bf06..f421769cc 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -19,6 +19,7 @@ else()
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
+ add_subdirectory(eval-callback)
add_subdirectory(finetune)
add_subdirectory(gritlm)
add_subdirectory(gguf-split)
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index d75c503d5..dbbd06da5 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -153,7 +153,7 @@ while n_cur <= n_len {
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished
- if new_token_id == llama_token_eos(model) || n_cur == n_len {
+ if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {
@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
var result = [CChar](repeating: 0, count: 8)
- let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
+ let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
if nTokens < 0 {
let actualTokensCount = -Int(nTokens)
result = .init(repeating: 0, count: actualTokensCount)
@@ -237,7 +237,8 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
model,
token,
&result,
- Int32(result.count)
+ Int32(result.count),
+ false
)
assert(check == actualTokensCount)
} else {
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 7aaf63ceb..be30d20bf 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -191,8 +191,8 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
- // is it an end of stream? -> mark the stream as finished
- if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+ // is it an end of generation? -> mark the stream as finished
+ if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
i_batch[i] = -1;
LOG_TEE("\n");
if (n_parallel > 1) {
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
index 866c6d7a6..3d34378a5 100644
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -47,7 +47,7 @@ struct beam_search_callback_data {
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
- return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
+ return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
}
// Function matching type llama_beam_search_callback_fn_t.
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 536657526..6a93147d7 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -123,10 +123,10 @@ int main(int argc, char ** argv) {
inputs.push_back(inp);
}
- // add eos if not present
+ // add SEP if not present
for (auto & inp : inputs) {
- if (inp.empty() || inp.back() != llama_token_eos(model)) {
- inp.push_back(llama_token_eos(model));
+ if (inp.empty() || inp.back() != llama_token_sep(model)) {
+ inp.push_back(llama_token_sep(model));
}
}
diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt
new file mode 100644
index 000000000..c56ba780b
--- /dev/null
+++ b/examples/eval-callback/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(TARGET eval-callback)
+add_executable(${TARGET} eval-callback.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TEST_TARGET test-eval-callback)
+add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md
new file mode 100644
index 000000000..66a37e878
--- /dev/null
+++ b/examples/eval-callback/README.md
@@ -0,0 +1,95 @@
+# llama.cpp/examples/eval-callback
+
+A simple example which demonstrates how to use a callback during inference.
+It simply prints all operations and tensor data to the console.
+
+Usage:
+
+```shell
+eval-callback \
+ --hf-repo ggml-org/models \
+ --hf-file phi-2/ggml-model-q4_0.gguf \
+ --model phi-2-q4_0.gguf \
+ --prompt hello \
+ --seed 42 \
+ -ngl 33
+```
+
+Will print:
+
+```shell
+llm_load_tensors: offloaded 33/33 layers to GPU
+...
+llama_new_context_with_model: n_ctx = 512
+...
+llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
+llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
+llama_new_context_with_model: graph nodes = 1225
+llama_new_context_with_model: graph splits = 2
+ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
+ [
+ [
+ [ -0.0181, 0.0272, 0.0272, ...],
+ ],
+ ]
+ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+ [
+ [
+ [ -0.6989, 1.0636, 1.0636, ...],
+ ],
+ ]
+ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+ [
+ [
+ [ -0.1800, 0.2817, 0.2632, ...],
+ ],
+ ]
+ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+ [
+ [
+ [ -0.1863, 0.2970, 0.2604, ...],
+ ],
+ ]
+ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
+ [
+ [
+ [ -1.1238, 1.2876, -1.8086, ...],
+ ],
+ ]
+ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+ ]
+ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+ ]
+ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+ ]
+ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ [ -0.3608, 0.5076, -1.8866, ...],
+ [ 1.7643, 0.0273, -2.1065, ...],
+ ...
+ ],
+ ]
+ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ [ -0.3608, 0.5076, -1.8866, ...],
+ [ 1.7643, 0.0273, -2.1065, ...],
+ ...
+ ],
+ ]
+```
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
new file mode 100644
index 000000000..29b5f3b3c
--- /dev/null
+++ b/examples/eval-callback/eval-callback.cpp
@@ -0,0 +1,195 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <cstdio>
+#include <random>
+#include <string>
+#include <tuple>
+#include <vector>
+
+/**
+ * This is the arbitrary data which will be passed to each callback.
+ * Later on we can, for example, add an operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
+ */
+struct callback_data {
+ std::vector<uint8_t> data;
+};
+
+static std::string ggml_ne_string(const ggml_tensor * t) {
+ std::string str;
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+ str += std::to_string(t->ne[i]);
+ if (i + 1 < GGML_MAX_DIMS) {
+ str += ", ";
+ }
+ }
+ return str;
+}
+
+static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+ GGML_ASSERT(n > 0);
+ float sum = 0;
+ for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+ printf(" [\n");
+ for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+ if (i2 == n && ne[2] > 2*n) {
+ printf(" ..., \n");
+ i2 = ne[2] - n;
+ }
+ printf(" [\n");
+ for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+ if (i1 == n && ne[1] > 2*n) {
+ printf(" ..., \n");
+ i1 = ne[1] - n;
+ }
+ printf(" [");
+ for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+ if (i0 == n && ne[0] > 2*n) {
+ printf("..., ");
+ i0 = ne[0] - n;
+ }
+ size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+ float v;
+ if (type == GGML_TYPE_F16) {
+ v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+ } else if (type == GGML_TYPE_F32) {
+ v = *(float *) &data[i];
+ } else if (type == GGML_TYPE_I32) {
+ v = (float) *(int32_t *) &data[i];
+ } else if (type == GGML_TYPE_I16) {
+ v = (float) *(int16_t *) &data[i];
+ } else if (type == GGML_TYPE_I8) {
+ v = (float) *(int8_t *) &data[i];
+ } else {
+ GGML_ASSERT(false);
+ }
+ printf("%12.4f", v);
+ sum += v;
+ if (i0 < ne[0] - 1) printf(", ");
+ }
+ printf("],\n");
+ }
+ printf(" ],\n");
+ }
+ printf(" ]\n");
+ printf(" sum = %f\n", sum);
+ }
+}
+
+/**
+ * GGML operations callback during the graph execution.
+ *
+ * @param t current tensor
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+ * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+ * see ggml_backend_sched_eval_callback
+ * @param user_data user data passed to each callback
+ * @return true to receive data or continue the graph, false otherwise
+ */
+static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
+ auto * cb_data = (callback_data *) user_data;
+
+ const struct ggml_tensor * src0 = t->src[0];
+ const struct ggml_tensor * src1 = t->src[1];
+
+ if (ask) {
+ return true; // Always retrieve data
+ }
+
+ char src1_str[128] = {0};
+ if (src1) {
+ sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
+ }
+
+ printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+ t->name, ggml_type_name(t->type), ggml_op_desc(t),
+ src0->name, ggml_ne_string(src0).c_str(),
+ src1 ? src1_str : "",
+ ggml_ne_string(t).c_str());
+
+
+ // copy the data from the GPU memory if needed
+ const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+
+ if (!is_host) {
+ auto n_bytes = ggml_nbytes(t);
+ cb_data->data.resize(n_bytes);
+ ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+ }
+
+ if (!ggml_is_quantized(t->type)) {
+ uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+ ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+ }
+
+ return true;
+}
+
+static bool run(llama_context * ctx, const gpt_params & params) {
+ const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+ std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+
+ if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+ fprintf(stderr, "%s : failed to eval\n", __func__);
+ return false;
+ }
+
+ return true;
+}
+
+int main(int argc, char ** argv) {
+
+ callback_data cb_data;
+
+ gpt_params params;
+ if (!gpt_params_parse(argc, argv, params)) {
+ return 1;
+ }
+
+ print_build_info();
+
+ std::mt19937 rng(params.seed);
+ if (params.random_prompt) {
+ params.prompt = gpt_random_prompt(rng);
+ }
+
+ llama_backend_init();
+ llama_numa_init(params.numa);
+
+ // pass the callback to the backend scheduler
+ // it will be executed for each node during the graph computation
+ params.cb_eval = ggml_debug;
+ params.cb_eval_user_data = &cb_data;
+ params.warmup = false;
+
+ // init
+ llama_model * model;
+ llama_context * ctx;
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ if (model == nullptr || ctx == nullptr) {
+ fprintf(stderr, "%s : failed to init\n", __func__);
+ return 1;
+ }
+
+ // print system information
+ {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ }
+
+ bool OK = run(ctx, params);
+ if (!OK) {
+ return 1;
+ }
+
+ llama_print_timings(ctx);
+
+ llama_free(ctx);
+ llama_free_model(model);
+
+ llama_backend_free();
+
+ return 0;
+}
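The `callback_data` comment above notes that an operation or tensor-name filter could be added later. As a rough sketch of where such a filter would go (the `ggml_debug_filtered` helper and its prefix argument are hypothetical, not part of this change), the cheap place to filter is the `ask == true` phase, so that data is only ever copied for tensors you care about:

```cpp
#include "ggml.h"

#include <cstring>

// Hypothetical filtering variant of ggml_debug: only request data for
// MUL_MAT results whose name starts with the prefix passed in user_data.
static bool ggml_debug_filtered(struct ggml_tensor * t, bool ask, void * user_data) {
    const char * prefix = (const char *) user_data;

    if (ask) {
        // decide cheaply whether we want this tensor's data at all
        return t->op == GGML_OP_MUL_MAT &&
               strncmp(t->name, prefix, strlen(prefix)) == 0;
    }

    // ask == false: the data of a tensor we asked for is now available;
    // copy/print it here, as ggml_debug does above
    return true; // keep the graph computation going
}
```

It would be wired up the same way as `ggml_debug`, e.g. `params.cb_eval = ggml_debug_filtered;` and `params.cb_eval_user_data = (void *) "Qcur";`.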
diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp
index e4c0c1689..091069ffa 100644
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -17,7 +17,7 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
- grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+ llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
diff --git a/examples/gguf-split/README.md b/examples/gguf-split/README.md
index ddb1f7649..ad1d86651 100644
--- a/examples/gguf-split/README.md
+++ b/examples/gguf-split/README.md
@@ -5,5 +5,6 @@ CLI to split / merge GGUF files.
**Command line options:**
- `--split`: split GGUF to multiple GGUF, default operation.
+- `--split-max-size`: max size per split in `M` or `G`, e.g. `500M` or `2G`.
- `--split-max-tensors`: maximum tensors in each split: default(128)
- `--merge`: merge multiple GGUF to a single GGUF.
diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
index 24acbf02a..39c75e0a7 100644
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -59,10 +59,10 @@ static size_t split_str_to_n_bytes(std::string str) {
int n;
if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n);
- n_bytes = n * 1024 * 1024; // megabytes
+ n_bytes = (size_t)n * 1024 * 1024; // megabytes
} else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n);
- n_bytes = n * 1024 * 1024 * 1024; // gigabytes
+ n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
} else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
}
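The `(size_t)` casts above matter because `n * 1024 * 1024 * 1024` is otherwise evaluated entirely in `int`: on platforms with a 32-bit `int`, even `2G` overflows before the result is widened. A minimal sketch of the difference (assuming a 32-bit `int`; the variable names are illustrative only):

```cpp
#include <cstdio>

int main() {
    int n = 2; // e.g. parsed from "--split-max-size 2G"

    // the multiplication happens in int first, then the result is widened:
    // 2 * 1024 * 1024 * 1024 = 2147483648 does not fit in a 32-bit int
    // (signed overflow, typically wrapping to a negative value)
    long long overflowed = n * 1024 * 1024 * 1024;

    // casting the first operand makes the whole chain use size_t arithmetic
    size_t correct = (size_t) n * 1024 * 1024 * 1024;

    printf("overflowed = %lld, correct = %zu\n", overflowed, correct);
    return 0;
}
```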
diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh
new file mode 100644
index 000000000..879522f7e
--- /dev/null
+++ b/examples/gguf-split/tests.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+ echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
+ echo "example: $0 ../../build/bin ../../tmp"
+ exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+ TMP_DIR=$2
+else
+ TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/gguf-split
+MAIN=$1/main
+WORK_PATH=$TMP_DIR/gguf-split
+CUR_DIR=$(pwd)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
+
+# 1. Get a model
+(
+ cd $WORK_PATH
+ "$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split with max tensors strategy
+$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 2b. Test the sharded model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 3. Merge
+$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf
+echo PASS
+echo
+
+# 3b. Test the merged model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 4. Split with no tensor in metadata
+#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
+#echo PASS
+#echo
+
+# 4b. Test the sharded model is loading properly
+#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32
+#echo PASS
+#echo
+
+# 5. Merge
+#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
+#echo PASS
+#echo
+
+# 5b. Test the merged model is loading properly
+#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
+#echo PASS
+#echo
+
+# 6. Split with size strategy
+$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G
+echo PASS
+echo
+
+# 6b. Test the sharded model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index 5444503a5..575143771 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -142,7 +142,7 @@ static bool gguf_ex_read_0(const std::string & fname) {
}
// read and create ggml_context containing the tensors and their data
-static bool gguf_ex_read_1(const std::string & fname) {
+static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
@@ -206,7 +206,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
printf("\n\n");
// check data
- {
+ if (check_data) {
const float * data = (const float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
if (data[j] != 100 + i) {
@@ -229,9 +229,16 @@ static bool gguf_ex_read_1(const std::string & fname) {
int main(int argc, char ** argv) {
if (argc < 3) {
- printf("usage: %s data.gguf r|w\n", argv[0]);
+ printf("usage: %s data.gguf r|w [n]\n", argv[0]);
+ printf("r: read data.gguf file\n");
+ printf("w: write data.gguf file\n");
+ printf("n: no check of tensor data\n");
return -1;
}
+ bool check_data = true;
+ if (argc == 4) {
+ check_data = false;
+ }
const std::string fname(argv[1]);
const std::string mode (argv[2]);
@@ -242,7 +249,7 @@ int main(int argc, char ** argv) {
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
} else if (mode == "r") {
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
- GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
+ GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file");
}
return 0;
diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md
index 64cc19204..a3a3c1389 100644
--- a/examples/gritlm/README.md
+++ b/examples/gritlm/README.md
@@ -21,12 +21,12 @@ not have to be performed at all.
### Running the example
Download a Grit model:
```console
-$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
+$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
```
Run the example using the downloaded model:
```console
-$ ./gritlm -m gritlm-7b_q4_1.gguf
+$ ./gritlm -m models/gritlm-7b_q4_1.gguf
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index d8cb0a642..98c0e93e4 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -44,7 +44,7 @@ private:
std::mutex m_mutex;
int m_last_call = 0;
std::vector<float> m_src1_data;
- std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
+ std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
//
void save_imatrix(const char * file_name) const;
void keep_imatrix(int ncall) const;
@@ -81,6 +81,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (ask) {
if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
if (t->op != GGML_OP_MUL_MAT) return false;
+ // why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
return true;
@@ -101,16 +102,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggerganov/llama.cpp/pull/6387
if (t->op == GGML_OP_MUL_MAT_ID) {
- const int idx = ((int32_t *) t->op_params)[0];
+ // ids -> [n_experts_used, n_tokens]
+ // src1 -> [cols, n_expert_used, n_tokens]
const ggml_tensor * ids = t->src[2];
const int n_as = src0->ne[2];
+ const int n_ids = ids->ne[0];
// the top-k selected expert ids are stored in the ids tensor
// for simplicity, always copy ids to host, because it is small
// take into account that ids is not contiguous!
- GGML_ASSERT(ids->ne[1] == src1->ne[1]);
- GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
- m_ids.resize(ggml_nbytes(ids)/sizeof(int));
+
+ GGML_ASSERT(ids->ne[1] == src1->ne[2]);
+
+ m_ids.resize(ggml_nbytes(ids));
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
auto & e = m_stats[wname];
@@ -120,26 +124,35 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// using the following line, we can correct for that if needed by replacing the line above with:
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
+ if (e.values.empty()) {
+ e.values.resize(src1->ne[0]*n_as, 0);
+ }
+ else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
+ fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+ exit(1); //GGML_ASSERT(false);
+ }
+ if (m_params.verbosity > 1) {
+ printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
+ }
// loop over all possible experts, regardless if they are used or not in the batch
for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0];
- if (e.values.empty()) {
- e.values.resize(src1->ne[0]*n_as, 0);
- }
- else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
- fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
- exit(1); //GGML_ASSERT(false);
- }
- if (m_params.verbosity > 1) {
- printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
- }
- for (int row = 0; row < (int)src1->ne[1]; ++row) {
- const int excur = m_ids[row*n_as + idx];
- GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
- if (excur != ex) continue;
- const float * x = data + row * src1->ne[0];
- for (int j = 0; j < (int)src1->ne[0]; ++j) {
- e.values[e_start + j] += x[j]*x[j];
+
+ for (int idx = 0; idx < n_ids; ++idx) {
+ for (int row = 0; row < (int)src1->ne[2]; ++row) {
+ const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
+
+ GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
+
+ if (excur != ex) continue;
+
+ const int64_t i11 = idx % src1->ne[1];
+ const int64_t i12 = row;
+ const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
+
+ for (int j = 0; j < (int)src1->ne[0]; ++j) {
+ e.values[e_start + j] += x[j]*x[j];
+ }
}
}
if (e.ncall > m_last_call) {
@@ -349,12 +362,13 @@ static void process_logits(
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+ GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
- std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+ std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count());
@@ -596,24 +610,18 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
- llama_model_params mparams = llama_model_params_from_gpt_params(params);
-
- llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
- if (model == NULL) {
- fprintf(stderr, "%s: error: unable to load model\n", __func__);
- return 1;
- }
-
- llama_context_params cparams = llama_context_params_from_gpt_params(params);
-
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
- cparams.cb_eval = ik_collect_imatrix;
- cparams.cb_eval_user_data = NULL;
+ params.cb_eval = ik_collect_imatrix;
+ params.cb_eval_user_data = NULL;
+ params.warmup = false;
- llama_context * ctx = llama_new_context_with_model(model, cparams);
- if (ctx == NULL) {
- fprintf(stderr, "%s: error: unable to create context\n", __func__);
+ // init
+ llama_model * model;
+ llama_context * ctx;
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ if (model == nullptr || ctx == nullptr) {
+ fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
}
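In the MUL_MAT_ID path above, `ids` has shape `[n_expert_used, n_tokens]` and may be non-contiguous, so its raw bytes are copied to the host and each expert id is read through the byte strides `nb[0]` and `nb[1]` instead of treating the buffer as a dense `int32_t` array. The lookup in isolation looks roughly like this (a sketch; `get_expert_id` is a hypothetical helper and uses `memcpy` where the PR uses a cast):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Read element (idx, row) of a 2D int32 tensor from its raw bytes, given the
// byte stride between columns (nb0) and between rows (nb1). This works whether
// or not the tensor is laid out contiguously.
static int32_t get_expert_id(const std::vector<char> & bytes, size_t nb0, size_t nb1, int idx, int row) {
    int32_t id;
    std::memcpy(&id, bytes.data() + (size_t) row*nb1 + (size_t) idx*nb0, sizeof(id));
    return id;
}
```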
diff --git a/examples/infill/README.md b/examples/infill/README.md
index 8c97f719b..6b076c839 100644
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -36,6 +36,11 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi
### Example
+Download a model that supports infill, for example CodeLlama:
+```console
+scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
+```
+
```bash
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
```
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 91c39c5ae..afac145f6 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -239,6 +239,7 @@ int main(int argc, char ** argv) {
LOG_TEE("%s\n", get_system_info(params).c_str());
}
const bool add_bos = llama_should_add_bos_token(model);
+ GGML_ASSERT(llama_add_eos_token(model) != 1);
LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape;
@@ -279,10 +280,10 @@ int main(int argc, char ** argv) {
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
- guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
+ guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
- std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+ std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
original_prompt_len = original_inp.size();
@@ -585,7 +586,7 @@ int main(int argc, char ** argv) {
// deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
- if(is_interacting && !params.interactive_first) {
+ if (is_interacting && !params.interactive_first) {
// print an eot token
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
@@ -650,8 +651,8 @@ int main(int argc, char ** argv) {
// LOG_TEE("took new input\n");
is_interacting = false;
}
- // deal with end of text token in interactive mode
- else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+ // deal with end of generation tokens in interactive mode
+ else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
LOG("found EOS token\n");
if (params.interactive) {
@@ -730,8 +731,8 @@ int main(int argc, char ** argv) {
}
}
- // end of text token
- if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
+ // end of generation
+ if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
break;
}
diff --git a/examples/json-schema-to-grammar.py b/examples/json_schema_to_grammar.py
similarity index 73%
rename from examples/json-schema-to-grammar.py
rename to examples/json_schema_to_grammar.py
index 91dd734cc..826cd3f72 100755
--- a/examples/json-schema-to-grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -6,37 +6,94 @@ import re
import sys
from typing import Any, Dict, List, Set, Tuple, Union
+def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
+ if not separator_rule:
+ if min_items == 0 and max_items == 1:
+ return f'{item_rule}?'
+ elif min_items == 1 and max_items is None:
+ return f'{item_rule}+'
+
+ result = ''
+
+ if min_items > 0:
+ if item_rule_is_literal and separator_rule is None:
+ result = '"' + (item_rule[1:-1] * min_items) + '"'
+ else:
+ result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
+
+ def opt_repetitions(up_to_n, prefix_with_sep=False):
+ '''
+ - n=4, no sep: '(a (a (a (a)?)?)?)?'
+ - n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
+ - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
+ '''
+
+ content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
+ if up_to_n == 0:
+ return ''
+ elif up_to_n == 1:
+ return f'({content})?'
+ elif separator_rule and not prefix_with_sep:
+ return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
+ else:
+ return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
+
+ if min_items > 0 and max_items != min_items:
+ result += ' '
+
+ if max_items is not None:
+ result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
+ else:
+ item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
+
+ if min_items == 0 and separator_rule:
+ result = f'({item_rule} {item_operator}*)?'
+ else:
+ result += f'{item_operator}*'
+
+ return result
+
+
+class BuiltinRule:
+ def __init__(self, content: str, deps: list = None):
+ self.content = content
+ self.deps = deps or []
+
+_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
+
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'
PRIMITIVE_RULES = {
- 'boolean': '("true" | "false") space',
- 'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
- 'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
- 'value' : 'object | array | string | number | boolean',
- 'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
- 'array' : '"[" space ( value ("," space value)* )? "]" space',
- 'uuid' : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space',
- 'string': r''' "\"" (
- [^"\\] |
- "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
- )* "\"" space''',
- 'null': '"null" space',
+ 'boolean' : BuiltinRule('("true" | "false") space', []),
+ 'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
+ 'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
+ 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
+ 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
+ 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
+ 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
+ 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
+ 'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
+ 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
+ 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
+ 'null' : BuiltinRule('"null" space', []),
}
-OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value']
# TODO: support "uri", "email" string formats
-DATE_RULES = {
- 'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
- 'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
- 'date-time': 'date "T" time',
- 'date-string': '"\\"" date "\\"" space',
- 'time-string': '"\\"" time "\\"" space',
- 'date-time-string': '"\\"" date-time "\\"" space',
+STRING_FORMAT_RULES = {
+ 'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+ 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+ 'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
+ 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
+ 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
+ 'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
}
-RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()])
+DOTALL = '[\\U00000000-\\U0010FFFF]'
+DOT = '[^\\x0A\\x0D]'
+
+RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
@@ -46,8 +103,6 @@ GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']'
NON_LITERAL_SET = set('|.()[]{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
-DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
-TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits
class SchemaConverter:
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
@@ -55,7 +110,9 @@ class SchemaConverter:
self._allow_fetch = allow_fetch
self._dotall = dotall
self._raw_pattern = raw_pattern
- self._rules = {'space': SPACE_RULE}
+ self._rules = {
+ 'space': SPACE_RULE,
+ }
self._refs = {}
self._refs_being_resolved = set()
@@ -65,6 +122,29 @@ class SchemaConverter:
)
return f'"{escaped}"'
+ def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
+ '''
+ not_literal('a') -> '[^a]'
+ not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?'
+ '''
+ assert len(literal) > 0, 'Empty literal not supported'
+ def recurse(i: int):
+ c = literal[i]
+ if maybe_escaped_underscores and c == '_':
+ yield f'[^{c}\\\\]'
+ yield ' | '
+ yield f'"\\\\"? "{c}"'
+ else:
+ yield f'[^{c}]'
+ if i < len(literal) - 1:
+ yield ' | '
+ yield self._format_literal(c)
+ yield ' ('
+ yield from recurse(i + 1)
+ yield ')?'
+
+ return ''.join(('(', *recurse(0), ')'))
+
def _add_rule(self, name, rule):
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
if esc_name not in self._rules or self._rules[esc_name] == rule:
@@ -169,10 +249,10 @@ class SchemaConverter:
def get_dot():
if self._dotall:
- rule = '[\\U00000000-\\U0010FFFF]'
+ rule = DOTALL
else:
# Accept any character... except \n and \r line break chars (\x0A and \x0D)
- rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'
+ rule = DOT
return self._add_rule(f'dot', rule)
def join_seq():
@@ -246,26 +326,14 @@ class SchemaConverter:
(sub, sub_is_literal) = seq[-1]
- if min_times == 0 and max_times is None:
- seq[-1] = (f'{sub}*', False)
- elif min_times == 0 and max_times == 1:
- seq[-1] = (f'{sub}?', False)
- elif min_times == 1 and max_times is None:
- seq[-1] = (f'{sub}+', False)
- else:
- if not sub_is_literal:
- id = sub_rule_ids.get(sub)
- if id is None:
- id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
- sub_rule_ids[sub] = id
- sub = id
+ if not sub_is_literal:
+ id = sub_rule_ids.get(sub)
+ if id is None:
+ id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
+ sub_rule_ids[sub] = id
+ sub = id
- seq[-1] = (
- ' '.join(
- ([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) +
- ([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
- False
- )
+ seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
else:
literal = ''
while i < length:
@@ -373,49 +441,47 @@ class SchemaConverter:
' "]" space')
else:
item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
- list_item_operator = f'( "," space {item_rule_name} )'
- successive_items = ""
min_items = schema.get("minItems", 0)
max_items = schema.get("maxItems")
- if min_items > 0:
- successive_items = list_item_operator * (min_items - 1)
- min_items -= 1
- if max_items is not None and max_items > min_items:
- successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
- else:
- successive_items += list_item_operator + "*"
- if min_items == 0:
- rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space'
- else:
- rule = f'"[" space {item_rule_name} {successive_items} "]" space'
- return self._add_rule(rule_name, rule)
+ return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
elif schema_type in (None, 'string') and 'pattern' in schema:
return self._visit_pattern(schema['pattern'], rule_name)
elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
- return self._add_rule(
+ return self._add_primitive(
'root' if rule_name == 'root' else schema_format,
PRIMITIVE_RULES['uuid']
)
- elif schema_type in (None, 'string') and schema_format in DATE_RULES:
- for t, r in DATE_RULES.items():
- self._add_rule(t, r)
- return schema_format + '-string'
+ elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
+ prim_name = f'{schema_format}-string'
+ return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
+
+ elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
+ char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
+ min_len = schema.get('minLength', 0)
+ max_len = schema.get('maxLength')
+
+ return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
elif (schema_type == 'object') or (len(schema) == 0):
- for n in OBJECT_RULE_NAMES:
- self._add_rule(n, PRIMITIVE_RULES[n])
- return self._add_rule(rule_name, 'object')
+ return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
else:
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
# TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
- return self._add_rule(
- 'root' if rule_name == 'root' else schema_type,
- PRIMITIVE_RULES[schema_type]
- )
+ return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
+
+ def _add_primitive(self, name: str, rule: BuiltinRule):
+ n = self._add_rule(name, rule.content)
+
+ for dep in rule.deps:
+ dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
+ assert dep_rule, f'Rule {dep} not known'
+ if dep not in self._rules:
+ self._add_primitive(dep, dep_rule)
+ return n
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
prop_order = self._prop_order
@@ -437,7 +503,7 @@ class SchemaConverter:
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
prop_kv_rule_names["*"] = self._add_rule(
f'{sub_name}-kv',
- self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
+ self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
)
optional_props.append("*")
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 27e113203..8b532c8b6 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -190,7 +190,7 @@ static const cmd_params cmd_params_defaults = {
/* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
- /* n_threads */ {get_num_physical_cores()},
+ /* n_threads */ {get_math_cpu_count()},
/* n_gpu_layers */ {99},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp
index ce8ab3b70..4af9de303 100644
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -408,7 +408,7 @@ Java_com_example_llama_Llm_completion_1loop(
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
- if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+ if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
return env->NewStringUTF("");
}
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index c249291ae..737f882fb 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -158,7 +158,7 @@ actor LlamaContext {
new_token_id = llama_sample_token_greedy(context, &candidates_p)
}
- if new_token_id == llama_token_eos(model) || n_cur == n_len {
+ if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n")
let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
@@ -322,7 +322,7 @@ actor LlamaContext {
defer {
result.deallocate()
}
- let nTokens = llama_token_to_piece(model, token, result, 8)
+ let nTokens = llama_token_to_piece(model, token, result, 8, false)
if nTokens < 0 {
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@@ -330,7 +330,7 @@ actor LlamaContext {
defer {
newResult.deallocate()
}
- let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+ let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer)
} else {
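The Swift change above reflects the extra `special` argument that `llama_token_to_piece` now takes, and keeps the usual retry when the first buffer is too small (a negative return value whose magnitude is the required size). The same pattern in C++ might look like this (a sketch; `token_to_str` is a hypothetical wrapper):

```cpp
#include "llama.h"

#include <string>
#include <vector>

// Convert a token to its text piece, growing the buffer if the first call
// reports (as a negative value) that more space is needed.
static std::string token_to_str(const llama_model * model, llama_token token, bool special) {
    std::vector<char> buf(8);
    int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
    if (n < 0) {
        buf.resize((size_t) -n);
        n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
    }
    return n >= 0 ? std::string(buf.data(), (size_t) n) : std::string();
}
```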
diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index 96b048525..413e433dd 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -22,7 +22,7 @@ After building, run: `./llava-cli` to see the usage. For example:
## Model conversion
-- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
```sh
git clone https://huggingface.co/mtgv/MobileVLM-1.7B
diff --git a/examples/llava/README.md b/examples/llava/README.md
index 67cb0f22b..d4810d42e 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -24,7 +24,7 @@ After building, run: `./llava-cli` to see the usage. For example:
## LLaVA 1.5
-- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
+1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 5954bf6cd..e431c7f70 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
// I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
+#include "log.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
@@ -23,7 +24,6 @@
#include
#include
#include
-#include
#include