Merge branch 'update-olmo-tokenizer' into fix-olmo-conversion

nopperl 2024-05-05 21:10:52 +02:00
commit da41960dbe
139 changed files with 9263 additions and 3083 deletions


@@ -10,14 +10,12 @@ WORKDIR /app
 COPY . .
-RUN mkdir build && \
-    cd build && \
-    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
     fi && \
-    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
-    cmake --build . --config Release --target main
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target main
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime


@@ -14,10 +14,8 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN mkdir build && \
-    cd build && \
-    cmake .. -DLLAMA_VULKAN=1 && \
-    cmake --build . --config Release --target main
+RUN cmake -B build -DLLAMA_VULKAN=1 && \
+    cmake --build build --config Release --target main
 # Clean up
 WORKDIR /


@@ -10,14 +10,12 @@ WORKDIR /app
 COPY . .
-RUN mkdir build && \
-    cd build && \
-    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
     fi && \
-    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-    cmake --build . --config Release --target server
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target server
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime


@@ -18,10 +18,8 @@ RUN apt-get update && \
 # Build it
 WORKDIR /app
 COPY . .
-RUN mkdir build && \
-    cd build && \
-    cmake .. -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
-    cmake --build . --config Release --target server
+RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
+    cmake --build build --config Release --target server
 # Clean up
 WORKDIR /

.flake8

@@ -1,3 +1,17 @@
 [flake8]
 max-line-length = 125
-ignore = W503
+ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
+exclude =
+    # Do not traverse examples
+    examples,
+    # Do not include package initializers
+    __init__.py,
+    # No need to traverse our git directory
+    .git,
+    # There's no value in checking cache directories
+    __pycache__,
+    # No need to include the build path
+    build,
+    # This contains builds that we don't want to check
+    dist  # This is generated with `python build .` for package releases
+# max-complexity = 10


@@ -32,7 +32,7 @@ on:
     - cron: '04 2 * * *'
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}-${{ github.event.inputs.sha }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
   cancel-in-progress: true
 jobs:
@@ -52,7 +52,19 @@ jobs:
       ftype: q4_0
       pr_comment_enabled: "true"
-    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
+    if: |
+      inputs.gpu-series == 'Standard_NC4as_T4_v3'
+      || (
+        github.event_name == 'schedule'
+        && github.ref_name == 'master'
+        && github.repository_owner == 'ggerganov'
+      )
+      || github.event_name == 'pull_request_target'
+      || (
+        github.event_name == 'push'
+        && github.event.ref == 'refs/heads/master'
+        && github.repository_owner == 'ggerganov'
+      )
     steps:
       - name: Clone
         id: checkout
@@ -96,9 +108,7 @@ jobs:
         id: cmake_build
         run: |
           set -eux
-          mkdir build
-          cd build
-          cmake .. \
+          cmake -B build \
             -DLLAMA_NATIVE=OFF \
             -DLLAMA_BUILD_SERVER=ON \
             -DLLAMA_CURL=ON \
@@ -109,7 +119,7 @@ jobs:
             -DLLAMA_FATAL_WARNINGS=OFF \
             -DLLAMA_ALL_WARNINGS=OFF \
             -DCMAKE_BUILD_TYPE=Release;
-          cmake --build . --config Release -j $(nproc) --target server
+          cmake --build build --config Release -j $(nproc) --target server
       - name: Download the dataset
         id: download_dataset


@@ -593,6 +593,63 @@ jobs:
         run: |
           make swift
+  windows-msys2:
+    runs-on: windows-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - { sys: UCRT64, env: ucrt-x86_64, build: Release }
+          - { sys: CLANG64, env: clang-x86_64, build: Release }
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+      - name: Setup ${{ matrix.sys }}
+        uses: msys2/setup-msys2@v2
+        with:
+          update: true
+          msystem: ${{matrix.sys}}
+          install: >-
+            base-devel
+            mingw-w64-${{matrix.env}}-toolchain
+            mingw-w64-${{matrix.env}}-cmake
+            mingw-w64-${{matrix.env}}-openblas
+      - name: Build using make
+        shell: msys2 {0}
+        run: |
+          make -j $(nproc)
+      - name: Clean after building using make
+        shell: msys2 {0}
+        run: |
+          make clean
+      - name: Build using make w/ OpenBLAS
+        shell: msys2 {0}
+        run: |
+          make LLAMA_OPENBLAS=1 -j $(nproc)
+      - name: Build using CMake
+        shell: msys2 {0}
+        run: |
+          cmake -B build
+          cmake --build build --config ${{ matrix.build }} -j $(nproc)
+      - name: Clean after building using CMake
+        shell: msys2 {0}
+        run: |
+          rm -rf build
+      - name: Build using CMake w/ OpenBLAS
+        shell: msys2 {0}
+        run: |
+          cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+          cmake --build build --config ${{ matrix.build }} -j $(nproc)
   windows-latest-cmake:
     runs-on: windows-latest


@@ -12,7 +12,7 @@ jobs:
     steps:
       - uses: actions/stale@v5
         with:
-          exempt-issue-labels: "refactor,help wanted,good first issue,research"
+          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"


@@ -20,5 +20,4 @@ jobs:
     - name: flake8 Lint
       uses: py-actions/flake8@v2
       with:
-        ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
-        exclude: "examples/*,examples/*/**,*/**/__init__.py"
+        plugins: "flake8-no-print"


@@ -23,7 +23,7 @@ on:
     - cron: '2 4 * * *'
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 jobs:
@@ -41,23 +41,16 @@ jobs:
           sanitizer: ""
       fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
-    container:
-      image: ubuntu:latest
-      ports:
-        - 8888
-      options: --cpus 4
     steps:
       - name: Dependencies
        id: depends
        run: |
-          apt-get update
-          apt-get -y install \
+          sudo apt-get update
+          sudo apt-get -y install \
            build-essential \
            xxd \
            git \
            cmake \
-           python3-pip \
            curl \
            wget \
            language-pack-en \
@@ -70,6 +63,17 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r examples/server/tests/requirements.txt
       - name: Verify server deps
        id: verify_server_deps
        run: |
@@ -90,20 +94,14 @@ jobs:
       - name: Build
        id: cmake_build
        run: |
-          mkdir build
-          cd build
-          cmake .. \
+          cmake -B build \
            -DLLAMA_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r examples/server/tests/requirements.txt
       - name: Tests
        id: server_integration_tests
@@ -129,6 +127,7 @@ jobs:
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
       - name: libCURL
        id: get_libcurl
@@ -142,10 +141,8 @@ jobs:
       - name: Build
        id: cmake_build
        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
+          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
       - name: Python setup
        id: setup_python

.gitignore

@@ -2,6 +2,7 @@
 *.a
 *.so
 *.gguf
+*.gguf.json
 *.bin
 *.exe
 *.dll
@@ -108,3 +109,18 @@ examples/server/*.mjs.hpp
 poetry.lock
 poetry.toml
 nppBackup
+# Test binaries
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-sampling
+/tests/test-tokenizer-0
+/tests/test-tokenizer-1-spm
+/tests/test-tokenizer-1-bpe
+/tests/test-rope
+/tests/test-backend-ops


@@ -3,13 +3,14 @@
 exclude: prompts/.*.txt
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v3.2.0
+  rev: v4.6.0
   hooks:
     - id: trailing-whitespace
     - id: end-of-file-fixer
     - id: check-yaml
     - id: check-added-large-files
 - repo: https://github.com/PyCQA/flake8
-  rev: 6.0.0
+  rev: 7.0.0
   hooks:
     - id: flake8
+      additional_dependencies: [flake8-no-print]


@@ -43,11 +43,7 @@ else()
     set(LLAMA_METAL_DEFAULT OFF)
 endif()
-if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
-    set(LLAMA_LLAMAFILE_DEFAULT OFF)
-else()
-    set(LLAMA_LLAMAFILE_DEFAULT ON)
-endif()
+set(LLAMA_LLAMAFILE_DEFAULT ON)
 # general
 option(BUILD_SHARED_LIBS "build shared libraries" OFF)


@@ -6,11 +6,23 @@ BUILD_TARGETS = \
 # Binaries only useful for tests
 TEST_TARGETS = \
-    tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
-    tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
-    tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-    tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
-    tests/test-json-schema-to-grammar tests/test-grammar-integration
+    tests/test-autorelease \
+    tests/test-backend-ops \
+    tests/test-double-float \
+    tests/test-grad0 \
+    tests/test-grammar-integration \
+    tests/test-grammar-parser \
+    tests/test-json-schema-to-grammar \
+    tests/test-llama-grammar \
+    tests/test-model-load-cancel \
+    tests/test-opt \
+    tests/test-quantize-fns \
+    tests/test-quantize-perf \
+    tests/test-rope \
+    tests/test-sampling \
+    tests/test-tokenizer-0 \
+    tests/test-tokenizer-1-bpe \
+    tests/test-tokenizer-1-spm
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -27,6 +39,17 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
+# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
+# of non-gcc compilers don't have to provide g++ alias or wrapper.
+DEFCC := cc
+DEFCXX := c++
+ifeq ($(origin CC),default)
+CC := $(DEFCC)
+endif
+ifeq ($(origin CXX),default)
+CXX := $(DEFCXX)
+endif
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@@ -49,11 +72,16 @@ default: $(BUILD_TARGETS)
 test: $(TEST_TARGETS)
     @failures=0; \
     for test_target in $(TEST_TARGETS); do \
-        if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
-            ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
-        elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
+        if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
             ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-        elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
+        elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
             continue; \
         elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
             continue; \
@@ -768,7 +796,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -971,11 +999,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -983,7 +1007,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)


@@ -185,9 +185,8 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
 ```sh
 git clone https://github.com/oneapi-src/oneMKL
 cd oneMKL
-mkdir -p buildWithCublas && cd buildWithCublas
-cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
-make
+cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
+cmake --build buildWithCublas --config Release
 ```
@@ -227,16 +226,15 @@ Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device
 source /opt/intel/oneapi/setvars.sh
 # Build LLAMA with MKL BLAS acceleration for intel GPU
-mkdir -p build && cd build
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 # Option 2: Use FP16
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-#build all binary
-cmake --build . --config Release -j -v
+# build all binary
+cmake --build build --config Release -j -v
 ```
 #### Nvidia GPU
@@ -248,16 +246,15 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_
 export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 # Build LLAMA with Nvidia BLAS acceleration through SYCL
-mkdir -p build && cd build
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 # Option 2: Use FP16
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-#build all binary
-cmake --build . --config Release -j -v
+# build all binary
+cmake --build build --config Release -j -v
 ```
@@ -412,17 +409,15 @@ b. Download & install mingw-w64 make for Windows provided by w64devkit
 On the oneAPI command line window, step into the llama.cpp main directory and run the following:
 ```
-mkdir -p build
-cd build
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
+cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
 # Option 2: Or FP16
-cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
-make -j
+cmake --build build --config Release -j
 ```
 Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:

README.md

@@ -20,7 +20,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ### Hot topics
-- **MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387**
+- **BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920**
+- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
 - Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
@@ -138,6 +139,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
+- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 **HTTP server**
@@ -306,6 +308,8 @@ In order to build llama.cpp you have three different options.
     make
     ```
+    **Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
 - On Windows:
   1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@@ -320,10 +324,24 @@ In order to build llama.cpp you have three different options.
 - Using `CMake`:
     ```bash
-    mkdir build
-    cd build
-    cmake ..
-    cmake --build . --config Release
+    cmake -B build
+    cmake --build build --config Release
+    ```
+    **Note**: for `Debug` builds, there are two cases:
+    - Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+      ```bash
+      cmake -B build -DCMAKE_BUILD_TYPE=Debug
+      cmake --build build
+      ```
+    - Multi-config generators (`-G` param set to Visual Studio, XCode...):
+      ```bash
+      cmake -B build -G "Xcode"
+      cmake --build build --config Debug
+      ```
     ```
 - Using `Zig` (version 0.11 or later):
@@ -437,10 +455,8 @@ Building the program with BLAS support may lead to some performance improvements
 - Using `CMake` on Linux:
     ```bash
-    mkdir build
-    cd build
-    cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
-    cmake --build . --config Release
+    cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+    cmake --build build --config Release
     ```
 - #### BLIS
@@ -460,11 +476,9 @@ Building the program with BLAS support may lead to some performance improvements
 - Using manual oneAPI installation:
   By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
     ```bash
-    mkdir build
-    cd build
     source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
-    cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
-    cmake --build . --config Release
+    cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
+    cmake --build build --config Release
     ```
 - Using oneAPI docker image:
@@ -485,10 +499,8 @@ Building the program with BLAS support may lead to some performance improvements
 - Using `CMake`:
     ```bash
-    mkdir build
-    cd build
-    cmake .. -DLLAMA_CUDA=ON
-    cmake --build . --config Release
+    cmake -B build -DLLAMA_CUDA=ON
+    cmake --build build --config Release
     ```
 The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
@@ -515,8 +527,8 @@ Building the program with BLAS support may lead to some performance improvements
 - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
     ```bash
     CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
-    cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-    && cmake --build build -- -j 16
+    cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+    && cmake --build build --config Release -- -j 16
     ```
 On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
 However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
@@ -562,15 +574,14 @@ Building the program with BLAS support may lead to some performance improvements
     ```sh
     git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
-    mkdir OpenCL-SDK/build
-    cd OpenCL-SDK/build
-    cmake .. -DBUILD_DOCS=OFF \
+    cd OpenCL-SDK
+    cmake -B build -DBUILD_DOCS=OFF \
       -DBUILD_EXAMPLES=OFF \
       -DBUILD_TESTING=OFF \
       -DOPENCL_SDK_BUILD_SAMPLES=OFF \
      -DOPENCL_SDK_TEST_SAMPLES=OFF
-    cmake --build . --config Release
-    cmake --install . --prefix /some/path
+    cmake --build build
+    cmake --install build --prefix /some/path
     ```
   </details>
@@ -592,23 +603,23 @@ Building the program with BLAS support may lead to some performance improvements
     ```cmd
     set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
     git clone https://github.com/CNugteren/CLBlast.git
-    mkdir CLBlast\build
-    cd CLBlast\build
-    cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
-    cmake --build . --config Release
-    cmake --install . --prefix C:/CLBlast
+    cd CLBlast
+    cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
+    cmake --build build --config Release
+    cmake --install build --prefix C:/CLBlast
     ```
+    (note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
 - <details>
   <summary>Unix:</summary>
     ```sh
     git clone https://github.com/CNugteren/CLBlast.git
-    mkdir CLBlast/build
-    cd CLBlast/build
-    cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
-    cmake --build . --config Release
-    cmake --install . --prefix /some/path
+    cd CLBlast
+    cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
+    cmake --build build --config Release
+    cmake --install build --prefix /some/path
     ```
 Where `/some/path` is where the built library will be installed (default is `/usr/local`).
@@ -622,21 +633,17 @@ Building the program with BLAS support may lead to some performance improvements
     ```
 - CMake (Unix):
     ```sh
-    mkdir build
-    cd build
-    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
-    cmake --build . --config Release
+    cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+    cmake --build build --config Release
     ```
 - CMake (Windows):
     ```cmd
     set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
     git clone https://github.com/ggerganov/llama.cpp
     cd llama.cpp
-    mkdir build
-    cd build
-    cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
-    cmake --build . --config Release
-    cmake --install . --prefix C:/LlamaCPP
+    cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
+    cmake --build build --config Release
+    cmake --install build --prefix C:/LlamaCPP
     ```
 ##### Running Llama with CLBlast
@@ -692,10 +699,8 @@ Building the program with BLAS support may lead to some performance improvements
 Then, build llama.cpp using the cmake command below:
 ```bash
-mkdir -p build
-cd build
-cmake .. -DLLAMA_VULKAN=1
-cmake --build . --config Release
+cmake -B build -DLLAMA_VULKAN=1
+cmake --build build --config Release
 # Test the output binary (with "-ngl 33" to offload all layers to GPU)
 ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
@@ -707,6 +712,8 @@ Building the program with BLAS support may lead to some performance improvements
 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
+Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
 ```bash
 # obtain the official LLaMA model weights and place them in ./models
 ls ./models
@@ -972,48 +979,20 @@ Here is a demo of an interactive session running on Pixel 5 phone:
 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
-#### Building the Project using Termux (F-Droid)
-Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card.
-Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU.
-If you opt to utilize OpenBLAS, you'll need to install the corresponding package.
+#### Build on Android using Termux
+[Termux](https://github.com/termux/termux-app#installation) is an alternative to execute `llama.cpp` on an Android device (no root required).
 ```
-apt install libopenblas
+apt update && apt upgrade -y
+apt install git
 ```
-Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages:
+It's recommended to move your model inside the `~/` directory for best performance:
 ```
-apt install ocl-icd opencl-headers opencl-clhpp clinfo
+cd storage/downloads
+mv model.gguf ~/
 ```
-In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below:
-```
-cmake .
-make
-cp libclblast.so* $PREFIX/lib
-cp ./include/clblast.h ../llama.cpp
-```
-Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below:
-```
-cp /data/data/com.termux/files/usr/include/openblas/cblas.h .
-cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h .
-make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice)
-```
-Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below:
-```
-GGML_OPENCL_PLATFORM=0
-GGML_OPENCL_DEVICE=0
-export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
-```
-(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
-For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
-Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
+[Follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
 ### Docker


@@ -161,7 +161,7 @@ function gg_run_test_scripts_debug {
     set -e
     # TODO: too slow, run on dedicated node
-    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    #(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
     #(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
     set +e
@@ -337,6 +337,7 @@ function gg_run_open_llama_3b_v2 {
     (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
     (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
     function check_ppl {
         qnt="$1"
@@ -517,7 +518,10 @@ function gg_run_open_llama_7b_v2 {
     (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
     function check_ppl {
         qnt="$1"


@@ -67,7 +67,6 @@
 #include <sys/syslimits.h>
 #endif
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-#define LLAMA_CURL_MAX_HEADER_LENGTH 256
 #endif // LLAMA_USE_CURL
 using json = nlohmann::ordered_json;
@@ -77,7 +76,7 @@ int32_t get_num_physical_cores() {
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
-        std::ifstream thread_siblings("/sys/devices/system/cpu"
+        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
             + std::to_string(cpu) + "/topology/thread_siblings");
         if (!thread_siblings.is_open()) {
             break; // no more cpus
@@ -234,8 +233,54 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return result;
 }
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.val_i64 = std::atol(sep);
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.val_f64 = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) {
+            kvo.val_bool = true;
+        } else if (std::strcmp(sep, "false") == 0) {
+            kvo.val_bool = false;
+        } else {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    } else if (strncmp(sep, "str:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+        if (strlen(sep) > 127) {
+            fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+            return false;
+        }
+        strncpy(kvo.val_str, sep, 127);
+        kvo.val_str[127] = '\0';
+    } else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
-    llama_sampling_params& sparams = params.sparams;
+    llama_sampling_params & sparams = params.sparams;
     if (arg == "-s" || arg == "--seed") {
         if (++i >= argc) {
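For reference, a minimal, self-contained sketch of the `KEY=TYPE:VALUE` grammar that the new `parse_kv_override` helper above accepts (`int:`, `float:`, `bool:`, `str:`). The `split_kv_override` function below is hypothetical and not part of this commit; it only mirrors the shape of the validation (key before `=`, type tag before `:`, key limited to 128 characters):

```cpp
// Illustrative only: shows how an --override-kv argument such as
// "tokenizer.ggml.add_bos_token=bool:false" splits into key, type tag and value.
#include <cstdio>
#include <string>

static bool split_kv_override(const std::string & data, std::string & key, std::string & type, std::string & value) {
    const size_t eq = data.find('=');
    if (eq == std::string::npos || eq >= 128) {
        return false; // key missing or too long, same rejection as parse_kv_override
    }
    const size_t colon = data.find(':', eq + 1);
    if (colon == std::string::npos) {
        return false; // no type tag
    }
    key   = data.substr(0, eq);
    type  = data.substr(eq + 1, colon - eq - 1); // "int", "float", "bool" or "str"
    value = data.substr(colon + 1);
    return type == "int" || type == "float" || type == "bool" || type == "str";
}

int main() {
    std::string key, type, value;
    if (split_kv_override("tokenizer.ggml.add_bos_token=bool:false", key, type, value)) {
        std::printf("key=%s type=%s value=%s\n", key.c_str(), type.c_str(), value.c_str());
    }
    return 0;
}
```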
@@ -847,7 +892,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
-        params.image = argv[i];
+        params.image.emplace_back(argv[i]);
         return true;
     }
     if (arg == "-i" || arg == "--interactive") {
@@ -902,6 +947,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.cont_batching = true;
         return true;
     }
+    if (arg == "-fa" || arg == "--flash-attn") {
+        params.flash_attn = true;
+        return true;
+    }
     if (arg == "--color") {
         params.use_color = true;
         return true;
@@ -1089,6 +1138,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.n_print = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--check-tensors") {
+        params.check_tensors = true;
+        return true;
+    }
     if (arg == "--ppl-output-type") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1240,47 +1293,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
-        char* sep = strchr(argv[i], '=');
-        if (sep == nullptr || sep - argv[i] >= 128) {
-            fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        struct llama_model_kv_override kvo;
-        std::strncpy(kvo.key, argv[i], sep - argv[i]);
-        kvo.key[sep - argv[i]] = 0;
-        sep++;
-        if (strncmp(sep, "int:", 4) == 0) {
-            sep += 4;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.int_value = std::atol(sep);
-        }
-        else if (strncmp(sep, "float:", 6) == 0) {
-            sep += 6;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-            kvo.float_value = std::atof(sep);
-        }
-        else if (strncmp(sep, "bool:", 5) == 0) {
-            sep += 5;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-            if (std::strcmp(sep, "true") == 0) {
-                kvo.bool_value = true;
-            }
-            else if (std::strcmp(sep, "false") == 0) {
-                kvo.bool_value = false;
-            }
-            else {
-                fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                return true;
-            }
-        }
-        else {
+        if (!parse_kv_override(argv[i], params.kv_overrides)) {
             fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
             invalid_param = true;
             return true;
         }
-        params.kv_overrides.push_back(kvo);
         return true;
     }
 #ifndef LOG_DISABLE_LOGS
@@ -1310,6 +1327,29 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     return false;
 }
+void gpt_params_handle_model_default(gpt_params & params) {
+    if (!params.hf_repo.empty()) {
+        // short-hand to avoid specifying --hf-file -> default it to --model
+        if (params.hf_file.empty()) {
+            if (params.model.empty()) {
+                throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
+            }
+            params.hf_file = params.model;
+        } else if (params.model.empty()) {
+            params.model = "models/" + string_split(params.hf_file, '/').back();
+        }
+    } else if (!params.model_url.empty()) {
+        if (params.model.empty()) {
+            auto f = string_split(params.model_url, '#').front();
+            f = string_split(f, '?').front();
+            f = string_split(f, '/').back();
+            params.model = "models/" + f;
+        }
+    } else if (params.model.empty()) {
+        params.model = DEFAULT_MODEL_PATH;
+    }
+}
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
@@ -1338,10 +1378,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
-    // short-hand to avoid specifying --hf-file -> default it to --model
-    if (!params.hf_repo.empty() && params.hf_file.empty()) {
-        params.hf_file = params.model;
-    }
+    gpt_params_handle_model_default(params);
     if (params.escape) {
         process_escapes(params.prompt);
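As a rough illustration of the `--model-url` defaulting rule that `gpt_params_handle_model_default` introduces above (strip any `#fragment` and `?query`, then keep the basename under `models/`), here is a self-contained sketch; the helper name and the example URL are made up for illustration and are not part of the diff:

```cpp
// Hypothetical standalone version of the default-model-path logic for a model URL.
#include <cstdio>
#include <string>

static std::string default_model_path_from_url(const std::string & model_url) {
    std::string f = model_url.substr(0, model_url.find('#')); // drop fragment, if any
    f = f.substr(0, f.find('?'));                             // drop query string, if any
    const size_t slash = f.find_last_of('/');
    if (slash != std::string::npos) {
        f = f.substr(slash + 1);                              // keep the basename
    }
    return "models/" + f;
}

int main() {
    // expected output: models/ggml-vocab-llama-bpe.gguf
    std::printf("%s\n", default_model_path_from_url(
        "https://example.com/repo/resolve/main/ggml-vocab-llama-bpe.gguf?download=true").c_str());
    return 0;
}
```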
@@ -1480,8 +1517,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
     printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
     printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
     printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
-    printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
+    printf(" --image IMAGE_FILE path to an image file. use with multimodal models. Specify multiple times for batching\n");
     if (llama_supports_mlock()) {
         printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
     }
@@ -1534,7 +1572,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --control-vector-layer-range START END\n");
     printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
     printf(" -m FNAME, --model FNAME\n");
-    printf(" model path (default: %s)\n", params.model.c_str());
+    printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
     printf(" -md FNAME, --model-draft FNAME\n");
     printf(" draft model for speculative decoding (default: unused)\n");
     printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
@@ -1551,9 +1589,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
     printf(" -ptc N, --print-token-count N\n");
     printf(" print token count every N tokens (default: %d)\n", params.n_print);
+    printf(" --check-tensors check model tensor data for invalid values\n");
     printf("\n");
 #ifndef LOG_DISABLE_LOGS
     log_print_usage();
@@ -1678,6 +1717,18 @@ std::vector<std::string> string_split(std::string input, char separator) {
     return parts;
 }
+std::string string_strip(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && std::isspace(str[start])) {
+        start++;
+    }
+    while (end > start && std::isspace(str[end - 1])) {
+        end--;
+    }
+    return str.substr(start, end - start);
+}
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
     std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
         {"top_k", llama_sampler_type::TOP_K},
@@ -1774,6 +1825,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
+    mparams.check_tensors = params.check_tensors;
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -1838,6 +1890,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
+    cparams.flash_attn = params.flash_attn;
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -1868,59 +1921,75 @@ void llama_batch_add(
 #ifdef LLAMA_USE_CURL
-static bool llama_download_file(CURL * curl, const char * url, const char * path) {
+static bool starts_with(const std::string & str, const std::string & prefix) {
+    // While we wait for C++20's std::string::starts_with...
+    return str.rfind(prefix, 0) == 0;
+}
+static bool llama_download_file(const std::string & url, const std::string & path) {
+    // Initialize libcurl
+    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
+    if (!curl) {
+        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+        return false;
+    }
     bool force_download = false;
     // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl, CURLOPT_URL, url);
-    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
 #if defined(_WIN32)
     // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
     // operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
 #endif
     // Check if the file already exists locally
     struct stat model_file_info;
-    auto file_exists = (stat(path, &model_file_info) == 0);
+    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
-    // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
-    char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char etag_path[PATH_MAX] = {0};
-    snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
-    char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char last_modified_path[PATH_MAX] = {0};
-    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    nlohmann::json metadata;
+    std::string etag;
+    std::string last_modified;
     if (file_exists) {
-        auto * f_etag = fopen(etag_path, "r");
-        if (f_etag) {
-            if (!fgets(etag, sizeof(etag), f_etag)) {
-                fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
+        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+        std::ifstream metadata_in(metadata_path);
+        if (metadata_in.good()) {
+            try {
+                metadata_in >> metadata;
+                fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("url") && metadata["url"].is_string()) {
+                    auto previous_url = metadata["url"].get<std::string>();
+                    if (previous_url != url) {
+                        fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+                        return false;
+                    }
+                }
+                if (metadata.contains("etag") && metadata["etag"].is_string()) {
+                    etag = metadata["etag"];
+                }
+                if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) {
+                    last_modified = metadata["lastModified"];
+                }
+            } catch (const nlohmann::json::exception & e) {
+                fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+                return false;
+            }
+        }
     } else {
-        fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
-            }
-            fclose(f_etag);
-        }
-        auto * f_last_modified = fopen(last_modified_path, "r");
-        if (f_last_modified) {
-            if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
-                fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
-            } else {
-                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
-                        last_modified);
-            }
-            fclose(f_last_modified);
-        }
+        fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str());
     }
     // Send a HEAD request to retrieve the etag and last-modified headers
     struct llama_load_model_from_url_headers {
-        char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-        char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+        std::string etag;
+        std::string last_modified;
     };
     llama_load_model_from_url_headers headers;
     {
@@ -1928,38 +1997,37 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
             llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
-            // Convert header field name to lowercase
-            for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
-                buffer[i] = tolower(buffer[i]);
-            }
-            const char * etag_prefix = "etag: ";
-            if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
-                strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
+            static std::regex header_regex("([^:]+): (.*)\r\n");
+            static std::regex etag_regex("ETag", std::regex_constants::icase);
+            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+            std::string header(buffer, n_items);
+            std::smatch match;
+            if (std::regex_match(header, match, header_regex)) {
+                const std::string & key = match[1];
+                const std::string & value = match[2];
+                if (std::regex_match(key, match, etag_regex)) {
+                    headers->etag = value;
+                } else if (std::regex_match(key, match, last_modified_regex)) {
+                    headers->last_modified = value;
+                }
             }
-            const char * last_modified_prefix = "last-modified: ";
-            if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) {
-                strncpy(headers->last_modified, buffer + strlen(last_modified_prefix),
-                        n_items - strlen(last_modified_prefix) - 2); // Remove CRLF
-            }
             return n_items;
         };
-        curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers);
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-        CURLcode res = curl_easy_perform(curl);
+        CURLcode res = curl_easy_perform(curl.get());
         if (res != CURLE_OK) {
-            curl_easy_cleanup(curl);
             fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
             return false;
         }
         long http_code = 0;
-        curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
+        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
         if (http_code != 200) {
             // HEAD not supported, we don't know if the file has changed
             // force trigger downloading
@ -1968,28 +2036,30 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
} }
} }
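libcurl invokes the header callback once per raw header line, so the new code only has to regex-match a single "Name: value\r\n" string at a time and compare the field name case-insensitively. The same parsing, sketched in Python for clarity (illustrative only):

    import re

    HEADER_RE = re.compile(r"([^:]+): (.*)\r\n")

    def collect_validators(header_lines: list[str]) -> tuple[str, str]:
        """Pick ETag and Last-Modified out of raw HTTP header lines."""
        etag, last_modified = "", ""
        for line in header_lines:
            m = HEADER_RE.fullmatch(line)
            if m is None:
                continue  # status line or header we do not care about
            key, value = m.group(1), m.group(2)
            if key.lower() == "etag":
                etag = value
            elif key.lower() == "last-modified":
                last_modified = value
        return etag, last_modified

    # collect_validators(['HTTP/1.1 200 OK\r\n', 'ETag: "abc123"\r\n']) -> ('"abc123"', '')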
// If the ETag or the Last-Modified headers are different: trigger a new download bool should_download = !file_exists || force_download;
bool should_download = !file_exists if (!should_download) {
|| force_download if (!etag.empty() && etag != headers.etag) {
|| (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0) fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|| (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0); should_download = true;
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
should_download = true;
}
}
if (should_download) { if (should_download) {
char path_temporary[PATH_MAX] = {0}; std::string path_temporary = path + ".downloadInProgress";
snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
if (file_exists) { if (file_exists) {
fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path); fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path) != 0) { if (remove(path.c_str()) != 0) {
curl_easy_cleanup(curl); fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
return false; return false;
} }
} }
// Set the output file // Set the output file
auto * outfile = fopen(path_temporary, "wb"); std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);
if (!outfile) { if (!outfile) {
curl_easy_cleanup(curl); fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
return false; return false;
} }
@ -1997,12 +2067,12 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
return fwrite(data, size, nmemb, (FILE *)fd); return fwrite(data, size, nmemb, (FILE *)fd);
}; };
curl_easy_setopt(curl, CURLOPT_NOBODY, 0L); curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback)); curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile); curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
// display download progress // display download progress
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
// helper function to hide password in URL // helper function to hide password in URL
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
@ -2021,51 +2091,34 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
// start the download // start the download
fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified); llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
auto res = curl_easy_perform(curl); auto res = curl_easy_perform(curl.get());
if (res != CURLE_OK) { if (res != CURLE_OK) {
fclose(outfile);
curl_easy_cleanup(curl);
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
return false; return false;
} }
long http_code = 0; long http_code = 0;
curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code); curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code < 200 || http_code >= 400) { if (http_code < 200 || http_code >= 400) {
fclose(outfile);
curl_easy_cleanup(curl);
fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
return false; return false;
} }
// Clean up // Causes file to be closed explicitly here before we rename it.
fclose(outfile); outfile.reset();
// Write the new ETag to the .etag file // Write the updated JSON metadata file.
if (strlen(headers.etag) > 0) { metadata.update({
auto * etag_file = fopen(etag_path, "w"); {"url", url},
if (etag_file) { {"etag", headers.etag},
fputs(headers.etag, etag_file); {"lastModified", headers.last_modified}
fclose(etag_file); });
fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag); std::ofstream(metadata_path) << metadata.dump(4);
} fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
}
// Write the new lastModified to the .etag file if (rename(path_temporary.c_str(), path.c_str()) != 0) {
if (strlen(headers.last_modified) > 0) { fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
auto * last_modified_file = fopen(last_modified_path, "w");
if (last_modified_file) {
fputs(headers.last_modified, last_modified_file);
fclose(last_modified_file);
fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
headers.last_modified);
}
}
if (rename(path_temporary, path) != 0) {
curl_easy_cleanup(curl);
fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
return false; return false;
} }
} }
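Taken together, the download path now is: stream into <path>.downloadInProgress, persist the server's validators in the JSON companion, then rename the temporary file over the final path, so an interrupted download can never be mistaken for a complete model. A rough Python equivalent of that finishing sequence (names are illustrative):

    import json
    import os

    def finalize_download(path: str, url: str, etag: str, last_modified: str) -> None:
        """Publish <path>.downloadInProgress as `path` and record the validators next to it."""
        with open(path + ".json", "w", encoding="utf-8") as f:
            json.dump({"url": url, "etag": etag, "lastModified": last_modified}, f, indent=4)
        # replace the final path in one step so readers never see a partial file
        os.replace(path + ".downloadInProgress", path)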
@ -2083,15 +2136,7 @@ struct llama_model * llama_load_model_from_url(
return NULL; return NULL;
} }
// Initialize libcurl if (!llama_download_file(model_url, path_model)) {
auto * curl = curl_easy_init();
if (!curl) {
fprintf(stderr, "%s: error initializing libcurl\n", __func__);
return NULL;
}
if (!llama_download_file(curl, model_url, path_model)) {
return NULL; return NULL;
} }
@ -2105,7 +2150,6 @@ struct llama_model * llama_load_model_from_url(
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
if (!ctx_gguf) { if (!ctx_gguf) {
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
curl_easy_cleanup(curl);
return NULL; return NULL;
} }
@ -2117,8 +2161,6 @@ struct llama_model * llama_load_model_from_url(
gguf_free(ctx_gguf); gguf_free(ctx_gguf);
} }
curl_easy_cleanup(curl);
if (n_split > 1) { if (n_split > 1) {
char split_prefix[PATH_MAX] = {0}; char split_prefix[PATH_MAX] = {0};
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
@ -2149,11 +2191,7 @@ struct llama_model * llama_load_model_from_url(
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
auto * curl = curl_easy_init(); return llama_download_file(split_url, split_path);
bool res = llama_download_file(curl, split_url, split_path);
curl_easy_cleanup(curl);
return res;
}, idx)); }, idx));
} }
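For split models, only the first shard is downloaded up front; the remaining shards are then fetched concurrently with the same helper. A sketch of that fan-out (the shard naming is assumed to follow llama.cpp's <prefix>-00001-of-00003.gguf pattern, and download_file stands in for the helper above):

    from concurrent.futures import ThreadPoolExecutor

    def download_remaining_splits(url_prefix: str, path_prefix: str, n_split: int, download_file) -> bool:
        """Fetch shards 2..n_split in parallel; shard 1 is the file that was already downloaded."""
        def shard(prefix: str, idx: int) -> str:
            return f"{prefix}-{idx:05d}-of-{n_split:05d}.gguf"

        with ThreadPoolExecutor() as pool:
            results = pool.map(lambda idx: download_file(shard(url_prefix, idx), shard(path_prefix, idx)),
                               range(2, n_split + 1))
            return all(results)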
@ -2640,7 +2678,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str()); fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
@ -2675,6 +2713,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());

View file

@ -31,6 +31,8 @@
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0) } while(0)
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
// build info // build info
extern int LLAMA_BUILD_NUMBER; extern int LLAMA_BUILD_NUMBER;
extern char const *LLAMA_COMMIT; extern char const *LLAMA_COMMIT;
@ -92,7 +94,7 @@ struct gpt_params {
// // sampling parameters // // sampling parameters
struct llama_sampling_params sparams; struct llama_sampling_params sparams;
std::string model = "models/7B/ggml-model-f16.gguf"; // model path std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download std::string model_url = ""; // model url to download
@ -133,7 +135,7 @@ struct gpt_params {
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
bool kl_divergence = false; // compute KL-divergence bool kl_divergence = false; // compute KL divergence
bool random_prompt = false; // do not randomize prompt if none provided bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs bool use_color = false; // use color to distinguish generations and inputs
@ -148,6 +150,7 @@ struct gpt_params {
bool multiline_input = false; // reverse the usage of `\` bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens bool ignore_eos = false; // ignore generated EOS tokens
@ -161,15 +164,20 @@ struct gpt_params {
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
std::string cache_type_k = "f16"; // KV cache data type for the K std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V std::string cache_type_v = "f16"; // KV cache data type for the V
// multimodal models (see examples/llava) // multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector std::string mmproj = ""; // path to multimodal projector
std::string image = ""; // path to an image file std::vector<std::string> image; // path to image file(s)
}; };
void gpt_params_handle_model_default(gpt_params & params);
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
bool gpt_params_parse(int argc, char ** argv, gpt_params & params); bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@ -193,6 +201,7 @@ bool validate_file_name(const std::string & filename);
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names); std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string); std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator); std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type); std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
// //

View file

@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
// USE LOG() INSTEAD // USE LOG() INSTEAD
// //
#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER) #if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
#define LOG_IMPL(str, ...) \ #define LOG_IMPL(str, ...) \
do { \ do { \
if (LOG_TARGET != nullptr) \ if (LOG_TARGET != nullptr) \
@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD // USE LOG_TEE() INSTEAD
// //
#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER) #if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
#define LOG_TEE_IMPL(str, ...) \ #define LOG_TEE_IMPL(str, ...) \
do { \ do { \
if (LOG_TARGET != nullptr) \ if (LOG_TARGET != nullptr) \

View file

@ -68,7 +68,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) { void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) { if (seed == LLAMA_DEFAULT_SEED) {
seed = time(NULL); seed = std::random_device{}();
} }
ctx->rng.seed(seed); ctx->rng.seed(seed);
} }

302
convert-hf-to-gguf-update.py Executable file
View file

@ -0,0 +1,302 @@
#!/usr/bin/env python3
# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
#
# This is necessary to identify which pre-tokenizer a model uses and to pass that
# information to llama.cpp via the GGUF header, so that llama.cpp can implement the
# same pre-tokenizer.
#
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
#
# Instructions:
#
# - Add a new model to the "models" list
# - Run the script with your huggingface token:
#
# python3 convert-hf-to-gguf-update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
# TODO: automate the update of convert-hf-to-gguf.py
#
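# The mechanism in a nutshell: encode a fixed check string and hash the resulting token ids,
#
#   chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
#
# so two models only end up with the same chkhsh if their pre-tokenizers behave identically on chktxt.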
import logging
import os
import requests
import sys
import json
from hashlib import sha256
from enum import IntEnum, auto
from transformers import AutoTokenizer
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update")
class TOKENIZER_TYPE(IntEnum):
SPM = auto()
BPE = auto()
WPM = auto()
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2:
token = sys.argv[1]
else:
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
sys.exit(1)
# TODO: add models here, base models preferred
models = [
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
]
# make directory "models/tokenizers" if it doesn't exist
if not os.path.exists("models/tokenizers"):
os.makedirs("models/tokenizers")
def download_file_with_auth(url, token, save_path):
headers = {"Authorization": f"Bearer {token}"}
response = requests.get(url, headers=headers)
if response.status_code == 200:
with open(save_path, 'wb') as f:
f.write(response.content)
logger.info(f"File {save_path} downloaded successfully")
else:
logger.info(f"Failed to download file. Status code: {response.status_code}")
# download the tokenizer models
for model in models:
name = model["name"]
repo = model["repo"]
tokt = model["tokt"]
if not os.path.exists(f"models/tokenizers/{name}"):
os.makedirs(f"models/tokenizers/{name}")
else:
logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
continue
logger.info(f"Downloading {name} to models/tokenizers/{name}")
url = f"{repo}/raw/main/config.json"
save_path = f"models/tokenizers/{name}/config.json"
download_file_with_auth(url, token, save_path)
url = f"{repo}/raw/main/tokenizer.json"
save_path = f"models/tokenizers/{name}/tokenizer.json"
download_file_with_auth(url, token, save_path)
# if the downloaded file is less than 1KB, it is most likely a Git LFS pointer rather than the real tokenizer.json - download the actual file instead
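# (a Git LFS pointer is a small text stub of the form
#    version https://git-lfs.github.com/spec/v1
#    oid sha256:<hash>
#    size <bytes>
#  which is why a tiny tokenizer.json signals that the real file lives behind the LFS endpoint)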
if os.path.getsize(save_path) < 1024:
# remove the file
os.remove(save_path)
url = f"{repo}/resolve/main/tokenizer.json"
save_path = f"models/tokenizers/{name}/tokenizer.json"
download_file_with_auth(url, token, save_path)
if tokt == TOKENIZER_TYPE.SPM:
url = f"{repo}/resolve/main/tokenizer.model"
save_path = f"models/tokenizers/{name}/tokenizer.model"
download_file_with_auth(url, token, save_path)
url = f"{repo}/raw/main/tokenizer_config.json"
save_path = f"models/tokenizers/{name}/tokenizer_config.json"
download_file_with_auth(url, token, save_path)
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
# TODO: auto-update convert-hf-to-gguf.py with the generated function
src_ifs = ""
for model in models:
name = model["name"]
tokt = model["tokt"]
if tokt == TOKENIZER_TYPE.SPM:
continue
# create the tokenizer
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
logger.info(f"tokt: {tokt}")
logger.info(f"repo: {model['repo']}")
logger.info(f"chktok: {chktok}")
logger.info(f"chkhsh: {chkhsh}")
# print the "pre_tokenizer" content from the tokenizer.json
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
cfg = json.load(f)
pre_tokenizer = cfg["pre_tokenizer"]
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
logger.info("")
src_ifs += f" if chkhsh == \"{chkhsh}\":\n"
src_ifs += f" # ref: {model['repo']}\n"
src_ifs += f" res = \"{name}\"\n"
src_func = f"""
def get_vocab_base_pre(self, tokenizer) -> str:
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
# is specific for the BPE pre-tokenizer used by the model
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer
chktxt = {repr(chktxt)}
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.debug(f"chktok: {{chktok}}")
logger.debug(f"chkhsh: {{chkhsh}}")
res = None
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
# or pull the latest version of the model from Huggingface
# don't edit the hashes manually!
{src_ifs}
if res is None:
logger.warning("\\n")
logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**")
logger.warning(f"** chkhsh: {{chkhsh}}")
logger.warning("**************************************************************************************")
logger.warning("\\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
logger.debug(f"chkhsh: {{chkhsh}}")
return res
"""
print(src_func) # noqa: NP100
logger.info("\n")
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
logger.info("\n")
# generate tests for each tokenizer model
tests = [
"ied 4 ½ months",
"Führer",
"",
" ",
" ",
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
" (",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"3",
"33",
"333",
"3333",
"33333",
"333333",
"3333333",
"33333333",
"333333333",
chktxt,
]
# write the tests to ./models/ggml-vocab-{name}.gguf.inp
# the format is:
#
# test0
# __ggml_vocab_test__
# test1
# __ggml_vocab_test__
# ...
#
# for each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
# for each test, write the resulting tokens on a separate line
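# e.g. the test "Hello world" becomes a single line of space-prefixed token ids in the .out file,
# something like " 123 456" (the ids here are illustrative placeholders - they differ per tokenizer)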
for model in models:
name = model["name"]
tokt = model["tokt"]
# create the tokenizer
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
for text in tests:
f.write(f"{text}")
f.write("\n__ggml_vocab_test__\n")
with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
for text in tests:
res = tokenizer.encode(text, add_special_tokens=False)
for r in res:
f.write(f" {r}")
f.write("\n")
logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
# generate commands for creating vocab files
logger.info("\nRun the following commands to generate the vocab files for testing:\n")
for model in models:
name = model["name"]
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
logger.info("\n")

View file

@ -2,6 +2,7 @@
from __future__ import annotations from __future__ import annotations
import logging
import argparse import argparse
import contextlib import contextlib
import json import json
@ -11,6 +12,7 @@ import sys
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from enum import IntEnum from enum import IntEnum
from pathlib import Path from pathlib import Path
from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
import numpy as np import numpy as np
@ -25,6 +27,8 @@ import gguf
from convert import LlamaHfVocab, permute from convert import LlamaHfVocab, permute
logger = logging.getLogger("hf-to-gguf")
###### MODEL DEFINITIONS ###### ###### MODEL DEFINITIONS ######
@ -75,7 +79,7 @@ class Model(ABC):
def get_tensors(self) -> Iterator[tuple[str, Tensor]]: def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
for part_name in self.part_names: for part_name in self.part_names:
print(f"gguf: loading model part '{part_name}'") logger.info(f"gguf: loading model part '{part_name}'")
ctx: ContextManager[Any] ctx: ContextManager[Any]
if self.is_safetensors: if self.is_safetensors:
from safetensors import safe_open from safetensors import safe_open
@ -94,42 +98,42 @@ class Model(ABC):
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
self.gguf_writer.add_context_length(n_ctx) self.gguf_writer.add_context_length(n_ctx)
print(f"gguf: context length = {n_ctx}") logger.info(f"gguf: context length = {n_ctx}")
n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_embd = self.find_hparam(["hidden_size", "n_embd"])
self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_embedding_length(n_embd)
print(f"gguf: embedding length = {n_embd}") logger.info(f"gguf: embedding length = {n_embd}")
if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
self.gguf_writer.add_feed_forward_length(n_ff) self.gguf_writer.add_feed_forward_length(n_ff)
print(f"gguf: feed forward length = {n_ff}") logger.info(f"gguf: feed forward length = {n_ff}")
n_head = self.find_hparam(["num_attention_heads", "n_head"]) n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count(n_head)
print(f"gguf: head count = {n_head}") logger.info(f"gguf: head count = {n_head}")
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
self.gguf_writer.add_head_count_kv(n_head_kv) self.gguf_writer.add_head_count_kv(n_head_kv)
print(f"gguf: key-value head count = {n_head_kv}") logger.info(f"gguf: key-value head count = {n_head_kv}")
if (rope_theta := self.hparams.get("rope_theta")) is not None: if (rope_theta := self.hparams.get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base(rope_theta) self.gguf_writer.add_rope_freq_base(rope_theta)
print(f"gguf: rope theta = {rope_theta}") logger.info(f"gguf: rope theta = {rope_theta}")
if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
print(f"gguf: rms norm epsilon = {f_rms_eps}") logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_eps(f_norm_eps) self.gguf_writer.add_layer_norm_eps(f_norm_eps)
print(f"gguf: layer norm epsilon = {f_norm_eps}") logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
if (n_experts := self.hparams.get("num_local_experts")) is not None: if (n_experts := self.hparams.get("num_local_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts) self.gguf_writer.add_expert_count(n_experts)
print(f"gguf: expert count = {n_experts}") logger.info(f"gguf: expert count = {n_experts}")
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used) self.gguf_writer.add_expert_used_count(n_experts_used)
print(f"gguf: experts used count = {n_experts_used}") logger.info(f"gguf: experts used count = {n_experts_used}")
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
print(f"gguf: file type = {self.ftype}") logger.info(f"gguf: file type = {self.ftype}")
def write_tensors(self): def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@ -150,8 +154,7 @@ class Model(ABC):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -168,7 +171,7 @@ class Model(ABC):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -229,7 +232,7 @@ class Model(ABC):
return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
# used for GPT-2 BPE and WordPiece vocabs # used for GPT-2 BPE and WordPiece vocabs
def get_basic_vocab(self) -> tuple[list[str], list[int]]: def get_vocab_base(self) -> tuple[list[str], list[int], str]:
tokens: list[str] = [] tokens: list[str] = []
toktypes: list[int] = [] toktypes: list[int] = []
@ -238,6 +241,8 @@ class Model(ABC):
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size assert max(tokenizer.vocab.values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
added_vocab = tokenizer.get_added_vocab() added_vocab = tokenizer.get_added_vocab()
@ -255,11 +260,85 @@ class Model(ABC):
tokens.append(reverse_vocab[i]) tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL) toktypes.append(gguf.TokenType.NORMAL)
return tokens, toktypes return tokens, toktypes, tokpre
# NOTE: this function is generated by convert-hf-to-gguf-update.py
# do not modify it manually!
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
def get_vocab_base_pre(self, tokenizer) -> str:
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
# is specific for the BPE pre-tokenizer used by the model
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.debug(f"chktok: {chktok}")
logger.debug(f"chkhsh: {chkhsh}")
res = None
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
# or pull the latest version of the model from Huggingface
# don't edit the hashes manually!
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
res = "llama-bpe"
if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
# ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
res = "deepseek-llm"
if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
# ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
res = "deepseek-coder"
if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
# ref: https://huggingface.co/tiiuae/falcon-7b
res = "falcon"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5
res = "bert-bge"
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
# ref: https://huggingface.co/mosaicml/mpt-7b
res = "mpt"
if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
# ref: https://huggingface.co/bigcode/starcoder2-3b
res = "starcoder"
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
# ref: https://huggingface.co/openai-community/gpt2
res = "gpt-2"
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
res = "command-r"
if res is None:
logger.warning("\n")
logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**")
logger.warning(f"** chkhsh: {chkhsh}")
logger.warning("**************************************************************************************")
logger.warning("\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
logger.debug(f"chkhsh: {chkhsh}")
return res
def _set_vocab_gpt2(self) -> None: def _set_vocab_gpt2(self) -> None:
tokens, toktypes = self.get_basic_vocab() tokens, toktypes, tokpre = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_token_types(toktypes)
@ -277,6 +356,8 @@ class Model(ABC):
vocab_size = hparams["vocab_size"] vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size assert max(tokenizer.get_vocab().values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
merges = [] merges = []
vocab = {} vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks mergeable_ranks = tokenizer.mergeable_ranks
@ -304,6 +385,7 @@ class Model(ABC):
toktypes.append(gguf.TokenType.NORMAL) toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_token_types(toktypes)
@ -365,9 +447,7 @@ class Model(ABC):
if vocab_size > len(tokens): if vocab_size > len(tokens):
pad_count = vocab_size - len(tokens) pad_count = vocab_size - len(tokens)
print( logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]"
)
for i in range(1, pad_count + 1): for i in range(1, pad_count + 1):
tokens.append(f"[PAD{i}]") tokens.append(f"[PAD{i}]")
scores.append(-1000.0) scores.append(-1000.0)
@ -376,6 +456,7 @@ class Model(ABC):
assert len(tokens) == vocab_size assert len(tokens) == vocab_size
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_token_types(toktypes)
@ -397,6 +478,7 @@ class Model(ABC):
assert len(tokens) == vocab.vocab_size assert len(tokens) == vocab.vocab_size
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_token_types(toktypes)
@ -477,7 +559,7 @@ class BloomModel(Model):
), ),
axis=0, axis=0,
) )
print("re-format attention.linear_qkv.weight") logger.info("re-format attention.linear_qkv.weight")
elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
data = np.concatenate( data = np.concatenate(
@ -488,13 +570,12 @@ class BloomModel(Model):
), ),
axis=0, axis=0,
) )
print("re-format attention.linear_qkv.bias") logger.info("re-format attention.linear_qkv.bias")
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -511,13 +592,13 @@ class BloomModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") logger.info(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
if not has_lm_head and name == "word_embeddings.weight": if not has_lm_head and name == "word_embeddings.weight":
self.gguf_writer.add_tensor("output.weight", data) self.gguf_writer.add_tensor("output.weight", data)
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") logger.info(f"{name} => output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
@Model.register("MPTForCausalLM") @Model.register("MPTForCausalLM")
@ -577,8 +658,7 @@ class MPTModel(Model):
else: else:
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -595,7 +675,7 @@ class MPTModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -621,8 +701,7 @@ class OrionModel(Model):
elif "model_max_length" in self.hparams: elif "model_max_length" in self.hparams:
ctx_length = self.hparams["model_max_length"] ctx_length = self.hparams["model_max_length"]
else: else:
print("gguf: can not find ctx length parameter.") raise ValueError("gguf: can not find ctx length parameter.")
sys.exit()
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name)
@ -660,8 +739,7 @@ class OrionModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -678,7 +756,7 @@ class OrionModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -703,8 +781,7 @@ class BaichuanModel(Model):
elif "model_max_length" in self.hparams: elif "model_max_length" in self.hparams:
ctx_length = self.hparams["model_max_length"] ctx_length = self.hparams["model_max_length"]
else: else:
print("gguf: can not find ctx length parameter.") raise ValueError("gguf: can not find ctx length parameter.")
sys.exit()
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_source_hf_repo(hf_repo)
@ -733,7 +810,7 @@ class BaichuanModel(Model):
for i in range(block_count): for i in range(block_count):
if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
print(f"Unpacking and permuting layer {i}") logger.info(f"Unpacking and permuting layer {i}")
model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
self._reverse_hf_permute_part(w, 0, head_count, head_count) self._reverse_hf_permute_part(w, 0, head_count, head_count)
model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
@ -758,8 +835,7 @@ class BaichuanModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -776,7 +852,7 @@ class BaichuanModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
@ -840,6 +916,7 @@ class XverseModel(Model):
toktypes.append(toktype) toktypes.append(toktype)
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_token_types(toktypes)
@ -860,8 +937,7 @@ class XverseModel(Model):
elif "model_max_length" in self.hparams: elif "model_max_length" in self.hparams:
ctx_length = self.hparams["model_max_length"] ctx_length = self.hparams["model_max_length"]
else: else:
print("gguf: can not find ctx length parameter.") raise ValueError("gguf: can not find ctx length parameter.")
sys.exit()
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_source_hf_repo(hf_repo)
@ -910,8 +986,7 @@ class XverseModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -928,7 +1003,7 @@ class XverseModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
@ -1015,8 +1090,7 @@ class FalconModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -1033,7 +1107,7 @@ class FalconModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -1120,8 +1194,7 @@ class RefactModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -1138,7 +1211,7 @@ class RefactModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -1187,10 +1260,9 @@ class PersimmonModel(Model):
data = data_torch.to(torch.float32).squeeze().numpy() data = data_torch.to(torch.float32).squeeze().numpy()
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -1255,8 +1327,7 @@ class StableLMModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -1273,7 +1344,7 @@ class StableLMModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -1289,8 +1360,7 @@ class StableLMModel(Model):
merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")): if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
data = data.astype(np.float32) data = data.astype(np.float32)
@ -1298,7 +1368,7 @@ class StableLMModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -1335,6 +1405,11 @@ class LlamaModel(Model):
self.gguf_writer.add_vocab_size(hparams["vocab_size"]) self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
# Same as super class, but permuting q_proj, k_proj # Same as super class, but permuting q_proj, k_proj
def write_tensors(self): def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
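
The rope_scaling branch added a few lines above only records linear scaling. A hedged sketch of the HuggingFace config fragment it looks for and of the values that end up in the GGUF metadata (the factor below is illustrative, not taken from any real model):

# Illustrative config.json fragment that would take the new rope_scaling branch.
hparams = {"rope_scaling": {"type": "linear", "factor": 4.0}}

rope_scaling = hparams.get("rope_scaling")
if rope_scaling is not None and "factor" in rope_scaling:
    if rope_scaling.get("type") == "linear":
        # mirrors add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        # and add_rope_scaling_factor(...) in the hunk above
        scaling_type, scaling_factor = "linear", rope_scaling["factor"]
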
@ -1345,7 +1420,7 @@ class LlamaModel(Model):
experts = dict() experts = dict()
for name, data_torch in self.get_tensors(): for name, data_torch in self.get_tensors():
# we don't need these # we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
continue continue
old_dtype = data_torch.dtype old_dtype = data_torch.dtype
@ -1398,10 +1473,9 @@ class LlamaModel(Model):
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
continue continue
@ -1409,8 +1483,7 @@ class LlamaModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -1427,7 +1500,7 @@ class LlamaModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -1502,10 +1575,9 @@ class GrokModel(Model):
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
continue continue
@ -1513,8 +1585,7 @@ class GrokModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -1531,7 +1602,7 @@ class GrokModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -1564,7 +1635,7 @@ class DbrxModel(Model):
self.gguf_writer.add_layer_norm_eps(1e-5) self.gguf_writer.add_layer_norm_eps(1e-5)
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
print(f"gguf: file type = {self.ftype}") logger.info(f"gguf: file type = {self.ftype}")
def write_tensors(self): def write_tensors(self):
block_count = self.hparams.get("n_layers") block_count = self.hparams.get("n_layers")
@ -1607,8 +1678,7 @@ class DbrxModel(Model):
# https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -1616,8 +1686,7 @@ class DbrxModel(Model):
# Most of the codebase that takes in 1D tensors only handles F32 tensors # Most of the codebase that takes in 1D tensors only handles F32 tensors
# and most of the outputs tensors are F32. # and most of the outputs tensors are F32.
if data_dtype != np.float32 and n_dims == 1: if data_dtype != np.float32 and n_dims == 1:
print(f"Can not map tensor {name!r}: all 1D tensors must be F32") raise ValueError(f"Can not map tensor {name!r}: all 1D tensors must be F32")
sys.exit()
# if f32 desired, convert any float16 to float32 # if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16: if self.ftype == 0 and data_dtype == np.float16:
@ -1627,7 +1696,7 @@ class DbrxModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1: if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}") logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -1689,8 +1758,7 @@ class MiniCPMModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -1707,7 +1775,7 @@ class MiniCPMModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -1773,8 +1841,7 @@ class QwenModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -1791,7 +1858,7 @@ class QwenModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -1868,10 +1935,9 @@ class Qwen2MoeModel(Model):
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
continue continue
@ -1879,8 +1945,7 @@ class Qwen2MoeModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -1897,7 +1962,7 @@ class Qwen2MoeModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}") logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -1942,8 +2007,7 @@ class GPT2Model(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -1960,13 +2024,13 @@ class GPT2Model(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
# note: GPT2 output is tied to (same as) wte in original model # note: GPT2 output is tied to (same as) wte in original model
if new_name == "token_embd.weight": if new_name == "token_embd.weight":
print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor("output.weight", data) self.gguf_writer.add_tensor("output.weight", data)
@ -2005,8 +2069,7 @@ class Phi3MiniModel(Model):
tokenizer_path = self.dir_model / 'tokenizer.model' tokenizer_path = self.dir_model / 'tokenizer.model'
if not tokenizer_path.is_file(): if not tokenizer_path.is_file():
print(f'Error: Missing {tokenizer_path}', file=sys.stderr) raise ValueError(f'Error: Missing {tokenizer_path}')
sys.exit(1)
tokenizer = SentencePieceProcessor(str(tokenizer_path)) tokenizer = SentencePieceProcessor(str(tokenizer_path))
@ -2044,7 +2107,7 @@ class Phi3MiniModel(Model):
for key in added_tokens_json: for key in added_tokens_json:
token_id = added_tokens_json[key] token_id = added_tokens_json[key]
if (token_id >= vocab_size): if (token_id >= vocab_size):
print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue continue
tokens[token_id] = key.encode("utf-8") tokens[token_id] = key.encode("utf-8")
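
For context, the loop above overlays entries from added_tokens.json onto the base SentencePiece vocab, now logging instead of printing when an id falls outside vocab_size. A compact, hedged sketch of that overlay with made-up values:

# Hedged sketch of the added-token overlay; all values are illustrative.
vocab_size = 32064
added_tokens_json = {"<|assistant|>": 32001, "<|too_big|>": 99999}
tokens = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]

for key, token_id in added_tokens_json.items():
    if token_id >= vocab_size:
        continue  # out of range: skipped, as the debug message above notes
    tokens[token_id] = key.encode("utf-8")
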
@ -2052,6 +2115,7 @@ class Phi3MiniModel(Model):
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_token_types(toktypes)
@ -2125,8 +2189,7 @@ class PlamoModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
# shuffle for broadcasting of gqa in ggml_mul_mat # shuffle for broadcasting of gqa in ggml_mul_mat
if new_name.endswith("attn_q.weight"): if new_name.endswith("attn_q.weight"):
@ -2157,7 +2220,7 @@ class PlamoModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -2203,8 +2266,7 @@ class CodeShellModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -2221,13 +2283,13 @@ class CodeShellModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
if not has_lm_head and name == "transformer.wte.weight": if not has_lm_head and name == "transformer.wte.weight":
self.gguf_writer.add_tensor("output.weight", data) self.gguf_writer.add_tensor("output.weight", data)
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
@Model.register("InternLM2ForCausalLM") @Model.register("InternLM2ForCausalLM")
@ -2249,7 +2311,7 @@ class InternLM2Model(Model):
toktypes: list[int] = [] toktypes: list[int] = []
if not tokenizer_path.is_file(): if not tokenizer_path.is_file():
print(f'Error: Missing {tokenizer_path}', file=sys.stderr) logger.error(f'Error: Missing {tokenizer_path}')
sys.exit(1) sys.exit(1)
sentencepiece_model = model.ModelProto() sentencepiece_model = model.ModelProto()
@ -2266,7 +2328,7 @@ class InternLM2Model(Model):
if text == b"\x00": if text == b"\x00":
# (TODO): fixme # (TODO): fixme
# Hack here and replace the \x00 characters. # Hack here and replace the \x00 characters.
print(f"InternLM2 convert token '{text}' to '🐉'!") logger.debug(f"InternLM2 convert token '{text}' to '🐉'!")
text = "🐉" text = "🐉"
toktype = SentencePieceTokenTypes.NORMAL toktype = SentencePieceTokenTypes.NORMAL
@ -2294,6 +2356,7 @@ class InternLM2Model(Model):
toktypes.append(SentencePieceTokenTypes.USER_DEFINED) toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_token_types(toktypes)
@ -2306,7 +2369,7 @@ class InternLM2Model(Model):
# TODO: this is a hack, should be fixed # TODO: this is a hack, should be fixed
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
in chat mode so that the conversation can end normally.") in chat mode so that the conversation can end normally.")
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
@ -2351,8 +2414,7 @@ in chat mode so that the conversation can end normally.")
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -2369,7 +2431,7 @@ in chat mode so that the conversation can end normally.")
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
def write_tensors(self): def write_tensors(self):
@ -2443,7 +2505,7 @@ class BertModel(Model):
self.gguf_writer.add_pooling_type(pooling_type) self.gguf_writer.add_pooling_type(pooling_type)
def set_vocab(self): def set_vocab(self):
tokens, toktypes = self.get_basic_vocab() tokens, toktypes, tokpre = self.get_vocab_base()
self.vocab_size = len(tokens) self.vocab_size = len(tokens)
# we need this to validate the size of the token_type embeddings # we need this to validate the size of the token_type embeddings
@ -2461,6 +2523,7 @@ class BertModel(Model):
# add vocab to gguf # add vocab to gguf
self.gguf_writer.add_tokenizer_model("bert") self.gguf_writer.add_tokenizer_model("bert")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_token_types(toktypes)
@ -2479,8 +2542,11 @@ class BertModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy() data = data_torch.squeeze().numpy()
n_dims = len(data.shape) n_dims = len(data.shape)
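
The dtype guard added above exists because torch.Tensor.numpy() only accepts a limited set of dtypes; a checkpoint stored in bfloat16, for example, would otherwise fail at the squeeze().numpy() call. A hedged, standalone sketch of the failure it avoids (tensor shape is illustrative):

# Hedged sketch: .numpy() rejects bfloat16, so convert unsupported dtypes first.
import torch

data_torch = torch.zeros(4, 8, dtype=torch.bfloat16)  # illustrative tensor
if data_torch.dtype not in (torch.float16, torch.float32):
    data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()  # now safe: a float32 ndarray
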
@ -2496,7 +2562,7 @@ class BertModel(Model):
# if f32 desired, convert any float16 to float32 # if f32 desired, convert any float16 to float32
new_dtype = np.float32 new_dtype = np.float32
print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
if data.dtype != new_dtype: if data.dtype != new_dtype:
data = data.astype(new_dtype) data = data.astype(new_dtype)
@ -2575,7 +2641,7 @@ class GemmaModel(Model):
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
# To prevent errors, skip loading lm_head.weight. # To prevent errors, skip loading lm_head.weight.
if name == "lm_head.weight": if name == "lm_head.weight":
print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
continue continue
old_dtype = data_torch.dtype old_dtype = data_torch.dtype
@ -2592,8 +2658,7 @@ class GemmaModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -2604,7 +2669,7 @@ class GemmaModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -2632,12 +2697,15 @@ class MambaModel(Model):
else: else:
# Use the GPT-NeoX tokenizer when no tokenizer files are present # Use the GPT-NeoX tokenizer when no tokenizer files are present
tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
neox_reader = gguf.GGUFReader(tokenizer_path, "r") neox_reader = gguf.GGUFReader(tokenizer_path, "r")
field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1])) self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]))
field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
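
The two lines added above copy the pre-tokenizer name out of the reference vocab GGUF in the same way the tokenizer model is copied. A hedged sketch of that read path in isolation, assuming gguf-py is importable and that the reference file exists at the illustrative path below:

# Hedged sketch: read tokenizer model and pre-tokenizer name from an existing GGUF.
import gguf

reader = gguf.GGUFReader("models/ggml-vocab-gpt-neox.gguf", "r")  # path illustrative
model_field = reader.get_field(gguf.Keys.Tokenizer.MODEL)
pre_field = reader.get_field(gguf.Keys.Tokenizer.PRE)
tokenizer_model = bytes(model_field.parts[-1]).decode("utf-8")  # tokenizer model name
tokenizer_pre = bytes(pre_field.parts[-1]).decode("utf-8")      # pre-tokenizer name
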
@ -2701,17 +2769,16 @@ class MambaModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
if name.endswith(".A_log"): if name.endswith(".A_log"):
print("A_log --> A ==> " + new_name) logger.debug("A_log --> A ==> " + new_name)
data_torch = -torch.exp(data_torch) data_torch = -torch.exp(data_torch)
# assuming token_embd.weight is seen before output.weight # assuming token_embd.weight is seen before output.weight
if tok_embd is not None and new_name == output_name: if tok_embd is not None and new_name == output_name:
if torch.equal(tok_embd, data_torch): if torch.equal(tok_embd, data_torch):
print(f"{output_name} is equivalent to {tok_embd_name}, omitting") logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting")
continue continue
if new_name == tok_embd_name: if new_name == tok_embd_name:
tok_embd = data_torch tok_embd = data_torch
@ -2734,7 +2801,7 @@ class MambaModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -2794,8 +2861,7 @@ class OlmoModel(Model):
# map tensor names # map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
print(f"Can not map tensor {name!r}") raise ValueError(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
@ -2812,7 +2878,7 @@ class OlmoModel(Model):
if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2: if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@ -2844,6 +2910,8 @@ def parse_args() -> argparse.Namespace:
help="directory containing model file", help="directory containing model file",
) )
parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)") parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
parser.add_argument("--model-name", type=str, default=None, help="name of the model")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
return parser.parse_args() return parser.parse_args()
@ -2851,6 +2919,8 @@ def parse_args() -> argparse.Namespace:
def main() -> None: def main() -> None:
args = parse_args() args = parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
dir_model = args.model dir_model = args.model
if args.awq_path: if args.awq_path:
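
This logging.basicConfig(...) call, driven by the new --verbose flag, is the setup this merge applies to every conversion script: info-level messages always show, debug-level messages only with --verbose. A minimal standalone sketch of the same pattern (script name and messages are illustrative):

# Hedged sketch of the per-script logging setup introduced in this merge.
import argparse
import logging

logger = logging.getLogger("convert-example")  # logger name is illustrative

parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()

logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
logger.info("always shown")
logger.debug("only shown when --verbose is passed")
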
@ -2859,15 +2929,15 @@ def main() -> None:
tmp_model_path = args.model / "weighted_model" tmp_model_path = args.model / "weighted_model"
dir_model = tmp_model_path dir_model = tmp_model_path
if tmp_model_path.is_dir(): if tmp_model_path.is_dir():
print(f"{tmp_model_path} exists as a weighted model.") logger.info(f"{tmp_model_path} exists as a weighted model.")
else: else:
tmp_model_path.mkdir(parents=True, exist_ok=True) tmp_model_path.mkdir(parents=True, exist_ok=True)
print("Saving new weighted model ...") logger.info("Saving new weighted model ...")
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
print(f"Saved weighted model at {tmp_model_path}.") logger.info(f"Saved weighted model at {tmp_model_path}.")
if not dir_model.is_dir(): if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file=sys.stderr) logger.error(f'Error: {args.model} is not a directory')
sys.exit(1) sys.exit(1)
ftype_map = { ftype_map = {
@ -2881,7 +2951,7 @@ def main() -> None:
# output in the same directory as the model by default # output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
print(f"Loading model: {dir_model.name}") logger.info(f"Loading model: {dir_model.name}")
hparams = Model.load_hparams(dir_model) hparams = Model.load_hparams(dir_model)
@ -2889,20 +2959,20 @@ def main() -> None:
model_class = Model.from_model_architecture(hparams["architectures"][0]) model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file) model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
print("Set model parameters") logger.info("Set model parameters")
model_instance.set_gguf_parameters() model_instance.set_gguf_parameters()
print("Set model tokenizer") logger.info("Set model tokenizer")
model_instance.set_vocab() model_instance.set_vocab()
if args.vocab_only: if args.vocab_only:
print(f"Exporting model vocab to '{fname_out}'") logger.info(f"Exporting model vocab to '{fname_out}'")
model_instance.write_vocab() model_instance.write_vocab()
else: else:
print(f"Exporting model to '{fname_out}'") logger.info(f"Exporting model to '{fname_out}'")
model_instance.write() model_instance.write()
print(f"Model successfully exported to '{fname_out}'") logger.info(f"Model successfully exported to '{fname_out}'")
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from __future__ import annotations from __future__ import annotations
import logging
import argparse import argparse
import os import os
import struct import struct
@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
logger = logging.getLogger("ggml-to-gguf")
class GGMLFormat(IntEnum): class GGMLFormat(IntEnum):
GGML = 0 GGML = 0
@ -125,7 +128,6 @@ class Tensor:
self.start_offset = offset self.start_offset = offset
self.len_bytes = n_bytes self.len_bytes = n_bytes
offset += n_bytes offset += n_bytes
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset return offset - orig_offset
@ -175,7 +177,7 @@ class GGMLModel:
offset += self.validate_header(data, offset) offset += self.validate_header(data, offset)
hp = Hyperparameters() hp = Hyperparameters()
offset += hp.load(data, offset) offset += hp.load(data, offset)
print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}') logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
self.validate_conversion(hp.ftype) self.validate_conversion(hp.ftype)
vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML) vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
offset += vocab.load(data, offset, hp.n_vocab) offset += vocab.load(data, offset, hp.n_vocab)
@ -215,12 +217,12 @@ class GGMLToGGUF:
if float(hp.n_head) / float(x) == gqa: if float(hp.n_head) / float(x) == gqa:
n_kv_head = x n_kv_head = x
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param" assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}') logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
self.n_kv_head = n_kv_head self.n_kv_head = n_kv_head
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer) self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
def save(self): def save(self):
print('* Preparing to save GGUF file') logger.info('* Preparing to save GGUF file')
gguf_writer = gguf.GGUFWriter( gguf_writer = gguf.GGUFWriter(
self.cfg.output, self.cfg.output,
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
@ -230,11 +232,11 @@ class GGMLToGGUF:
if self.special_vocab is not None: if self.special_vocab is not None:
self.special_vocab.add_to_gguf(gguf_writer) self.special_vocab.add_to_gguf(gguf_writer)
self.add_tensors(gguf_writer) self.add_tensors(gguf_writer)
print(" gguf: write header") logger.info(" gguf: write header")
gguf_writer.write_header_to_file() gguf_writer.write_header_to_file()
print(" gguf: write metadata") logger.info(" gguf: write metadata")
gguf_writer.write_kv_data_to_file() gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors") logger.info(" gguf: write tensors")
gguf_writer.write_tensors_to_file() gguf_writer.write_tensors_to_file()
gguf_writer.close() gguf_writer.close()
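
save() above keeps the GGUF write order intact while switching to logger calls: header first, then key/value metadata, then tensor payloads, then close. A hedged sketch of the same sequence against a throwaway writer, using only GGUFWriter calls that appear in this diff (file name, model name and tensor are illustrative):

# Hedged sketch of the GGUFWriter call order used by save() above.
import numpy as np
import gguf

writer = gguf.GGUFWriter("example.gguf", gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA])
writer.add_name("example-model")  # illustrative metadata
writer.add_tensor("token_embd.weight", np.zeros((4, 4), dtype=np.float32))
writer.write_header_to_file()   # 1) header: magic, version, counts
writer.write_kv_data_to_file()  # 2) key/value metadata
writer.write_tensors_to_file()  # 3) tensor data
writer.close()
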
@ -250,7 +252,7 @@ class GGMLToGGUF:
name = cfg.name if cfg.name is not None else cfg.input.name name = cfg.name if cfg.name is not None else cfg.input.name
except UnicodeDecodeError: except UnicodeDecodeError:
name = None name = None
print('* Adding model parameters and KV items') logger.info('* Adding model parameters and KV items')
if name is not None: if name is not None:
gguf_writer.add_name(name) gguf_writer.add_name(name)
gguf_writer.add_description(desc) gguf_writer.add_description(desc)
@ -281,12 +283,13 @@ class GGMLToGGUF:
def add_vocab(self, gguf_writer): def add_vocab(self, gguf_writer):
hp = self.model.hyperparameters hp = self.model.hyperparameters
gguf_writer.add_tokenizer_model('llama') gguf_writer.add_tokenizer_model('llama')
gguf_writer.add_tokenizer_pre('default')
tokens = [] tokens = []
scores = [] scores = []
toktypes = [] toktypes = []
if self.vocab_override is not None: if self.vocab_override is not None:
vo = self.vocab_override vo = self.vocab_override
print('* Adding vocab item(s)') logger.info('* Adding vocab item(s)')
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes) tokens.append(vbytes)
scores.append(score) scores.append(score)
@ -298,7 +301,7 @@ class GGMLToGGUF:
if len(toktypes) > 0: if len(toktypes) > 0:
gguf_writer.add_token_types(toktypes) gguf_writer.add_token_types(toktypes)
return return
print(f'* Adding {hp.n_vocab} vocab item(s)') logger.info(f'* Adding {hp.n_vocab} vocab item(s)')
assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab' assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items): for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
tt = 1 # Normal tt = 1 # Normal
@ -333,7 +336,7 @@ class GGMLToGGUF:
def add_tensors(self, gguf_writer): def add_tensors(self, gguf_writer):
tensor_map = self.name_map tensor_map = self.name_map
data = self.data data = self.data
print(f'* Adding {len(self.model.tensors)} tensor(s)') logger.info(f'* Adding {len(self.model.tensors)} tensor(s)')
for tensor in self.model.tensors: for tensor in self.model.tensors:
name = str(tensor.name, 'UTF-8') name = str(tensor.name, 'UTF-8')
mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
@ -343,7 +346,6 @@ class GGMLToGGUF:
temp = tempdims[1] temp = tempdims[1]
tempdims[1] = tempdims[0] tempdims[1] = tempdims[0]
tempdims[0] = temp tempdims[0] = temp
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
gguf_writer.add_tensor( gguf_writer.add_tensor(
mapped_name, mapped_name,
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
@ -400,33 +402,35 @@ def handle_args():
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", default="spm,hfft", parser.add_argument("--vocabtype", default="spm,hfft",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)") help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
return parser.parse_args() return parser.parse_args()
def main(): def main():
cfg = handle_args() cfg = handle_args()
print(f'* Using config: {cfg}') logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO)
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n') logger.info(f'* Using config: {cfg}')
logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===')
if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'): if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".') logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
data = np.memmap(cfg.input, mode = 'r') data = np.memmap(cfg.input, mode = 'r')
model = GGMLModel() model = GGMLModel()
print('* Scanning GGML input file') logger.info('* Scanning GGML input file')
offset = model.load(data, 0) # noqa offset = model.load(data, 0) # noqa
print(f'* GGML model hyperparameters: {model.hyperparameters}') logger.info(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None vocab_override = None
params_override = None params_override = None
special_vocab = None special_vocab = None
if cfg.model_metadata_dir is not None: if cfg.model_metadata_dir is not None:
(params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters) (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.') logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
print(f'* Overriding params: {params_override}') logger.info(f'* Overriding params: {params_override}')
print(f'* Overriding vocab: {vocab_override}') logger.info(f'* Overriding vocab: {vocab_override}')
print(f'* Special vocab: {special_vocab}') logger.info(f'* Special vocab: {special_vocab}')
else: else:
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') logger.warning('=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===')
if model.file_format == GGMLFormat.GGML: if model.file_format == GGMLFormat.GGML:
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
converter = GGMLToGGUF( converter = GGMLToGGUF(
model, data, cfg, model, data, cfg,
params_override = params_override, params_override = params_override,
@ -434,7 +438,7 @@ def main():
special_vocab = special_vocab special_vocab = special_vocab
) )
converter.save() converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}') logger.info(f'* Successful completion. Output saved to: {cfg.output}')
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from __future__ import annotations from __future__ import annotations
import logging
import json import json
import os import os
import struct import struct
@ -15,6 +16,9 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf import gguf
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("lora-to-gguf")
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
@ -48,11 +52,9 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
if __name__ == '__main__': if __name__ == '__main__':
if len(sys.argv) < 2: if len(sys.argv) < 2:
print(f"Usage: python {sys.argv[0]} <path> [arch]") logger.info(f"Usage: python {sys.argv[0]} <path> [arch]")
print( logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'")
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
)
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
sys.exit(1) sys.exit(1)
input_json = os.path.join(sys.argv[1], "adapter_config.json") input_json = os.path.join(sys.argv[1], "adapter_config.json")
@ -70,7 +72,7 @@ if __name__ == '__main__':
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values(): if arch_name not in gguf.MODEL_ARCH_NAMES.values():
print(f"Error: unsupported architecture {arch_name}") logger.error(f"Error: unsupported architecture {arch_name}")
sys.exit(1) sys.exit(1)
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)] arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
@ -80,21 +82,21 @@ if __name__ == '__main__':
params = json.load(f) params = json.load(f)
if params["peft_type"] != "LORA": if params["peft_type"] != "LORA":
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
sys.exit(1) sys.exit(1)
if params["fan_in_fan_out"] is True: if params["fan_in_fan_out"] is True:
print("Error: param fan_in_fan_out is not supported") logger.error("Error: param fan_in_fan_out is not supported")
sys.exit(1) sys.exit(1)
if params["bias"] is not None and params["bias"] != "none": if params["bias"] is not None and params["bias"] != "none":
print("Error: param bias is not supported") logger.error("Error: param bias is not supported")
sys.exit(1) sys.exit(1)
# TODO: these seem to be layers that have been trained but without lora. # TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported # doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0: if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
print("Error: param modules_to_save is not supported") logger.error("Error: param modules_to_save is not supported")
sys.exit(1) sys.exit(1)
with open(output_path, "wb") as fout: with open(output_path, "wb") as fout:
@ -125,13 +127,13 @@ if __name__ == '__main__':
suffix = k[-len(lora_suffixes[0]):] suffix = k[-len(lora_suffixes[0]):]
k = k[: -len(lora_suffixes[0])] k = k[: -len(lora_suffixes[0])]
else: else:
print(f"Error: unrecognized tensor name {orig_k}") logger.error(f"Error: unrecognized tensor name {orig_k}")
sys.exit(1) sys.exit(1)
tname = name_map.get_name(k) tname = name_map.get_name(k)
if tname is None: if tname is None:
print(f"Error: could not map tensor name {orig_k}") logger.error(f"Error: could not map tensor name {orig_k}")
print(" Note: the arch parameter must be specified if the model is not llama") logger.error(" Note: the arch parameter must be specified if the model is not llama")
sys.exit(1) sys.exit(1)
if suffix == ".lora_A.weight": if suffix == ".lora_A.weight":
@ -141,8 +143,8 @@ if __name__ == '__main__':
else: else:
assert False assert False
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype) write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout) t.tofile(fout)
print(f"Converted {input_json} and {input_model} to {output_path}") logger.info(f"Converted {input_json} and {input_model} to {output_path}")

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from __future__ import annotations from __future__ import annotations
import logging
import argparse import argparse
import os import os
import sys import sys
@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
logger = logging.getLogger("persimmon-to-gguf")
def _flatten_dict(dct, tensors, prefix=None): def _flatten_dict(dct, tensors, prefix=None):
assert isinstance(dct, dict) assert isinstance(dct, dict)
@ -30,9 +33,9 @@ def _flatten_dict(dct, tensors, prefix=None):
def _get_sentencepiece_tokenizer_info(dir_model: Path): def _get_sentencepiece_tokenizer_info(dir_model: Path):
tokenizer_path = dir_model / 'adept_vocab.model' tokenizer_path = dir_model / 'adept_vocab.model'
print('gguf: getting sentencepiece tokenizer from', tokenizer_path) logger.info(f'getting sentencepiece tokenizer from {tokenizer_path}')
tokenizer = SentencePieceProcessor(str(tokenizer_path)) tokenizer = SentencePieceProcessor(str(tokenizer_path))
print('gguf: adding tokens') logger.info('adding tokens')
tokens: list[bytes] = [] tokens: list[bytes] = []
scores: list[float] = [] scores: list[float] = []
toktypes: list[int] = [] toktypes: list[int] = []
@ -68,7 +71,9 @@ def main():
parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file") parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release") parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory") parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args() args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
sys.path.append(str(args.adept_inference_dir)) sys.path.append(str(args.adept_inference_dir))
persimmon_model = torch.load(args.ckpt_path) persimmon_model = torch.load(args.ckpt_path)
hparams = persimmon_model['args'] hparams = persimmon_model['args']
@ -99,6 +104,7 @@ def main():
tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir) tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
gguf_writer.add_tokenizer_model('llama') gguf_writer.add_tokenizer_model('llama')
gguf_writer.add_tokenizer_pre('default')
gguf_writer.add_token_list(tokens) gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores) gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes) gguf_writer.add_token_types(toktypes)
@ -106,7 +112,7 @@ def main():
gguf_writer.add_eos_token_id(71013) gguf_writer.add_eos_token_id(71013)
tensor_map = gguf.get_tensor_name_map(arch, block_count) tensor_map = gguf.get_tensor_name_map(arch, block_count)
print(tensor_map) logger.info(tensor_map)
for name in tensors.keys(): for name in tensors.keys():
data_torch = tensors[name] data_torch = tensors[name]
if name.endswith(".self_attention.rotary_emb.inv_freq"): if name.endswith(".self_attention.rotary_emb.inv_freq"):
@ -116,22 +122,21 @@ def main():
data = data_torch.to(torch.float32).squeeze().numpy() data = data_torch.to(torch.float32).squeeze().numpy()
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None: if new_name is None:
print("Can not map tensor '" + name + "'") raise ValueError(f"Can not map tensor '{name}'")
sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}")
gguf_writer.add_tensor(new_name, data) gguf_writer.add_tensor(new_name, data)
print("gguf: write header") logger.info("gguf: write header")
gguf_writer.write_header_to_file() gguf_writer.write_header_to_file()
print("gguf: write metadata") logger.info("gguf: write metadata")
gguf_writer.write_kv_data_to_file() gguf_writer.write_kv_data_to_file()
print("gguf: write tensors") logger.info("gguf: write tensors")
gguf_writer.write_tensors_to_file() gguf_writer.write_tensors_to_file()
gguf_writer.close() gguf_writer.close()
print(f"gguf: model successfully exported to '{args.outfile}'") logger.info(f"gguf: model successfully exported to '{args.outfile}'")
print("")
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from __future__ import annotations from __future__ import annotations
import logging
import argparse import argparse
import concurrent.futures import concurrent.futures
import enum import enum
@ -35,6 +36,8 @@ import gguf
if TYPE_CHECKING: if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias from typing_extensions import Self, TypeAlias
logger = logging.getLogger("convert")
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
faulthandler.register(signal.SIGUSR1) faulthandler.register(signal.SIGUSR1)
@ -643,7 +646,6 @@ class LlamaHfVocab(Vocab):
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
# print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
if n_head_kv is not None and n_head != n_head_kv: if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@ -1033,12 +1035,12 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)
# Check for a vocab size mismatch # Check for a vocab size mismatch
if params.n_vocab == vocab.vocab_size: if params.n_vocab == vocab.vocab_size:
print("Ignoring added_tokens.json since model matches vocab size without it.") logger.warning("Ignoring added_tokens.json since model matches vocab size without it.")
return return
if pad_vocab and params.n_vocab > vocab.vocab_size: if pad_vocab and params.n_vocab > vocab.vocab_size:
pad_count = params.n_vocab - vocab.vocab_size pad_count = params.n_vocab - vocab.vocab_size
print( logger.debug(
f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>" f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
) )
for i in range(1, pad_count + 1): for i in range(1, pad_count + 1):
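
The padding branch above fills the gap between the model's declared n_vocab and the tokenizer's actual size with numbered dummy tokens; the loop body itself is cut off by the diff context. The sketch below only illustrates how the <dummy00001>-style names from the log message are formed, not the script's actual implementation:

# Hedged sketch: form the dummy token names mentioned in the log message above.
params_n_vocab = 32016  # illustrative
vocab_size = 32000      # illustrative
pad_count = params_n_vocab - vocab_size
pad_tokens = [f"<dummy{i:05}>" for i in range(1, pad_count + 1)]
# -> ["<dummy00001>", ..., "<dummy00016>"]
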
@ -1166,7 +1168,7 @@ class OutputFile:
elapsed = time.time() - start elapsed = time.time() - start
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model))) padi = len(str(len(model)))
print( logger.info(
f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
) )
self.gguf.write_tensor_data(ndarray) self.gguf.write_tensor_data(ndarray)
@ -1281,12 +1283,12 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
# HF models permute or pack some of the tensors, so we need to undo that # HF models permute or pack some of the tensors, so we need to undo that
for i in itertools.count(): for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model: if f"model.layers.{i}.self_attn.q_proj.weight" in model:
print(f"Permuting layer {i}") logger.debug(f"Permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
# tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model: elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
print(f"Unpacking and permuting layer {i}") logger.debug(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
@ -1299,15 +1301,15 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
if name_new is None: if name_new is None:
if skip_unknown: if skip_unknown:
print(f"Unexpected tensor name: {name} - skipping") logger.warning(f"Unexpected tensor name: {name} - skipping")
continue continue
raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)") raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
if tensor_type in should_skip: if tensor_type in should_skip:
print(f"skipping tensor {name_new}") logger.debug(f"skipping tensor {name_new}")
continue continue
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
out[name_new] = lazy_tensor out[name_new] = lazy_tensor
return out return out
@ -1372,7 +1374,7 @@ def load_some_model(path: Path) -> ModelPlus:
paths = find_multifile_paths(path) paths = find_multifile_paths(path)
models_plus: list[ModelPlus] = [] models_plus: list[ModelPlus] = []
for path in paths: for path in paths:
print(f"Loading model file {path}") logger.info(f"Loading model file {path}")
models_plus.append(lazy_load_file(path)) models_plus.append(lazy_load_file(path))
model_plus = merge_multifile_models(models_plus) model_plus = merge_multifile_models(models_plus)
@ -1413,7 +1415,7 @@ class VocabFactory:
else: else:
raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}") raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}") logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
return vocab return vocab
def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]: def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
@ -1438,19 +1440,19 @@ def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
}[file_type] }[file_type]
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
if ret in model_paths: if ret in model_paths:
sys.stderr.write( logger.error(
f"Error: Default output path ({ret}) would overwrite the input. " f"Error: Default output path ({ret}) would overwrite the input. "
"Please explicitly specify a path using --outfile.\n") "Please explicitly specify a path using --outfile.")
sys.exit(1) sys.exit(1)
return ret return ret
def do_dump_model(model_plus: ModelPlus) -> None: def do_dump_model(model_plus: ModelPlus) -> None:
print(f"model_plus.paths = {model_plus.paths!r}") print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100
print(f"model_plus.format = {model_plus.format!r}") print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100
print(f"model_plus.vocab = {model_plus.vocab!r}") print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100
for name, lazy_tensor in model_plus.model.items(): for name, lazy_tensor in model_plus.model.items():
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100
def main(args_in: list[str] | None = None) -> None: def main(args_in: list[str] | None = None) -> None:
@ -1473,8 +1475,18 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args(args_in) args = parser.parse_args(args_in)
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
elif args.dump_single or args.dump:
# Avoid printing anything besides the dump output
logging.basicConfig(level=logging.WARNING)
else:
logging.basicConfig(level=logging.INFO)
if args.no_vocab and args.vocab_only: if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab") raise ValueError("--vocab-only does not make sense with --no-vocab")
@ -1491,6 +1503,7 @@ def main(args_in: list[str] | None = None) -> None:
if args.dump: if args.dump:
do_dump_model(model_plus) do_dump_model(model_plus)
return return
endianess = gguf.GGUFEndian.LITTLE endianess = gguf.GGUFEndian.LITTLE
if args.big_endian: if args.big_endian:
endianess = gguf.GGUFEndian.BIG endianess = gguf.GGUFEndian.BIG
@ -1513,7 +1526,7 @@ def main(args_in: list[str] | None = None) -> None:
"q8_0": GGMLFileType.MostlyQ8_0, "q8_0": GGMLFileType.MostlyQ8_0,
}[args.outtype] }[args.outtype]
print(f"params = {params}") logger.info(f"params = {params}")
model_parent_path = model_plus.paths[0].parent model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path) vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
@ -1528,15 +1541,14 @@ def main(args_in: list[str] | None = None) -> None:
outfile = args.outfile outfile = args.outfile
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess=endianess, pad_vocab=args.pad_vocab) endianess=endianess, pad_vocab=args.pad_vocab)
print(f"Wrote {outfile}") logger.info(f"Wrote {outfile}")
return return
if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab: if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
vocab = model_plus.vocab vocab = model_plus.vocab
print(f"Vocab info: {vocab}") logger.info(f"Vocab info: {vocab}")
print(f"Special vocab info: {special_vocab}") logger.info(f"Special vocab info: {special_vocab}")
model = model_plus.model model = model_plus.model
model = convert_model_names(model, params, args.skip_unknown) model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype) ftype = pick_output_type(model, args.outtype)
@ -1544,11 +1556,11 @@ def main(args_in: list[str] | None = None) -> None:
outfile = args.outfile or default_outfile(model_plus.paths, ftype) outfile = args.outfile or default_outfile(model_plus.paths, ftype)
params.ftype = ftype params.ftype = ftype
print(f"Writing {outfile}, format {ftype}") logger.info(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
print(f"Wrote {outfile}") logger.info(f"Wrote {outfile}")
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -32,7 +32,7 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (argc == 1 || argv[1][0] == '-') { if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]); printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n"); printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ; return 1 ;
@ -41,6 +41,7 @@ int main(int argc, char ** argv) {
int n_kv_max = 2048; int n_kv_max = 2048;
int n_batch = 2048; int n_batch = 2048;
int n_ubatch = 512; int n_ubatch = 512;
bool flash_attn = false;
int is_pp_shared = 0; int is_pp_shared = 0;
int n_gpu_layers = 0; int n_gpu_layers = 0;
@ -66,23 +67,27 @@ int main(int argc, char ** argv) {
} }
if (argc >= 6) { if (argc >= 6) {
is_pp_shared = std::atoi(argv[5]); flash_attn = std::atoi(argv[5]);
} }
if (argc >= 7) { if (argc >= 7) {
n_gpu_layers = std::atoi(argv[6]); is_pp_shared = std::atoi(argv[6]);
} }
if (argc >= 8) { if (argc >= 8) {
n_pp = parse_list(argv[7]); n_gpu_layers = std::atoi(argv[7]);
} }
if (argc >= 9) { if (argc >= 9) {
n_tg = parse_list(argv[8]); n_pp = parse_list(argv[8]);
} }
if (argc >= 10) { if (argc >= 10) {
n_pl = parse_list(argv[9]); n_tg = parse_list(argv[9]);
}
if (argc >= 11) {
n_pl = parse_list(argv[10]);
} }
// init LLM // init LLM
@ -112,6 +117,7 @@ int main(int argc, char ** argv) {
ctx_params.n_ctx = n_kv_max; ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = n_batch; ctx_params.n_batch = n_batch;
ctx_params.n_ubatch = n_ubatch; ctx_params.n_ubatch = n_ubatch;
ctx_params.flash_attn = flash_attn;
ctx_params.n_threads = params.n_threads; ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@ -169,7 +175,7 @@ int main(int argc, char ** argv) {
} }
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

View file

@ -32,6 +32,7 @@ struct split_params {
int n_split_tensors = 128; int n_split_tensors = 128;
std::string input; std::string input;
std::string output; std::string output;
bool no_tensor_first_split = false;
bool dry_run = false; bool dry_run = false;
}; };
@ -49,6 +50,7 @@ static void split_print_usage(const char * executable) {
printf(" --merge merge multiple GGUF to a single GGUF\n"); printf(" --merge merge multiple GGUF to a single GGUF\n");
printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors); printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
printf(" --split-max-size N(M|G) max size per split\n"); printf(" --split-max-size N(M|G) max size per split\n");
printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
printf(" --dry-run only print out a split plan and exit, without writing any new files\n"); printf(" --dry-run only print out a split plan and exit, without writing any new files\n");
printf("\n"); printf("\n");
} }
@ -100,6 +102,10 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
arg_found = true; arg_found = true;
params.dry_run = true; params.dry_run = true;
} }
if (arg == "--no-tensor-first-split") {
arg_found = true;
params.no_tensor_first_split = true;
}
if (is_op_set) { if (is_op_set) {
throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
@ -200,10 +206,10 @@ struct split_strategy {
// because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
int i_split = -1; int i_split = -1;
struct gguf_context * ctx_out = NULL; struct gguf_context * ctx_out = NULL;
auto new_ctx_out = [&]() { auto new_ctx_out = [&](bool allow_no_tensors) {
i_split++; i_split++;
if (ctx_out != NULL) { if (ctx_out != NULL) {
if (gguf_get_n_tensors(ctx_out) == 0) { if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n"); fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -220,7 +226,12 @@ struct split_strategy {
}; };
// initialize ctx_out for the first split // initialize ctx_out for the first split
new_ctx_out(); new_ctx_out(false);
// skip first split if no_tensor_first_split is set
if (params.no_tensor_first_split) {
new_ctx_out(true);
}
// process tensors one by one // process tensors one by one
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata) size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
@ -230,7 +241,7 @@ struct split_strategy {
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT); size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
size_t next_tensors_size = curr_tensors_size + n_bytes; size_t next_tensors_size = curr_tensors_size + n_bytes;
if (should_split(i, next_tensors_size)) { if (should_split(i, next_tensors_size)) {
new_ctx_out(); new_ctx_out(false);
curr_tensors_size = n_bytes; curr_tensors_size = n_bytes;
} else { } else {
curr_tensors_size = next_tensors_size; curr_tensors_size = next_tensors_size;

View file

@ -55,15 +55,15 @@ $MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
echo PASS echo PASS
echo echo
# 4. Split with no tensor in metadata # 4. Split with no tensors in the first split
#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors $SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
#echo PASS echo PASS
#echo echo
# 4b. Test the sharded model is loading properly # 4b. Test the sharded model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
#echo PASS echo PASS
#echo echo
# 5. Merge # 5. Merge
#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf #$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf

View file

@ -23,6 +23,7 @@ struct Stats {
}; };
struct StatParams { struct StatParams {
std::string dataset;
std::string ofile = "imatrix.dat"; std::string ofile = "imatrix.dat";
int n_output_frequency = 10; int n_output_frequency = 10;
int verbosity = 1; int verbosity = 1;
@ -46,7 +47,7 @@ private:
std::vector<float> m_src1_data; std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
// //
void save_imatrix(const char * file_name) const; void save_imatrix(const char * file_name, const char * dataset) const;
void keep_imatrix(int ncall) const; void keep_imatrix(int ncall) const;
}; };
@ -199,7 +200,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
} }
void IMatrixCollector::save_imatrix() const { void IMatrixCollector::save_imatrix() const {
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str()); save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
} }
void IMatrixCollector::keep_imatrix(int ncall) const { void IMatrixCollector::keep_imatrix(int ncall) const {
@ -207,24 +208,33 @@ void IMatrixCollector::keep_imatrix(int ncall) const {
if (file_name.empty()) file_name = "imatrix.dat"; if (file_name.empty()) file_name = "imatrix.dat";
file_name += ".at_"; file_name += ".at_";
file_name += std::to_string(ncall); file_name += std::to_string(ncall);
save_imatrix(file_name.c_str()); save_imatrix(file_name.c_str(), m_params.dataset.c_str());
} }
void IMatrixCollector::save_imatrix(const char * fname) const { void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
std::ofstream out(fname, std::ios::binary); std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size(); int n_entries = m_stats.size();
out.write((const char*)&n_entries, sizeof(n_entries)); out.write((const char *) &n_entries, sizeof(n_entries));
for (auto& p : m_stats) { for (const auto & p : m_stats) {
int len = p.first.size(); int len = p.first.size();
out.write((const char*)&len, sizeof(len)); out.write((const char *) &len, sizeof(len));
out.write(p.first.c_str(), len); out.write(p.first.c_str(), len);
out.write((const char*)&p.second.ncall, sizeof(p.second.ncall)); out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size(); int nval = p.second.values.size();
out.write((const char*)&nval, sizeof(nval)); out.write((const char *) &nval, sizeof(nval));
if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float)); if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
} }
// Write the number of calls the matrix was computed with
out.write((const char *) &m_last_call, sizeof(m_last_call));
// Write the dataset name at the end of the file to later on specify it in quantize
int n_dataset = strlen(dataset);
out.write((const char *) &n_dataset, sizeof(n_dataset));
out.write(dataset, n_dataset);
if (m_params.verbosity > 0) { if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname); fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
} }
} }
@ -547,6 +557,29 @@ int main(int argc, char ** argv) {
} }
} }
gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) {
return 1;
}
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
sparams.dataset = params.prompt_file;
g_collector.set_parameters(std::move(sparams)); g_collector.set_parameters(std::move(sparams));
if (!combine_files.empty()) { if (!combine_files.empty()) {
@ -585,28 +618,6 @@ int main(int argc, char ** argv) {
} }
} }
gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) {
return 1;
}
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);

View file

@ -174,9 +174,11 @@ struct cmd_params {
std::vector<llama_split_mode> split_mode; std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu; std::vector<int> main_gpu;
std::vector<bool> no_kv_offload; std::vector<bool> no_kv_offload;
std::vector<bool> flash_attn;
std::vector<std::vector<float>> tensor_split; std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap; std::vector<bool> use_mmap;
std::vector<bool> embeddings; std::vector<bool> embeddings;
ggml_numa_strategy numa;
int reps; int reps;
bool verbose; bool verbose;
output_formats output_format; output_formats output_format;
@ -195,9 +197,11 @@ static const cmd_params cmd_params_defaults = {
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0}, /* main_gpu */ {0},
/* no_kv_offload */ {false}, /* no_kv_offload */ {false},
/* flash_attn */ {false},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)}, /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true}, /* use_mmap */ {true},
/* embeddings */ {false}, /* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5, /* reps */ 5,
/* verbose */ false, /* verbose */ false,
/* output_format */ MARKDOWN /* output_format */ MARKDOWN
@ -220,7 +224,9 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n"); printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@ -393,6 +399,24 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
auto p = split<bool>(argv[i], split_delim); auto p = split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "--numa") {
if (++i >= argc) {
invalid_param = true;
break;
} else {
std::string value(argv[i]);
/**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
else { invalid_param = true; break; }
}
} else if (arg == "-fa" || arg == "--flash-attn") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") { } else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -477,6 +501,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
@ -498,6 +523,7 @@ struct cmd_params_instance {
llama_split_mode split_mode; llama_split_mode split_mode;
int main_gpu; int main_gpu;
bool no_kv_offload; bool no_kv_offload;
bool flash_attn;
std::vector<float> tensor_split; std::vector<float> tensor_split;
bool use_mmap; bool use_mmap;
bool embeddings; bool embeddings;
@ -532,6 +558,7 @@ struct cmd_params_instance {
cparams.type_k = type_k; cparams.type_k = type_k;
cparams.type_v = type_v; cparams.type_v = type_v;
cparams.offload_kqv = !no_kv_offload; cparams.offload_kqv = !no_kv_offload;
cparams.flash_attn = flash_attn;
cparams.embeddings = embeddings; cparams.embeddings = embeddings;
return cparams; return cparams;
@ -554,6 +581,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & tk : params.type_k) for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v) for (const auto & tv : params.type_v)
for (const auto & nkvo : params.no_kv_offload) for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn)
for (const auto & nt : params.n_threads) { for (const auto & nt : params.n_threads) {
for (const auto & n_prompt : params.n_prompt) { for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) { if (n_prompt == 0) {
@ -572,6 +600,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .split_mode = */ sm, /* .split_mode = */ sm,
/* .main_gpu = */ mg, /* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo, /* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .tensor_split = */ ts, /* .tensor_split = */ ts,
/* .use_mmap = */ mmp, /* .use_mmap = */ mmp,
/* .embeddings = */ embd, /* .embeddings = */ embd,
@ -596,6 +625,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .split_mode = */ sm, /* .split_mode = */ sm,
/* .main_gpu = */ mg, /* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo, /* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .tensor_split = */ ts, /* .tensor_split = */ ts,
/* .use_mmap = */ mmp, /* .use_mmap = */ mmp,
/* .embeddings = */ embd, /* .embeddings = */ embd,
@ -633,6 +663,7 @@ struct test {
llama_split_mode split_mode; llama_split_mode split_mode;
int main_gpu; int main_gpu;
bool no_kv_offload; bool no_kv_offload;
bool flash_attn;
std::vector<float> tensor_split; std::vector<float> tensor_split;
bool use_mmap; bool use_mmap;
bool embeddings; bool embeddings;
@ -657,6 +688,7 @@ struct test {
split_mode = inst.split_mode; split_mode = inst.split_mode;
main_gpu = inst.main_gpu; main_gpu = inst.main_gpu;
no_kv_offload = inst.no_kv_offload; no_kv_offload = inst.no_kv_offload;
flash_attn = inst.flash_attn;
tensor_split = inst.tensor_split; tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap; use_mmap = inst.use_mmap;
embeddings = inst.embeddings; embeddings = inst.embeddings;
@ -731,7 +763,7 @@ struct test {
"n_batch", "n_ubatch", "n_batch", "n_ubatch",
"n_threads", "type_k", "type_v", "n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode", "n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "main_gpu", "no_kv_offload", "flash_attn",
"tensor_split", "use_mmap", "embeddings", "tensor_split", "use_mmap", "embeddings",
"n_prompt", "n_gen", "test_time", "n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns", "avg_ns", "stddev_ns",
@ -753,7 +785,7 @@ struct test {
} }
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" || if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "use_mmap" || field == "embeddings") { field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL; return BOOL;
} }
if (field == "avg_ts" || field == "stddev_ts") { if (field == "avg_ts" || field == "stddev_ts") {
@ -787,7 +819,7 @@ struct test {
std::to_string(n_batch), std::to_string(n_ubatch), std::to_string(n_batch), std::to_string(n_ubatch),
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode), std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ns()), std::to_string(stdev_ns()),
@ -955,6 +987,9 @@ struct markdown_printer : public printer {
if (field == "no_kv_offload") { if (field == "no_kv_offload") {
return "nkvo"; return "nkvo";
} }
if (field == "flash_attn") {
return "fa";
}
if (field == "use_mmap") { if (field == "use_mmap") {
return "mmap"; return "mmap";
} }
@ -1001,6 +1036,9 @@ struct markdown_printer : public printer {
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
fields.emplace_back("no_kv_offload"); fields.emplace_back("no_kv_offload");
} }
if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
fields.emplace_back("flash_attn");
}
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
fields.emplace_back("tensor_split"); fields.emplace_back("tensor_split");
} }
@ -1191,6 +1229,7 @@ int main(int argc, char ** argv) {
llama_log_set(llama_null_log_callback, NULL); llama_log_set(llama_null_log_callback, NULL);
} }
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa);
// initialize printer // initialize printer
std::unique_ptr<printer> p; std::unique_ptr<printer> p;
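A hedged example exercising the newly added switches, assuming the usual `-m` model option and an illustrative model path: benchmark with flash attention both off and on, five repetitions each, and NUMA distribution enabled:

```sh
./llama-bench -m models/7B/ggml-model-q4_0.gguf -fa 0,1 -r 5 --numa distribute
```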

View file

@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
#define TN_POS_EMBD "%s.position_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd" #define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" #define TN_PATCH_EMBD "v.patch_embd.weight"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s" #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@ -425,6 +426,7 @@ struct clip_vision_model {
// embeddings // embeddings
struct ggml_tensor * class_embedding; struct ggml_tensor * class_embedding;
struct ggml_tensor * patch_embeddings; struct ggml_tensor * patch_embeddings;
struct ggml_tensor * patch_bias;
struct ggml_tensor * position_embeddings; struct ggml_tensor * position_embeddings;
struct ggml_tensor * pre_ln_w; struct ggml_tensor * pre_ln_w;
@ -501,6 +503,11 @@ struct clip_ctx {
bool use_gelu = false; bool use_gelu = false;
int32_t ftype = 1; int32_t ftype = 1;
bool has_class_embedding = true;
bool has_pre_norm = true;
bool has_post_norm = false;
bool has_patch_bias = false;
struct gguf_context * ctx_gguf; struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data; struct ggml_context * ctx_data;
@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
const int num_positions = num_patches + 1; const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
const int hidden_size = hparams.hidden_size; const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head; const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head; const int d_head = hidden_size / n_head;
@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
if (ctx->has_patch_bias) {
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
inp = ggml_add(ctx0, inp, model.patch_bias);
}
// concat class_embeddings and patch_embeddings // concat class_embeddings and patch_embeddings
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); struct ggml_tensor * embeddings = inp;
if (ctx->has_class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
ggml_set_name(embeddings, "embeddings"); ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings); ggml_set_input(embeddings);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions"); ggml_set_name(positions, "positions");
@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
// pre-layernorm // pre-layernorm
{ if (ctx->has_pre_norm) {
embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln"); ggml_set_name(embeddings, "pre_ln");
@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = cur; embeddings = cur;
} }
// post-layernorm
if (ctx->has_post_norm) {
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "post_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
}
// llava projector // llava projector
{ {
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@ -1149,11 +1171,38 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
try { try {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); new_clip->has_class_embedding = true;
} catch (const std::exception& e) {
new_clip->has_class_embedding = false;
}
try {
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
new_clip->has_pre_norm = true;
} catch (std::exception & e) {
new_clip->has_pre_norm = false;
}
try {
vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
new_clip->has_post_norm = true;
} catch (std::exception & e) {
new_clip->has_post_norm = false;
}
try {
vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
new_clip->has_patch_bias = true;
} catch (std::exception & e) {
new_clip->has_patch_bias = false;
}
try {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& e) { } catch(const std::exception& e) {
LOG_TEE("%s: failed to load vision model tensors\n", __func__); LOG_TEE("%s: failed to load vision model tensors\n", __func__);
} }

View file

@ -113,11 +113,11 @@ struct llava_context {
}; };
static void show_additional_info(int /*argc*/, char ** argv) { static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
} }
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) { static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
// load and preprocess the image // load and preprocess the image
llava_image_embed * embed = NULL; llava_image_embed * embed = NULL;
@ -133,9 +133,9 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
} }
params->prompt = remove_image_from_prompt(prompt); params->prompt = remove_image_from_prompt(prompt);
} else { } else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str()); embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
if (!embed) { if (!embed) {
LOG_TEE("%s: is %s really an image file?\n", __func__, params->image.c_str()); fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL; return NULL;
} }
} }
@ -207,17 +207,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
printf("\n"); printf("\n");
} }
static struct llama_model * llava_init(gpt_params * params) {
static struct llava_context * llava_init(gpt_params * params) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_backend_init(); llama_backend_init();
llama_numa_init(params->numa); llama_numa_init(params->numa);
@ -228,6 +218,19 @@ static struct llava_context * llava_init(gpt_params * params) {
LOG_TEE("%s: error: unable to load model\n" , __func__); LOG_TEE("%s: error: unable to load model\n" , __func__);
return NULL; return NULL;
} }
return model;
}
static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@ -286,15 +289,18 @@ int main(int argc, char ** argv) {
show_additional_info(argc, argv); show_additional_info(argc, argv);
return 1; return 1;
} }
auto model = llava_init(&params);
auto ctx_llava = llava_init(&params); if (model == NULL) {
if (ctx_llava == NULL) { fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
LOG_TEE("%s: error: failed to init llava\n", __func__);
return 1; return 1;
} }
auto image_embed = load_image(ctx_llava, &params); for (auto & image : params.image) {
auto ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) { if (!image_embed) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
return 1; return 1;
} }
@ -302,8 +308,11 @@ int main(int argc, char ** argv) {
process_prompt(ctx_llava, image_embed, &params, params.prompt); process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama); llama_print_timings(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed); llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava); llava_free(ctx_llava);
}
llama_free_model(model);
return 0; return 0;
} }

View file

@ -17,11 +17,9 @@ In this case, CLBlast was already installed so the CMake package is referenced i
```cmd ```cmd
git clone https://github.com/ggerganov/llama.cpp git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp cd llama.cpp
mkdir build cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
cd build cmake --build build --config Release
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64 cmake --install build --prefix C:/LlamaCPP
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
``` ```
### Build main-cmake-pkg ### Build main-cmake-pkg
@ -29,9 +27,7 @@ cmake --install . --prefix C:/LlamaCPP
```cmd ```cmd
cd ..\examples\main-cmake-pkg cd ..\examples\main-cmake-pkg
mkdir build cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cd build cmake --build build --config Release
cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64 cmake --install build --prefix C:/MyLlamaApp
cmake --build . --config Release
cmake --install . --prefix C:/MyLlamaApp
``` ```

View file

@ -66,7 +66,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models: In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote HTTP URL to download the file (e.g. https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote HTTP URL to download the file (e.g. https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.

View file

@ -324,7 +324,7 @@ int main(int argc, char ** argv) {
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size()); log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
// if we will use the cache for the full prompt without reaching the end of the cache, force // if we will use the cache for the full prompt without reaching the end of the cache, force
// reevaluation of the last token token to recalculate the cached logits // reevaluation of the last token to recalculate the cached logits
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
@ -544,7 +544,7 @@ int main(int argc, char ** argv) {
// if we run out of context: // if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past) // - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) { if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
if (params.n_predict == -2) { if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break; break;

View file

@ -1,8 +1,118 @@
# perplexity # Perplexity
TODO The `perplexity` example can be used to calculate the so-called perplexity value of a language model over a given text corpus.
Perplexity measures how well the model can predict the next token, with lower values being better.
Note that perplexity is **not** directly comparable between models, especially if they use different tokenizers.
Also note that finetunes typically result in a higher perplexity value even though the human-rated quality of outputs increases.
Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).
By default only the mean perplexity value and the corresponding uncertainty are calculated.
The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits and then applying error propagation.
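For reference, a sketch of the standard construction these values are consistent with: with $N$ scored tokens and $p_i$ the probability the model assigns to the $i$-th "correct" token, the perplexity is the exponential of the mean negative log-likelihood, and the reported uncertainty follows from propagating the standard error of that mean through the exponential (the program's chunked evaluation may differ in minor details):

$$
\mathrm{PPL} = \exp\!\left(-\frac{1}{N}\sum_{i=1}^{N}\ln p_i\right), \qquad \sigma_{\mathrm{PPL}} \approx \mathrm{PPL} \cdot \frac{\sigma_{\ln p}}{\sqrt{N}}
$$

where $\sigma_{\ln p}$ is the empirical standard deviation of the per-token log-probabilities.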
More statistics can be obtained by recording the logits from the FP16 version of a model.
To do this, supply `perplexity` with `--kl-divergence-base path/to/logit/binary/file.kld`.
The program will then record all logits and save them to the provided path in binary format.
**The logit file will be very large, 11 GiB for LLaMA 2 or 37 GiB for LLaMA 3 when using the Wikitext-2 test set.**
Once you have the file, supply `perplexity` with the quantized model, the logits file via `--kl-divergence-base`,
and finally the `--kl-divergence` argument to indicate that the program should calculate the so-called Kullback-Leibler divergence.
This is a measure of how similar the FP16 and the quantized logit distributions are, with a value of 0 indicating that the distributions are the same.
The uncertainty on the mean KL divergence is calculated by assuming the KL divergence per token follows a Gaussian distribution.
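Concretely, the two-step procedure might look as follows; the model and output file names are placeholders, and `-f` is the usual prompt-file option pointing at the text corpus (e.g. the Wikitext-2 test file):

```sh
# 1. Record the FP16 logits over the test corpus (produces a large binary file):
./perplexity -m ggml-model-f16.gguf -f wiki.test.raw --kl-divergence-base logits-f16.kld

# 2. Evaluate a quantized model against the recorded logits:
./perplexity -m ggml-model-q4_K_M.gguf -f wiki.test.raw --kl-divergence-base logits-f16.kld --kl-divergence
```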
In addition to the KL divergence the following statistics are calculated with `--kl-divergence`:
* Ratio of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated. The logarithm of this metric is also calculated and printed; it is 0 if the logit distributions are the same.
* Difference of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated.
* Mean change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse.
* Pearson correlation coefficient of the "correct" token probabilities between models.
* Percentiles of change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. Can be used to judge noise vs. quality loss from quantization. If the percentiles are symmetric then the quantization is essentially just adding noise. If the negative values are significantly larger than the positive values then this indicates that the model is actually becoming worse from the quantization.
* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated under the assumption that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggerganov/llama.cpp/discussions/2875.
* Same top p: Percentage of how often the token was assigned the highest probabilities by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution.
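For reference, the mean KL divergence reported above corresponds to the standard definition, sketched here with $p_i$ and $q_i$ denoting the FP16 and quantized next-token distributions at position $i$ over the vocabulary $V$ (the program's internal bookkeeping may differ in detail):

$$
\mathrm{KLD}_i = \sum_{v \in V} p_i(v)\,\ln\frac{p_i(v)}{q_i(v)}, \qquad \overline{\mathrm{KLD}} = \frac{1}{N}\sum_{i=1}^{N}\mathrm{KLD}_i
$$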
## LLaMA 3 8b Scoreboard
Results are sorted by Kullback-Leibler divergence relative to FP16.
The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).
| Quantization | imatrix | Model size [GiB] | PPL | ΔPPL | KLD | Mean Δp | RMS Δp |
|--------------|---------|------------------|------------------------|------------------------|-----------------------|-------------------|------------------|
| f16 | None | 14.97 | 6.233160 ± 0.037828 | - | - | - | - |
| q8_0 | None | 7.96 | 6.234284 ± 0.037878 | 0.002650 ± 0.001006 | 0.001355 ± 0.000006 | -0.019 ± 0.003 % | 1.198 ± 0.007 % |
| q6_K | None | 6.14 | 6.253382 ± 0.038078 | 0.021748 ± 0.001852 | 0.005452 ± 0.000035 | -0.007 ± 0.006 % | 2.295 ± 0.019 % |
| q5_K_M | None | 5.33 | 6.288607 ± 0.038338 | 0.056974 ± 0.002598 | 0.010762 ± 0.000079 | -0.114 ± 0.008 % | 3.160 ± 0.031 % |
| q5_K_S | None | 5.21 | 6.336598 ± 0.038755 | 0.104964 ± 0.003331 | 0.016595 ± 0.000122 | -0.223 ± 0.010 % | 3.918 ± 0.036 % |
| q5_1 | None | 5.65 | 6.337857 ± 0.038677 | 0.106223 ± 0.003476 | 0.018045 ± 0.000139 | -0.287 ± 0.011 % | 4.123 ± 0.039 % |
| q5_0 | None | 5.21 | 6.363224 ± 0.038861 | 0.131591 ± 0.003894 | 0.022239 ± 0.000166 | -0.416 ± 0.012 % | 4.634 ± 0.043 % |
| q4_K_M | WT 10m | 4.58 | 6.382937 ± 0.039055 | 0.151303 ± 0.004429 | 0.028152 ± 0.000240 | -0.389 ± 0.014 % | 5.251 ± 0.049 % |
| q4_K_M | None | 4.58 | 6.407115 ± 0.039119 | 0.175482 ± 0.004620 | 0.031273 ± 0.000238 | -0.596 ± 0.014 % | 5.519 ± 0.050 % |
| q4_K_S | WT 10m | 4.37 | 6.409697 ± 0.039189 | 0.178064 ± 0.004744 | 0.031951 ± 0.000259 | -0.531 ± 0.015 % | 5.645 ± 0.051 % |
| iq4_NL | WT 10m | 4.35 | 6.455593 ± 0.039630 | 0.223959 ± 0.005201 | 0.035742 ± 0.000288 | -0.590 ± 0.016 % | 5.998 ± 0.054 % |
| iq4_XS | WT 10m | 4.14 | 6.459705 ± 0.039595 | 0.228071 ± 0.005207 | 0.036334 ± 0.000284 | -0.668 ± 0.016 % | 6.044 ± 0.054 % |
| q4_K_S | None | 4.37 | 6.500529 ± 0.039778 | 0.268895 ± 0.005638 | 0.043136 ± 0.000314 | -0.927 ± 0.017 % | 6.562 ± 0.055 % |
| q4_1 | None | 4.78 | 6.682737 ± 0.041285 | 0.451103 ± 0.008030 | 0.071683 ± 0.000505 | -0.927 ± 0.017 % | 8.512 ± 0.063 % |
| q4_0 | None | 4.34 | 6.700147 ± 0.041226 | 0.468514 ± 0.007951 | 0.071940 ± 0.000491 | -1.588 ± 0.022 % | 8.434 ± 0.061 % |
| q3_K_L | WT 10m | 4.03 | 6.671223 ± 0.041427 | 0.439590 ± 0.008154 | 0.073077 ± 0.000529 | -0.940 ± 0.023 % | 8.662 ± 0.064 % |
| q3_K_M | WT 10m | 3.74 | 6.734255 ± 0.041838 | 0.502622 ± 0.008901 | 0.084358 ± 0.000588 | -1.198 ± 0.024 % | 9.292 ± 0.065 % |
| q3_K_L | None | 4.03 | 6.787876 ± 0.042104 | 0.556242 ± 0.009171 | 0.087176 ± 0.000614 | -1.532 ± 0.025 % | 9.432 ± 0.067 % |
| q3_K_M | None | 3.74 | 6.888498 ± 0.042669 | 0.656864 ± 0.010071 | 0.101913 ± 0.000677 | -1.990 ± 0.026 % | 10.203 ± 0.068 % |
| iq3_M | WT 10m | 3.53 | 6.898327 ± 0.041643 | 0.666694 ± 0.009449 | 0.102534 ± 0.000663 | -3.178 ± 0.026 % | 10.513 ± 0.066 % |
| iq3_S | WT 10m | 3.42 | 6.965501 ± 0.042406 | 0.733867 ± 0.010245 | 0.111278 ± 0.000710 | -3.066 ± 0.027 % | 10.845 ± 0.068 % |
| iq3_XS | WT 10m | 3.28 | 7.163043 ± 0.043772 | 0.931409 ± 0.012084 | 0.138693 ± 0.000857 | -3.667 ± 0.031 % | 12.148 ± 0.070 % |
| iq3_XXS | WT 10m | 3.05 | 7.458436 ± 0.046404 | 1.226803 ± 0.015234 | 0.183625 ± 0.001042 | -3.918 ± 0.035 % | 13.836 ± 0.074 % |
| q3_K_S | WT 10m | 3.41 | 7.602878 ± 0.046848 | 1.371244 ± 0.015688 | 0.199821 ± 0.001008 | -5.046 ± 0.037 % | 14.980 ± 0.070 % |
| q3_K_S | None | 3.41 | 7.863786 ± 0.048885 | 1.632152 ± 0.017733 | 0.228217 ± 0.001079 | -5.604 ± 0.038 % | 15.541 ± 0.070 % |
| iq2_M | WT 10m | 2.74 | 8.600799 ± 0.055124 | 2.369166 ± 0.025244 | 0.325989 ± 0.00160 | -6.463 ± 0.046 % | 18.519 ± 0.080 % |
| q2_K | WT 10k | 2.96 | 8.652290 ± 0.055572 | 2.420657 ± 0.025587 | 0.331393 ± 0.001562 | -6.606 ± 0.046 % | 18.790 ± 0.078 % |
| q2_K | WT 100k | 2.96 | 8.641993 ± 0.055406 | 2.410359 ± 0.025495 | 0.331672 ± 0.001569 | -6.628 ± 0.047 % | 18.856 ± 0.078 % |
| q2_K | WT 10m | 2.96 | 8.647825 ± 0.055610 | 2.416191 ± 0.025683 | 0.332223 ± 0.001572 | -6.500 ± 0.047 % | 18.881 ± 0.078 % |
| q2_K | WT 1m | 2.96 | 8.674365 ± 0.055743 | 2.442732 ± 0.025843 | 0.335308 ± 0.001576 | -6.634 ± 0.047 % | 19.009 ± 0.079 % |
| q2_K | WT 1k | 2.96 | 8.682605 ± 0.055916 | 2.450972 ± 0.026069 | 0.337093 ± 0.001596 | -6.596 ± 0.047 % | 18.977 ± 0.079 % |
| q2_K_S | WT 10m | 2.96 | 9.323778 ± 0.061551 | 3.092145 ± 0.031914 | 0.403360 ± 0.001787 | -7.131 ± 0.049 % | 20.050 ± 0.081 % |
| q2_K_S | WT 1m | 2.96 | 9.329321 ± 0.061378 | 3.097688 ± 0.031816 | 0.403590 ± 0.001797 | -7.289 ± 0.049 % | 20.123 ± 0.081 % |
| q2_K_S | WT 100k | 2.96 | 9.362973 ± 0.061740 | 3.131339 ± 0.032169 | 0.408367 ± 0.001802 | -7.198 ± 0.050 % | 20.132 ± 0.081 % |
| q2_K_S | WT 10k | 2.96 | 9.376479 ± 0.062045 | 3.144846 ± 0.032464 | 0.408662 ± 0.001819 | -7.141 ± 0.050 % | 20.120 ± 0.081 % |
| q2_K_S | WT 1k | 2.96 | 9.415200 ± 0.062475 | 3.183567 ± 0.032993 | 0.415865 ± 0.001846 | -7.153 ± 0.050 % | 20.311 ± 0.082 % |
| iq2_S | WT 10m | 2.56 | 9.650781 ± 0.063209 | 3.419148 ± 0.034017 | 0.439197 ± 0.001976 | -8.319 ± 0.052 % | 21.491 ± 0.083 % |
| q2_K | None | 2.96 | 9.751568 ± 0.063312 | 3.519934 ± 0.033863 | 0.445132 ± 0.001835 | -9.123 ± 0.051 % | 21.421 ± 0.079 % |
| iq2_XS | WT 10m | 2.43 | 10.761424 ± 0.071056 | 4.529791 ± 0.042229 | 0.546290 ± 0.002133 | -10.576 ± 0.056 % | 23.872 ± 0.082 % |
| iq2_XXS | WT 10m | 2.24 | 14.091782 ± 0.098396 | 7.860148 ± 0.070752 | 0.812022 ± 0.002741 | -14.363 ± 0.065 % | 28.576 ± 0.084 % |
| iq1_M | WT 10m | 2.01 | 25.493722 ± 0.177903 | 19.262089 ± 0.152396 | 1.393084 ± 0.003529 | -24.672 ± 0.077 % | 38.287 ± 0.084 % |
| iq1_S | WT 1m | 1.88 | 58.097760 ± 0.438604 | 51.866126 ± 0.416604 | 2.211278 ± 0.004688 | -32.471 ± 0.087 % | 46.418 ± 0.085 % |
| iq1_S | WT 1k | 1.88 | 58.267851 ± 0.446208 | 52.036218 ± 0.424373 | 2.214858 ± 0.004778 | -31.880 ± 0.089 % | 46.330 ± 0.086 % |
| iq1_S | WT 100k | 1.88 | 58.581498 ± 0.453145 | 52.349864 ± 0.431360 | 2.220834 ± 0.004818 | -32.261 ± 0.089 % | 46.002 ± 0.086 % |
| iq1_S | WT 10m | 1.88 | 60.694593 ± 0.471290 | 54.462959 ± 0.449644 | 2.254554 ± 0.004868 | -31.973 ± 0.088 % | 46.271 ± 0.086 % |
| iq1_S | WT 10k | 1.88 | 63.221324 ± 0.493077 | 56.989691 ± 0.471423 | 2.293527 ± 0.004885 | -32.261 ± 0.089 % | 46.562 ± 0.086 % |
There seems to be no consistent improvement from using more Wikitext tokens for the importance matrix.
Relative to the legacy quants, the K-quants score better on mean Δp than e.g. the KL divergence would suggest.
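As a rough illustration of how these per-token metrics relate, here is a minimal Python sketch that computes the KL divergence, Δp, and the "same top" check for a single token position; `logits_base` and `logits_q` are assumed to be 1-D NumPy arrays of vocabulary logits for the full-precision and quantized model, and `tok` the index of the reference token (the names are illustrative, not the actual `perplexity` implementation):

```python
import numpy as np

def token_metrics(logits_base, logits_q, tok):
    """KL divergence, Δp and top-token agreement for one token position."""
    # log-softmax of both distributions, written out for numerical stability
    logp_base = logits_base - logits_base.max()
    logp_base -= np.log(np.exp(logp_base).sum())
    logp_q = logits_q - logits_q.max()
    logp_q -= np.log(np.exp(logp_q).sum())

    p_base = np.exp(logp_base)
    kld = float(np.sum(p_base * (logp_base - logp_q)))             # KL(base || quantized)
    p_diff = float(np.exp(logp_q[tok]) - np.exp(logp_base[tok]))   # Δp of the reference token
    same_top = bool(logits_base.argmax() == logits_q.argmax())     # did the top token change?
    return kld, p_diff, same_top
```

Mean KLD, mean Δp and RMS Δp in the tables are then simply the average of `kld`, the average of `p_diff`, and `sqrt(mean(p_diff**2))` over all evaluated token positions.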
## LLaMA 2 vs. LLaMA 3 Quantization comparison
| Metric | L2 7b q2_K | L3 8b q2_K | L2 7b q4_K_M | L3 8b q4_K_M | L2 7b q6_K | L3 8b q6_K | L2 7b q8_0 | L3 8b q8_0 |
|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
| Mean PPL | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
| Mean PPL ratio | 1.107955 ± 0.001427 | 1.564849 ± 0.004525 | 1.014242 ± 0.000432 | 1.028160 ± 0.000723 | 1.002406 ± 0.000191 | 1.003490 ± 0.000296 | 1.000689 ± 0.000107 | 1.000425 ± 0.000161 |
| Mean ΔPPL | 0.625552 ± 0.008725 | 3.519934 ± 0.033863 | 0.082526 ± 0.002530 | 0.175482 ± 0.004620 | 0.013941 ± 0.001110 | 0.021748 ± 0.001852 | 0.003990 ± 0.000624 | 0.002650 ± 0.001006 |
| PPL correlation | 97.36% | 89.62% | 99.71% | 99.34% | 99.94% | 99.88% | 99.98% | 99.96% |
| Mean KLD | 0.108903 ± 0.000645 | 0.445132 ± 0.001835 | 0.012686 ± 0.000079 | 0.031273 ± 0.000238 | 0.002098 ± 0.000014 | 0.005452 ± 0.000035 | 0.000369 ± 0.000007 | 0.001355 ± 0.000006 |
| Mean Δp | -2.710 ± 0.023 % | -9.123 ± 0.051 % | -0.416 ± 0.008 % | -0.596 ± 0.014 % | -0.035 ± 0.003 % | -0.007 ± 0.006 % | -0.005 ± 0.002 % | -0.019 ± 0.003 % |
| Maximum Δp | 85.136% | 94.268% | 45.209% | 95.054% | 23.593% | 53.601% | 43.925% | 28.734% |
| 99.9% Δp | 37.184% | 50.003% | 17.461% | 27.084% | 7.798% | 13.613% | 3.387% | 6.402% |
| 99.0% Δp | 18.131% | 25.875% | 7.798% | 12.084% | 3.838% | 6.407% | 1.867% | 3.544% |
| Median Δp | -0.391% | -2.476% | -0.026% | -0.024% | -0.001% | 0.000% | -0.000% | -0.000% |
| 1.0% Δp | -39.762% | -87.173% | -11.433% | -19.567% | -4.222% | -6.767% | -1.862% | -3.698% |
| 0.1% Δp | -79.002% | -98.897% | -26.433% | -56.054% | -9.091% | -16.584% | -3.252% | -6.579% |
| Minimum Δp | -99.915% | -99.965% | -83.383% | -98.699% | -43.142% | -68.487% | -9.343% | -24.301% |
| RMS Δp | 9.762 ± 0.053 % | 21.421 ± 0.079 % | 3.252 ± 0.024 % | 5.519 ± 0.050 % | 1.339 ± 0.010 % | 2.295 ± 0.019 % | 0.618 ± 0.011 % | 1.198 ± 0.007 % |
| Same top p | 85.584 ± 0.086 % | 71.138 ± 0.119 % | 94.665 ± 0.055 % | 91.901 ± 0.072 % | 97.520 ± 0.038 % | 96.031 ± 0.051 % | 98.846 ± 0.026 % | 97.674 ± 0.040 % |
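The ± uncertainties on the derived rows (Mean PPL, Mean PPL ratio, Mean ΔPPL) follow from standard error propagation on the per-token log statistics; as a sketch, with σ denoting the standard error of a mean and Cov the covariance of the two log-PPL means:

```math
\sigma_{\mathrm{PPL}} \approx \mathrm{PPL}\cdot\sigma_{\ln\mathrm{PPL}},
\qquad
\sigma^2_{\ln\mathrm{PPL}(Q)-\ln\mathrm{PPL}(\mathrm{base})}
 = \sigma^2_{\ln\mathrm{PPL}(Q)} + \sigma^2_{\ln\mathrm{PPL}(\mathrm{base})}
 - 2\,\mathrm{Cov}\bigl(\ln\mathrm{PPL}(Q),\,\ln\mathrm{PPL}(\mathrm{base})\bigr)
```

The "PPL correlation" row is the corresponding correlation coefficient, i.e. this covariance divided by the product of the two standard errors.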
## Old Numbers
<details>
<summary>Llama 2 70B Scoreboard</summary>
## Llama 2 70B Scorechart
| Quantization | Model size (GiB) | Perplexity | Delta to fp16 | | Quantization | Model size (GiB) | Perplexity | Delta to fp16 |
|--------------|------------------|------------|---------------| |--------------|------------------|------------|---------------|
| Q4_0 | 36.20 | 3.5550 | 3.61% | | Q4_0 | 36.20 | 3.5550 | 3.61% |
@ -18,3 +128,5 @@ TODO
| Q5_K_M | 45.41 | 3.4451 | 0.40% | | Q5_K_M | 45.41 | 3.4451 | 0.40% |
| Q6_K | 52.70 | 3.4367 | 0.16% | | Q6_K | 52.70 | 3.4367 | 0.16% |
| fp16 | 128.5 | 3.4313 | - | | fp16 | 128.5 | 3.4313 | - |
</details>

View file

@ -216,17 +216,22 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
} }
struct kl_divergence_result { struct kl_divergence_result {
double sum_nll = 0; double sum_nll = 0.0;
double sum_nll2 = 0; double sum_nll2 = 0.0;
double sum_kld = 0; double sum_nll_base = 0.0;
double sum_kld2 = 0; double sum_nll_base2 = 0.0;
double sum_nll_diff = 0; double sum_nll_nll_base = 0.0;
double sum_nll_diff2 = 0; double sum_kld = 0.0;
size_t n_same_top = 0; double sum_kld2 = 0.0;
size_t count = 0; double sum_p_diff = 0.0;
double sum_p_diff2 = 0.0;
double sum_p_diff4 = 0.0;
float max_p_diff = 0.0f;
size_t n_same_top = 0.0;
size_t count = 0.0;
}; };
static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) { static std::pair<double, float> log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
float max_logit = logits[0]; float max_logit = logits[0];
int imax = 0; int imax = 0;
for (int i = 1; i < n_vocab; ++i) { for (int i = 1; i < n_vocab; ++i) {
@ -244,12 +249,17 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba
const float scale = d[0]; const float scale = d[0];
const float min_log_prob = d[1]; const float min_log_prob = d[1];
base_log_prob += 4; base_log_prob += 4;
float nll = max_logit + log_sum_exp - logits[tok];
const float nll = max_logit + log_sum_exp - logits[tok];
kld.sum_nll += nll; kld.sum_nll += nll;
kld.sum_nll2 += nll*nll; kld.sum_nll2 += nll*nll;
nll += (scale*base_log_prob[tok] + min_log_prob);
kld.sum_nll_diff += nll; const float nll_base = -(scale*base_log_prob[tok] + min_log_prob);
kld.sum_nll_diff2 += nll*nll; kld.sum_nll_base += nll_base;
kld.sum_nll_base2 += nll_base*nll_base;
kld.sum_nll_nll_base += nll*nll_base;
max_logit += log_sum_exp; max_logit += log_sum_exp;
double sum = 0; double sum = 0;
int imax_base = -1; int imax_base = -1;
@ -269,16 +279,26 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba
kld.sum_kld2 += sum*sum; kld.sum_kld2 += sum*sum;
++kld.count; ++kld.count;
if (imax == imax_base) ++kld.n_same_top; if (imax == imax_base) ++kld.n_same_top;
return sum;
const float p_base = expf(-nll_base);
const float p = expf(-nll);
const float p_diff = p - p_base;
kld.sum_p_diff += p_diff;
const double p_diff2 = p_diff*p_diff;
kld.sum_p_diff2 += p_diff2;
kld.sum_p_diff4 += p_diff2*p_diff2;
kld.max_p_diff = std::max(kld.max_p_diff, std::fabs(p_diff));
return std::make_pair(sum, p_diff);
} }
static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld, std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
float * kld_values) { float * kld_values, float * p_diff_values) {
std::mutex mutex; std::mutex mutex;
const int nv = 2*((n_vocab + 1)/2) + 4; const int nv = 2*((n_vocab + 1)/2) + 4;
int counter = 0; int counter = 0;
auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () { auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values, p_diff_values] () {
kl_divergence_result local_kld; kl_divergence_result local_kld;
while (true) { while (true) {
std::unique_lock<std::mutex> lock(mutex); std::unique_lock<std::mutex> lock(mutex);
@ -286,17 +306,23 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
if (i >= n_token) { if (i >= n_token) {
kld.sum_nll += local_kld.sum_nll; kld.sum_nll += local_kld.sum_nll;
kld.sum_nll2 += local_kld.sum_nll2; kld.sum_nll2 += local_kld.sum_nll2;
kld.sum_nll_base += local_kld.sum_nll_base;
kld.sum_nll_base2 += local_kld.sum_nll_base2;
kld.sum_nll_nll_base += local_kld.sum_nll_nll_base;
kld.sum_kld += local_kld.sum_kld; kld.sum_kld += local_kld.sum_kld;
kld.sum_kld2 += local_kld.sum_kld2; kld.sum_kld2 += local_kld.sum_kld2;
kld.sum_nll_diff += local_kld.sum_nll_diff; kld.sum_p_diff += local_kld.sum_p_diff;
kld.sum_nll_diff2 += local_kld.sum_nll_diff2; kld.sum_p_diff2 += local_kld.sum_p_diff2;
kld.sum_p_diff4 += local_kld.sum_p_diff4;
kld.n_same_top += local_kld.n_same_top; kld.n_same_top += local_kld.n_same_top;
kld.max_p_diff = std::max(kld.max_p_diff, local_kld.max_p_diff);
kld.count += local_kld.count; kld.count += local_kld.count;
break; break;
} }
lock.unlock(); lock.unlock();
double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
kld_values[i] = (float)v; kld_values[i] = (float)v.first;
p_diff_values[i] = v.second;
} }
}; };
for (auto & w : workers) { for (auto & w : workers) {
@ -1712,6 +1738,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv); std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<float> logits; std::vector<float> logits;
if (num_batches > 1) { if (num_batches > 1) {
logits.reserve(n_ctx * n_vocab); logits.reserve(n_ctx * n_vocab);
@ -1728,9 +1755,18 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.; df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
return std::make_pair(f, df); return std::make_pair(f, df);
}; };
auto covariance = [] (double suma, double sumb, double sumab, size_t count) {
if (count < 10) {
return 0.0;
}
double var = sumab/count - (suma/count)*(sumb/count);
var /= count - 1;
return var;
};
kl_divergence_result kld; kl_divergence_result kld;
auto kld_ptr = kld_values.data(); auto kld_ptr = kld_values.data();
auto p_diff_ptr = p_diff_values.data();
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx; const int start = i * n_ctx;
@ -1785,24 +1821,42 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
} }
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence Same top\n"); printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
} }
const int first = n_ctx/2; const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs_uint16, kld, kld_ptr); workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
p_diff_ptr += n_ctx - 1 - first;
kld_ptr += n_ctx - 1 - first; kld_ptr += n_ctx - 1 - first;
auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); printf("%4d", i+1);
auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
auto p_top = 1.*kld.n_same_top/kld.count;
auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));
printf("%4d %10.4lf %10.5lf ± %10.5f %10.5f ± %10.5lf %.5f ± %.5f\n", i+1, exp(ppl.first), auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second, const double ppl_val = exp(log_ppl.first);
p_top, d_p_top); const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
double p_top_val = 1.*kld.n_same_top/kld.count;
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
printf("\n");
fflush(stdout); fflush(stdout);
@ -1813,31 +1867,97 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (kld.count < 100) return; // we do not wish to do statistics on so few values if (kld.count < 100) return; // we do not wish to do statistics on so few values
std::sort(kld_values.begin(), kld_values.end()); std::sort(kld_values.begin(), kld_values.end());
std::sort(p_diff_values.begin(), p_diff_values.end());
printf("===== KL-divergence statistics\n"); printf("====== Perplexity statistics ======\n");
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double ppl_base_val = exp(log_ppl_base.first);
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
// printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
const double ppl_ratio_val = exp(log_ppl_ratio_val);
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
const double ppl_diff_val = ppl_val - ppl_base_val;
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
printf("\n");
printf("====== KL divergence statistics ======\n");
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second); printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1]) auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
: kld_values[kld_values.size()/2]; : kld_values[kld_values.size()/2];
printf("Median : %10.6f\n", kld_median);
auto percentile = [&kld_values] (float fraction) { auto percentile = [] (std::vector<float> values, float fraction) {
if (fraction <= 0) return kld_values.front(); if (fraction <= 0) return values.front();
if (fraction >= 1) return kld_values.back(); if (fraction >= 1) return values.back();
float p = fraction*(kld_values.size() - 1); float p = fraction*(values.size() - 1);
size_t ip = size_t(p); p -= ip; size_t ip = size_t(p); p -= ip;
return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)]; return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
}; };
printf("Maximum: %10.6f\n", kld_values.back()); printf("Maximum KLD: %10.6f\n", kld_values.back());
printf("KLD_99 : %10.6f\n", percentile(0.99f)); printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
printf("KLD_95 : %10.6f\n", percentile(0.95f)); printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
printf("KLD_90 : %10.6f\n", percentile(0.90f)); printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
printf("Median KLD: %10.6f\n", kld_median);
printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
printf("Minimum KLD: %10.6f\n", kld_values.front());
printf("Minimum: %10.6f\n", kld_values.front()); printf("\n");
printf("KLD_01 : %10.6f\n", percentile(0.01f));
printf("KLD_05 : %10.6f\n", percentile(0.05f)); printf("====== Token probability statistics ======\n");
printf("KLD_10 : %10.6f\n", percentile(0.10f));
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
: p_diff_values[p_diff_values.size()/2];
printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
// printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
const double same_top_p = 1.0*kld.n_same_top/kld.count;
printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
} }
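The percentile rows reported above (99.9% KLD, Median Δp, and so on) come from linear interpolation between the two nearest samples of the sorted per-token values. A minimal Python sketch of the same interpolation, assuming `values` is already sorted in ascending order:

```python
def percentile(values, fraction):
    """Linearly interpolate between the two nearest sorted samples."""
    if fraction <= 0.0:
        return values[0]
    if fraction >= 1.0:
        return values[-1]
    p = fraction * (len(values) - 1)
    ip = int(p)          # lower neighbour
    frac = p - ip        # interpolation weight
    return (1.0 - frac) * values[ip] + frac * values[min(ip + 1, len(values) - 1)]
```

For example, `percentile(sorted(kld_values), 0.999)` reproduces the "99.9% KLD" line.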

View file

@ -23,7 +23,7 @@
#endif #endif
struct quantize_stats_params { struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.gguf"; std::string model = DEFAULT_MODEL_PATH;
bool verbose = false; bool verbose = false;
bool per_layer_stats = false; bool per_layer_stats = false;
bool print_histogram = false; bool print_histogram = false;

View file

@ -1,6 +1,6 @@
set(TARGET quantize) set(TARGET quantize)
add_executable(${TARGET} quantize.cpp) add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common) target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -8,7 +8,6 @@
#include <unordered_map> #include <unordered_map>
#include <fstream> #include <fstream>
#include <cmath> #include <cmath>
#include <algorithm>
struct quant_option { struct quant_option {
std::string name; std::string name;
@ -53,6 +52,10 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
}; };
static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset";
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str; std::string ftype_str;
@ -113,7 +116,7 @@ static void usage(const char * executable) {
exit(1); exit(1);
} }
static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) { static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
std::ifstream in(imatrix_file.c_str(), std::ios::binary); std::ifstream in(imatrix_file.c_str(), std::ios::binary);
if (!in) { if (!in) {
printf("%s: failed to open %s\n",__func__, imatrix_file.c_str()); printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
@ -160,18 +163,33 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str()); printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
} }
} }
printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
// latest imatrix version contains the dataset filename at the end of the file
int m_last_call = 0;
if (in.peek() != EOF) {
in.read((char *)&m_last_call, sizeof(m_last_call));
int dataset_len;
in.read((char *)&dataset_len, sizeof(dataset_len));
std::vector<char> dataset_as_vec(dataset_len);
in.read(dataset_as_vec.data(), dataset_len);
imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
}
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
return m_last_call;
} }
static void prepare_imatrix(const std::string & imatrix_file, static int prepare_imatrix(const std::string & imatrix_file,
std::string & imatrix_dataset,
const std::vector<std::string> & included_weights, const std::vector<std::string> & included_weights,
const std::vector<std::string> & excluded_weights, const std::vector<std::string> & excluded_weights,
std::unordered_map<std::string, std::vector<float>> & imatrix_data) { std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
int m_last_call = -1;
if (!imatrix_file.empty()) { if (!imatrix_file.empty()) {
load_imatrix(imatrix_file, imatrix_data); m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
} }
if (imatrix_data.empty()) { if (imatrix_data.empty()) {
return; return m_last_call;
} }
if (!excluded_weights.empty()) { if (!excluded_weights.empty()) {
for (auto& name : excluded_weights) { for (auto& name : excluded_weights) {
@ -197,6 +215,7 @@ static void prepare_imatrix(const std::string & imatrix_file,
if (!imatrix_data.empty()) { if (!imatrix_data.empty()) {
printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size())); printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
} }
return m_last_call;
} }
static ggml_type parse_ggml_type(const char * arg) { static ggml_type parse_ggml_type(const char * arg) {
@ -211,43 +230,6 @@ static ggml_type parse_ggml_type(const char * arg) {
return result; return result;
} }
static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
const char* sep = strchr(data, '=');
if (sep == nullptr || sep - data >= 128) {
fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
return false;
}
llama_model_kv_override kvo;
std::strncpy(kvo.key, data, sep - data);
kvo.key[sep - data] = 0;
sep++;
if (strncmp(sep, "int:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.int_value = std::atol(sep);
} else if (strncmp(sep, "float:", 6) == 0) {
sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
kvo.float_value = std::atof(sep);
} else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
if (std::strcmp(sep, "true") == 0) {
kvo.bool_value = true;
} else if (std::strcmp(sep, "false") == 0) {
kvo.bool_value = false;
} else {
fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
return false;
}
} else {
fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
return false;
}
overrides.emplace_back(std::move(kvo));
return true;
}
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
if (argc < 3) { if (argc < 3) {
usage(argv[0]); usage(argv[0]);
@ -316,10 +298,43 @@ int main(int argc, char ** argv) {
usage(argv[0]); usage(argv[0]);
} }
std::string imatrix_dataset;
std::unordered_map<std::string, std::vector<float>> imatrix_data; std::unordered_map<std::string, std::vector<float>> imatrix_data;
prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data); int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
if (!imatrix_data.empty()) { if (!imatrix_data.empty()) {
params.imatrix = &imatrix_data; params.imatrix = &imatrix_data;
{
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
strncpy(kvo.val_str, imatrix_file.c_str(), 127);
kvo.val_str[127] = '\0';
kv_overrides.emplace_back(std::move(kvo));
}
if (!imatrix_dataset.empty()) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
kvo.val_str[127] = '\0';
kv_overrides.emplace_back(std::move(kvo));
}
{
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.val_i64 = imatrix_data.size();
kv_overrides.emplace_back(std::move(kvo));
}
if (m_last_call > 0) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.val_i64 = m_last_call;
kv_overrides.emplace_back(std::move(kvo));
}
} }
if (!kv_overrides.empty()) { if (!kv_overrides.empty()) {
kv_overrides.emplace_back(); kv_overrides.emplace_back();
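The `load_imatrix` changes above also spell out the newer imatrix file layout: after the per-tensor entries, the file may end with a trailer holding the chunk count, the length of the dataset filename, and the filename itself. A minimal Python sketch that reads just this trailer, assuming 32-bit little-endian ints (the C++ code writes native `int`, so this is an assumption) and that the caller has already consumed all per-tensor entries:

```python
import struct

def read_imatrix_trailer(f):
    """Read the optional trailer: (chunk count, dataset filename) or None for old files."""
    head = f.read(4)
    if len(head) < 4:                       # old format: no trailer present
        return None
    (last_call,) = struct.unpack("<i", head)
    (dataset_len,) = struct.unpack("<i", f.read(4))
    dataset = f.read(dataset_len).decode("utf-8", errors="replace")
    return last_call, dataset
```

These are the same values that end up in the `quantize.imatrix.chunks_count` and `quantize.imatrix.dataset` metadata keys written by `quantize`.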

View file

@ -74,15 +74,18 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
- Using `make`: - Using `make`:
```bash ```bash
make make server
``` ```
- Using `CMake`: - Using `CMake`:
```bash ```bash
cmake --build . --config Release cmake -B build
cmake --build build --config Release -t server
``` ```
Binary is at `./build/bin/server`
## Build with SSL ## Build with SSL
`server` can also be built with SSL support using OpenSSL 3 `server` can also be built with SSL support using OpenSSL 3
@ -99,10 +102,8 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
- Using `CMake`: - Using `CMake`:
```bash ```bash
mkdir build cmake -B build -DLLAMA_SERVER_SSL=ON
cd build cmake --build build --config Release -t server
cmake .. -DLLAMA_SERVER_SSL=ON
make server
``` ```
## Quick Start ## Quick Start

View file

@ -268,6 +268,7 @@ def start_server_background(args):
server_args.extend(['--defrag-thold', "0.1"]) server_args.extend(['--defrag-thold', "0.1"])
server_args.append('--cont-batching') server_args.append('--cont-batching')
server_args.append('--metrics') server_args.append('--metrics')
server_args.append('--flash-attn')
server_args.extend(['--log-format', "text"]) server_args.extend(['--log-format', "text"])
args = [str(arg) for arg in [server_path, *server_args]] args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}") print(f"bench: starting server with: {' '.join(args)}")

View file

@ -90,7 +90,8 @@ export default function () {
"model": model, "model": model,
"stream": true, "stream": true,
"seed": 42, "seed": 42,
"max_tokens": max_tokens "max_tokens": max_tokens,
"stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
} }
const params = {method: 'POST', body: JSON.stringify(payload)}; const params = {method: 'POST', body: JSON.stringify(payload)};

View file

@ -1207,6 +1207,27 @@ struct server_context {
LOG_VERBOSE("eos token found", {}); LOG_VERBOSE("eos token found", {});
} }
auto n_ctx_train = llama_n_ctx_train(model);
if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1
&& slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
LOG_WARNING("n_predict is not set and self-context extend is disabled."
" Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
{ "id_slot", slot.id },
{ "params.n_predict", slot.params.n_predict },
{ "slot.n_prompt_tokens", slot.n_prompt_tokens },
{ "slot.n_decoded", slot.n_decoded },
{ "slot.n_predict", slot.n_predict },
{ "n_slots", params.n_parallel },
{ "slot.n_ctx", slot.n_ctx },
{ "n_ctx", n_ctx },
{ "n_ctx_train", n_ctx_train },
{ "ga_n", slot.ga_n },
});
slot.truncated = true;
slot.stopped_limit = true;
slot.has_next_token = false; // stop prediction
}
LOG_VERBOSE("next token", { LOG_VERBOSE("next token", {
{"id_slot", slot.id}, {"id_slot", slot.id},
{"id_task", slot.id_task}, {"id_task", slot.id_task},
@ -1362,9 +1383,10 @@ struct server_context {
if (!slot.params.stream && slot.stopped_word) { if (!slot.params.stream && slot.stopped_word) {
const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
probs = std::vector<completion_token_output>( probs = std::vector<completion_token_output>(
slot.generated_token_probs.begin(), slot.generated_token_probs.begin(),
slot.generated_token_probs.end() - stop_word_toks.size()); slot.generated_token_probs.end() - safe_offset);
} else { } else {
probs = std::vector<completion_token_output>( probs = std::vector<completion_token_output>(
slot.generated_token_probs.begin(), slot.generated_token_probs.begin(),
@ -2141,7 +2163,7 @@ struct server_context {
}); });
// process the created batch of tokens // process the created batch of tokens
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
for (auto & slot : slots) { for (auto & slot : slots) {
@ -2332,7 +2354,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" disable KV offload\n"); printf(" disable KV offload\n");
} }
printf(" -m FNAME, --model FNAME\n"); printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str()); printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
printf(" model download url (default: unused)\n"); printf(" model download url (default: unused)\n");
printf(" -hfr REPO, --hf-repo REPO\n"); printf(" -hfr REPO, --hf-repo REPO\n");
@ -2356,6 +2378,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)\n"); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
printf(" -spf FNAME, --system-prompt-file FNAME\n"); printf(" -spf FNAME, --system-prompt-file FNAME\n");
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" -ctk TYPE, --cache-type-k TYPE\n"); printf(" -ctk TYPE, --cache-type-k TYPE\n");
@ -2371,7 +2394,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict); printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
printf(" --chat-template JINJA_TEMPLATE\n"); printf(" --chat-template JINJA_TEMPLATE\n");
@ -2721,6 +2744,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
params.embedding = true; params.embedding = true;
} else if (arg == "-cb" || arg == "--cont-batching") { } else if (arg == "-cb" || arg == "--cont-batching") {
params.cont_batching = true; params.cont_batching = true;
} else if (arg == "-fa" || arg == "--flash-attn") {
params.flash_attn = true;
} else if (arg == "-np" || arg == "--parallel") { } else if (arg == "-np" || arg == "--parallel") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -2802,43 +2827,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
invalid_param = true; invalid_param = true;
break; break;
} }
char * sep = strchr(argv[i], '='); if (!parse_kv_override(argv[i], params.kv_overrides)) {
if (sep == nullptr || sep - argv[i] >= 128) {
fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
struct llama_model_kv_override kvo;
std::strncpy(kvo.key, argv[i], sep - argv[i]);
kvo.key[sep - argv[i]] = 0;
sep++;
if (strncmp(sep, "int:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.int_value = std::atol(sep);
} else if (strncmp(sep, "float:", 6) == 0) {
sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
kvo.float_value = std::atof(sep);
} else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
if (std::strcmp(sep, "true") == 0) {
kvo.bool_value = true;
} else if (std::strcmp(sep, "false") == 0) {
kvo.bool_value = false;
} else {
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
} else {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true; invalid_param = true;
break; break;
} }
params.kv_overrides.push_back(kvo);
} else { } else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
server_print_usage(argv[0], default_params, default_sparams); server_print_usage(argv[0], default_params, default_sparams);
@ -2846,6 +2839,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
} }
} }
gpt_params_handle_model_default(params);
if (!params.kv_overrides.empty()) { if (!params.kv_overrides.empty()) {
params.kv_overrides.emplace_back(); params.kv_overrides.emplace_back();
params.kv_overrides.back().key[0] = 0; params.kv_overrides.back().key[0] = 0;

View file

@ -5,7 +5,7 @@ Feature: llama.cpp server
Background: Server startup Background: Server startup
Given a server listening on localhost:8080 Given a server listening on localhost:8080
And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
And a model file ggml-model-f16.gguf And a model file bert-bge-small.gguf
And a model alias bert-bge-small And a model alias bert-bge-small
And 42 as server seed And 42 as server seed
And 2 slots And 2 slots

View file

@ -7,44 +7,16 @@ Feature: Results
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
And a model file test-model-00001-of-00003.gguf And a model file test-model-00001-of-00003.gguf
And 128 as batch size And 128 as batch size
And 256 KV cache size And 1024 KV cache size
And 128 max tokens to predict And 128 max tokens to predict
Scenario Outline: Multi users completion
Given <n_slots> slots
And continuous batching And continuous batching
Scenario Outline: consistent results with same seed
Given <n_slots> slots
Then the server is starting Then the server is starting
Then the server is healthy Then the server is healthy
Given 42 as seed Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given concurrent completion requests Given concurrent completion requests
Then the server is busy Then the server is busy
@ -55,3 +27,55 @@ Feature: Results
| n_slots | | n_slots |
| 1 | | 1 |
| 2 | | 2 |
Scenario Outline: different results with different seed
Given <n_slots> slots
Then the server is starting
Then the server is healthy
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45
Given concurrent completion requests
Then the server is busy
Then the server is idle
And all slots are idle
Then all predictions are different
Examples:
| n_slots |
| 1 |
| 2 |
Scenario Outline: consistent results with same seed and varying batch size
Given 4 slots
And <temp> temperature
# And 0 as draft
Then the server is starting
Then the server is healthy
Given 1 prompts "Write a very long story about AI." with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Given <n_parallel> prompts "Write a very long story about AI." with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Then all predictions are equal
Examples:
| n_parallel | temp |
| 1 | 0.0 |
| 2 | 0.0 |
| 4 | 0.0 |
| 1 | 1.0 |
# FIXME: These tests fail on master. The problem seems to be the unified KV cache.
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
# | 2 | 1.0 |
# | 4 | 1.0 |

View file

@ -65,6 +65,7 @@ def step_server_config(context, server_fqdn, server_port):
context.server_seed = None context.server_seed = None
context.user_api_key = None context.user_api_key = None
context.response_format = None context.response_format = None
context.temperature = None
context.tasks_result = [] context.tasks_result = []
context.concurrent_tasks = [] context.concurrent_tasks = []
@ -232,15 +233,17 @@ async def step_all_slots_status(context, expected_slot_status_string):
@async_run_until_complete @async_run_until_complete
async def step_request_completion(context, api_error): async def step_request_completion(context, api_error):
expect_api_error = api_error == 'raised' expect_api_error = api_error == 'raised'
seeds = await completions_seed(context, num_seeds=1)
completion = await request_completion(context.prompts.pop(), completion = await request_completion(context.prompts.pop(),
seeds[0] if seeds is not None else seeds,
context.base_url, context.base_url,
debug=context.debug, debug=context.debug,
n_predict=context.n_predict, n_predict=context.n_predict,
cache_prompt=context.cache_prompt, cache_prompt=context.cache_prompt,
id_slot=context.id_slot, id_slot=context.id_slot,
seed=await completions_seed(context),
expect_api_error=expect_api_error, expect_api_error=expect_api_error,
user_api_key=context.user_api_key) user_api_key=context.user_api_key,
temperature=context.temperature)
context.tasks_result.append(completion) context.tasks_result.append(completion)
if context.debug: if context.debug:
print(f"Completion response: {completion}") print(f"Completion response: {completion}")
@ -269,6 +272,15 @@ async def step_predictions_equal(context):
context.tasks_result = [] context.tasks_result = []
@step('all predictions are different')
@async_run_until_complete
async def step_predictions_equal(context):
n_completions = await gather_tasks_results(context)
assert n_completions >= 2, "need at least 2 completions"
assert_all_predictions_different(context.tasks_result)
context.tasks_result = []
@step('the completion is truncated') @step('the completion is truncated')
def step_assert_completion_truncated(context): def step_assert_completion_truncated(context):
step_assert_completion_truncated(context, '') step_assert_completion_truncated(context, '')
@ -311,6 +323,11 @@ def step_response_format(context, response_format):
context.response_format = json.loads(response_format) context.response_format = json.loads(response_format)
@step('{temperature:f} temperature')
def step_temperature(context, temperature):
context.temperature = temperature
@step('streaming is {enable_streaming}') @step('streaming is {enable_streaming}')
def step_streaming(context, enable_streaming): def step_streaming(context, enable_streaming):
context.enable_streaming = enable_streaming == 'enabled' context.enable_streaming = enable_streaming == 'enabled'
@ -353,7 +370,10 @@ def step_n_ubatch(context, n_ubatch):
@step('{seed:d} as seed') @step('{seed:d} as seed')
def step_seed(context, seed): def step_seed(context, seed):
context.seed = seed if context.seed is None:
context.seed = [seed]
else:
context.seed.append(seed)
@step('a prefix prompt') @step('a prefix prompt')
@ -413,7 +433,9 @@ async def step_oai_chat_completions(context, api_error):
if context.debug: if context.debug:
print(f"Submitting OAI compatible completions request...") print(f"Submitting OAI compatible completions request...")
expect_api_error = api_error == 'raised' expect_api_error = api_error == 'raised'
seeds = await completions_seed(context, num_seeds=1),
completion = await oai_chat_completions(context.prompts.pop(), completion = await oai_chat_completions(context.prompts.pop(),
seeds[0] if seeds is not None else seeds,
context.system_prompt, context.system_prompt,
context.base_url, context.base_url,
'/v1/chat', '/v1/chat',
@ -429,8 +451,6 @@ async def step_oai_chat_completions(context, api_error):
response_format=context.response_format response_format=context.response_format
if hasattr(context, 'response_format') else None, if hasattr(context, 'response_format') else None,
seed=await completions_seed(context),
user_api_key=context.user_api_key user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None, if hasattr(context, 'user_api_key') else None,
@ -457,10 +477,21 @@ def step_a_prompt_prompt(context, prompt):
context.n_prompts = len(context.prompts) context.n_prompts = len(context.prompts)
@step('{num_prompts:d} prompts {prompt} with seed {seed:d}')
def step_many_prompts(context, num_prompts, prompt, seed):
if context.seed is None:
context.seed = []
for _ in range(num_prompts):
context.seed.append(seed)
context.prompts.append(prompt)
context.n_prompts = len(context.prompts)
@step('concurrent completion requests') @step('concurrent completion requests')
@async_run_until_complete() @async_run_until_complete()
async def step_concurrent_completion_requests(context): async def step_concurrent_completion_requests(context):
await concurrent_requests(context, await concurrent_requests(
context,
request_completion, request_completion,
# prompt is inserted automatically # prompt is inserted automatically
context.base_url, context.base_url,
@ -468,9 +499,9 @@ async def step_concurrent_completion_requests(context):
prompt_prefix=context.prompt_prefix, prompt_prefix=context.prompt_prefix,
prompt_suffix=context.prompt_suffix, prompt_suffix=context.prompt_suffix,
n_predict=context.n_predict if hasattr(context, 'n_predict') else None, n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
seed=await completions_seed(context), user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None,
user_api_key=context.user_api_key if hasattr(context, temperature=context.temperature,
'user_api_key') else None) )
@step('concurrent OAI completions requests') @step('concurrent OAI completions requests')
@ -490,7 +521,6 @@ async def step_oai_chat_completions(context):
if hasattr(context, 'enable_streaming') else None, if hasattr(context, 'enable_streaming') else None,
response_format=context.response_format response_format=context.response_format
if hasattr(context, 'response_format') else None, if hasattr(context, 'response_format') else None,
seed=await completions_seed(context),
user_api_key=context.user_api_key user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None) if hasattr(context, 'user_api_key') else None)
@ -512,10 +542,6 @@ async def step_oai_chat_completions(context):
if hasattr(context, 'enable_streaming') else None, if hasattr(context, 'enable_streaming') else None,
response_format=context.response_format response_format=context.response_format
if hasattr(context, 'response_format') else None, if hasattr(context, 'response_format') else None,
seed=context.seed
if hasattr(context, 'seed') else
context.server_seed
if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None) if hasattr(context, 'user_api_key') else None)
@ -544,7 +570,7 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
@async_run_until_complete @async_run_until_complete
async def step_compute_embedding(context): async def step_compute_embedding(context):
context.n_prompts = 1 context.n_prompts = 1
context.embeddings = await request_embedding(context_text(context), base_url=context.base_url) context.embeddings = await request_embedding(context_text(context), None, base_url=context.base_url)
@step('all embeddings are the same') @step('all embeddings are the same')
@ -585,7 +611,7 @@ def step_assert_embeddings(context):
@async_run_until_complete @async_run_until_complete
async def step_oai_compute_embeddings(context): async def step_oai_compute_embeddings(context):
context.n_prompts = 1 context.n_prompts = 1
context.embeddings = await request_oai_embeddings(context_text(context), context.embeddings = await request_oai_embeddings(context_text(context), None,
base_url=context.base_url, base_url=context.base_url,
user_api_key=context.user_api_key, user_api_key=context.user_api_key,
model=context.model) model=context.model)
@ -594,7 +620,7 @@ async def step_oai_compute_embeddings(context):
@step('an OAI compatible embeddings computation request for multiple inputs') @step('an OAI compatible embeddings computation request for multiple inputs')
@async_run_until_complete @async_run_until_complete
async def step_oai_compute_embeddings_multiple_inputs(context): async def step_oai_compute_embeddings_multiple_inputs(context):
context.embeddings = await request_oai_embeddings(context.prompts, context.embeddings = await request_oai_embeddings(context.prompts, None,
base_url=context.base_url, base_url=context.base_url,
user_api_key=context.user_api_key, user_api_key=context.user_api_key,
model=context.model) model=context.model)
@ -740,8 +766,9 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
if context.debug: if context.debug:
print(f"starting {context.n_prompts} concurrent completion requests...") print(f"starting {context.n_prompts} concurrent completion requests...")
assert context.n_prompts > 0 assert context.n_prompts > 0
seeds = await completions_seed(context)
for prompt_no in range(context.n_prompts): for prompt_no in range(context.n_prompts):
shifted_args = [context.prompts.pop(), *args] shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
@ -781,6 +808,7 @@ def step_server_responds_with_status_code(context, status_code):
async def request_completion(prompt, async def request_completion(prompt,
seed,
base_url, base_url,
debug=False, debug=False,
prompt_prefix=None, prompt_prefix=None,
@ -788,9 +816,9 @@ async def request_completion(prompt,
n_predict=None, n_predict=None,
cache_prompt=False, cache_prompt=False,
id_slot=None, id_slot=None,
seed=None,
expect_api_error=None, expect_api_error=None,
user_api_key=None): user_api_key=None,
temperature=None):
if debug: if debug:
print(f"Sending completion request: {prompt}") print(f"Sending completion request: {prompt}")
origin = "my.super.domain" origin = "my.super.domain"
@ -811,7 +839,8 @@ async def request_completion(prompt,
"n_predict": n_predict if n_predict is not None else -1, "n_predict": n_predict if n_predict is not None else -1,
"cache_prompt": cache_prompt, "cache_prompt": cache_prompt,
"id_slot": id_slot, "id_slot": id_slot,
"seed": seed if seed is not None else 42 "seed": seed if seed is not None else 42,
"temperature": temperature if temperature is not None else "0.8f",
}, },
headers=headers, headers=headers,
timeout=3600) as response: timeout=3600) as response:
@ -824,6 +853,7 @@ async def request_completion(prompt,
async def oai_chat_completions(user_prompt, async def oai_chat_completions(user_prompt,
seed,
system_prompt, system_prompt,
base_url, base_url,
base_path, base_path,
@ -833,7 +863,6 @@ async def oai_chat_completions(user_prompt,
n_predict=None, n_predict=None,
enable_streaming=None, enable_streaming=None,
response_format=None, response_format=None,
seed=None,
user_api_key=None, user_api_key=None,
expect_api_error=None): expect_api_error=None):
if debug: if debug:
@ -952,7 +981,7 @@ async def oai_chat_completions(user_prompt,
return completion_response return completion_response
async def request_embedding(content, base_url=None): async def request_embedding(content, seed, base_url=None):
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/embedding', async with session.post(f'{base_url}/embedding',
json={ json={
@ -963,7 +992,7 @@ async def request_embedding(content, base_url=None):
return [response_json['embedding']] return [response_json['embedding']]
async def request_oai_embeddings(input, async def request_oai_embeddings(input, seed,
base_url=None, user_api_key=None, base_url=None, user_api_key=None,
model=None, async_client=False): model=None, async_client=False):
# openai client always expects an api_key # openai client always expects an api_key
@ -1036,21 +1065,31 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
f' {n_predicted} <> {expected_predicted_n}') f' {n_predicted} <> {expected_predicted_n}')
def assert_all_predictions_equal(completion_responses): def assert_all_predictions_equal(completion_responses):
content_0 = completion_responses[0]['content']
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
print(f"content 0: {content_0}") for i, response_i in enumerate(completion_responses):
content_i = response_i['content']
print(f"content {i}: {content_i}")
for i, response_i in enumerate(completion_responses):
content_i = response_i['content']
for j, response_j in enumerate(completion_responses):
if i == j:
continue
content_j = response_j['content']
assert content_i == content_j, "contents not equal"
i = 1
for response in completion_responses[1:]:
content = response['content']
def assert_all_predictions_different(completion_responses):
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
print(f"content {i}: {content}") for i, response_i in enumerate(completion_responses):
content_i = response_i['content']
assert content == content_0, "contents not equal" print(f"content {i}: {content_i}")
for i, response_i in enumerate(completion_responses):
i += 1 content_i = response_i['content']
for j, response_j in enumerate(completion_responses):
if i == j:
continue
content_j = response_j['content']
assert content_i != content_j, "contents not different"
async def gather_tasks_results(context): async def gather_tasks_results(context):
@ -1145,9 +1184,22 @@ def assert_slots_status(slots, expected_slots):
f" = {expected[key]} != {slot[key]}") f" = {expected[key]} != {slot[key]}")
async def completions_seed(context): async def completions_seed(context, num_seeds=None):
return context.seed if hasattr(context, 'seed') and context.seed is not None \ if hasattr(context, "seed") and context.seed is not None:
else context.server_seed if hasattr(context, 'server_seed') else None assert len(context.seed) == context.n_prompts
if num_seeds is None:
num_seeds = context.n_prompts
assert num_seeds <= context.n_prompts
seeds = context.seed[:num_seeds]
context.seed = context.seed[num_seeds:] if num_seeds < context.n_prompts else None
return seeds
if hasattr(context, "server_seed") and context.server_seed is not None:
if num_seeds is None:
return [context.server_seed] * context.n_prompts
else:
return [context.server_seed] * num_seeds
return None
def context_text(context): def context_text(context):

6
flake.lock generated
View file

@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1713537308, "lastModified": 1714076141,
"narHash": "sha256-XtTSSIB2DA6tOv+l0FhvfDMiyCmhoRbNB+0SeInZkbk=", "narHash": "sha256-Drmja/f5MRHZCskS6mvzFqxEaZMeciScCTFxWVLqWEY=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "5c24cf2f0a12ad855f444c30b2421d044120c66f", "rev": "7bb2ccd8cdc44c91edba16c48d2c8f331fb3d856",
"type": "github" "type": "github"
}, },
"original": { "original": {

View file

@ -1784,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
void ggml_backend_sched_reset(ggml_backend_sched_t sched) { void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
// reset state for the next run // reset state for the next run
if (!sched->is_reset) {
size_t hash_size = sched->hash_set.size; size_t hash_size = sched->hash_set.size;
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
sched->is_reset = true; sched->is_reset = true;
}
sched->is_alloc = false; sched->is_alloc = false;
} }
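Note on the change above: guarding the reset behind is_reset makes repeated ggml_backend_sched_reset() calls between graph evaluations effectively free, since the memsets only run when state was actually touched. A minimal sketch of this dirty-flag pattern, using hypothetical names that are not part of the real scheduler:

#include <cstring>

// Hypothetical, simplified scheduler state for illustration only.
struct sched_state {
    int  keys[1024];
    bool is_reset;
};

static void sched_reset(sched_state * s) {
    if (!s->is_reset) {            // skip the memset when nothing ran since the last reset
        memset(s->keys, 0, sizeof(s->keys));
        s->is_reset = true;        // mark clean; a later alloc/eval would clear this flag again
    }
}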

View file

@ -14,6 +14,7 @@
#include "ggml-cuda/cpy.cuh" #include "ggml-cuda/cpy.cuh"
#include "ggml-cuda/diagmask.cuh" #include "ggml-cuda/diagmask.cuh"
#include "ggml-cuda/dmmv.cuh" #include "ggml-cuda/dmmv.cuh"
#include "ggml-cuda/fattn.cuh"
#include "ggml-cuda/getrows.cuh" #include "ggml-cuda/getrows.cuh"
#include "ggml-cuda/im2col.cuh" #include "ggml-cuda/im2col.cuh"
#include "ggml-cuda/mmq.cuh" #include "ggml-cuda/mmq.cuh"
@ -140,6 +141,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc = 100*prop.major + 10*prop.minor; info.devices[id].cc = 100*prop.major + 10*prop.minor;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
info.devices[id].smpb = prop.sharedMemPerBlock; info.devices[id].smpb = prop.sharedMemPerBlock;
info.devices[id].nsm = prop.multiProcessorCount;
} }
for (int id = 0; id < info.device_count; ++id) { for (int id = 0; id < info.device_count; ++id) {
@ -2290,6 +2292,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst); ggml_cuda_op_argsort(ctx, dst);
break; break;
case GGML_OP_FLASH_ATTN_EXT:
ggml_cuda_flash_attn_ext(ctx, dst);
break;
default: default:
return false; return false;
} }
@ -2564,6 +2569,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
case GGML_OP_FLASH_ATTN_EXT:
return true; return true;
default: default:
return false; return false;

View file

@ -138,10 +138,12 @@
#define WARP_SIZE 32 #define WARP_SIZE 32
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
#define CC_PASCAL 600 #define CC_PASCAL 600
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA 700 #define CC_VOLTA 700
#define CC_AMPERE 800
#define CC_OFFSET_AMD 1000000 #define CC_OFFSET_AMD 1000000
#define CC_RDNA1 (CC_OFFSET_AMD + 1010) #define CC_RDNA1 (CC_OFFSET_AMD + 1010)
#define CC_RDNA2 (CC_OFFSET_AMD + 1030) #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
@ -271,7 +273,6 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
return a; return a;
} }
#ifdef GGML_CUDA_F16
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll #pragma unroll
@ -284,7 +285,6 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
NO_DEVICE_CODE; NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
} }
#endif // GGML_CUDA_F16
static __device__ __forceinline__ float warp_reduce_max(float x) { static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll #pragma unroll
@ -294,19 +294,60 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
return x; return x;
} }
//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
//#pragma unroll
// for (int mask = 16; mask > 0; mask >>= 1) {
// x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
// }
// return x;
//#else
// GGML_UNUSED(x);
// NO_DEVICE_CODE;
//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
//}
#if CUDART_VERSION >= CUDART_HMAX
return __hmax(a, b);
#else
return __half2float(a) > __half2float(b) ? a : b;
#endif // CUDART_VERSION >= CUDART_HMAX
#else
GGML_UNUSED(a);
GGML_UNUSED(b);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
}
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
#if CUDART_VERSION >= CUDART_HMAX
return __hmax2(a, b);
#else
half2 ret;
reinterpret_cast<half&>(ret.x) = __low2float(a) > __low2float(b) ? __low2half(a) : __low2half(b);
reinterpret_cast<half&>(ret.y) = __high2float(a) > __high2float(b) ? __high2half(a) : __high2half(b);
return ret;
#endif // CUDART_VERSION >= CUDART_HMAX
#else
GGML_UNUSED(a);
GGML_UNUSED(b);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
}
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
}
return x;
#else
GGML_UNUSED(x);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
#if CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
const uint32_t mask_low = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
return mask_low | mask_high;
}
#endif // CUDART_VERSION < CUDART_HMASK
#if defined(GGML_USE_HIPBLAS) #if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300 #define __CUDA_ARCH__ 1300
@ -391,6 +432,11 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
} }
#endif // defined(GGML_USE_HIPBLAS) #endif // defined(GGML_USE_HIPBLAS)
#define FP16_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
// TODO: move to ggml-common.h // TODO: move to ggml-common.h
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
@ -404,6 +450,7 @@ struct ggml_cuda_device_info {
struct cuda_device_info { struct cuda_device_info {
int cc; // compute capability int cc; // compute capability
int nsm; // number of streaming multiprocessors
size_t smpb; // max. shared memory per block size_t smpb; // max. shared memory per block
bool vmm; // virtual memory support bool vmm; // virtual memory support
size_t vmm_granularity; // granularity of virtual memory size_t vmm_granularity; // granularity of virtual memory

View file

@ -5,16 +5,16 @@
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t> template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) { static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x); const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);
if (i >= k) { if (i >= k) {
return; return;
} }
const int64_t ib = i/qk; // block index const int64_t ib = i/qk; // block index
const int iqs = (i%qk)/qr; // quant index const int64_t iqs = (i%qk)/qr; // quant index
const int iybs = i - i%qk; // y block start index const int64_t iybs = i - i%qk; // y block start index
const int y_offset = qr == 1 ? 1 : qk/2; const int64_t y_offset = qr == 1 ? 1 : qk/2;
// dequantize // dequantize
dfloat2 v; dfloat2 v;
@ -29,7 +29,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
#if __CUDA_ARCH__ >= CC_PASCAL #if __CUDA_ARCH__ >= CC_PASCAL
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE; constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x; const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
const int * x0 = ((int *) vx) + blockIdx.x * nint; const int * x0 = ((int *) vx) + blockIdx.x * nint;
half2 * y2 = (half2 *) (y + i0); half2 * y2 = (half2 *) (y + i0);
@ -73,9 +73,9 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
const int64_t i = blockIdx.x; const int64_t i = blockIdx.x;
// assume 32 threads // assume 32 threads
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int il = tid/8; const int64_t il = tid/8;
const int ir = tid%8; const int64_t ir = tid%8;
const int64_t ib = 8*i + ir; const int64_t ib = 8*i + ir;
if (ib >= nb32) { if (ib >= nb32) {
return; return;
@ -101,9 +101,9 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
const int64_t i = blockIdx.x; const int64_t i = blockIdx.x;
// assume 32 threads // assume 32 threads
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int il = tid/8; const int64_t il = tid/8;
const int ir = tid%8; const int64_t ir = tid%8;
const int64_t ib = 8*i + ir; const int64_t ib = 8*i + ir;
if (ib >= nb32) { if (ib >= nb32) {
return; return;
@ -127,14 +127,14 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_q2_K * x = (const block_q2_K *) vx; const block_q2_K * x = (const block_q2_K *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int n = tid/32; const int64_t n = tid/32;
const int l = tid - 32*n; const int64_t l = tid - 32*n;
const int is = 8*n + l/16; const int64_t is = 8*n + l/16;
const uint8_t q = x[i].qs[32*n + l]; const uint8_t q = x[i].qs[32*n + l];
dst_t * y = yy + i*QK_K + 128*n; dst_t * y = yy + i*QK_K + 128*n;
@ -146,8 +146,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
#else #else
const int is = tid/16; // 0 or 1 const int64_t is = tid/16; // 0 or 1
const int il = tid%16; // 0...15 const int64_t il = tid%16; // 0...15
const uint8_t q = x[i].qs[il] >> (2*is); const uint8_t q = x[i].qs[il] >> (2*is);
dst_t * y = yy + i*QK_K + 16*is + il; dst_t * y = yy + i*QK_K + 16*is + il;
float dall = __low2half(x[i].dm); float dall = __low2half(x[i].dm);
@ -161,19 +161,19 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_q3_K * x = (const block_q3_K *) vx; const block_q3_K * x = (const block_q3_K *) vx;
#if QK_K == 256 #if QK_K == 256
const int r = threadIdx.x/4; const int64_t r = threadIdx.x/4;
const int tid = r/2; const int64_t tid = r/2;
const int is0 = r%2; const int64_t is0 = r%2;
const int l0 = 16*is0 + 4*(threadIdx.x%4); const int64_t l0 = 16*is0 + 4*(threadIdx.x%4);
const int n = tid / 4; const int64_t n = tid / 4;
const int j = tid - 4*n; const int64_t j = tid - 4*n;
uint8_t m = 1 << (4*n + j); uint8_t m = 1 << (4*n + j);
int is = 8*n + 2*j + is0; int64_t is = 8*n + 2*j + is0;
int shift = 2*j; int shift = 2*j;
int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
@ -189,11 +189,11 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
#else #else
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int is = tid/16; // 0 or 1 const int64_t is = tid/16; // 0 or 1
const int il = tid%16; // 0...15 const int64_t il = tid%16; // 0...15
const int im = il/8; // 0...1 const int64_t im = il/8; // 0...1
const int in = il%8; // 0...7 const int64_t in = il%8; // 0...7
dst_t * y = yy + i*QK_K + 16*is + il; dst_t * y = yy + i*QK_K + 16*is + il;
@ -227,15 +227,15 @@ template<typename dst_t>
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q4_K * x = (const block_q4_K *) vx; const block_q4_K * x = (const block_q4_K *) vx;
const int i = blockIdx.x; const int64_t i = blockIdx.x;
#if QK_K == 256 #if QK_K == 256
// assume 32 threads // assume 32 threads
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int il = tid/8; const int64_t il = tid/8;
const int ir = tid%8; const int64_t ir = tid%8;
const int is = 2*il; const int64_t is = 2*il;
const int n = 4; const int64_t n = 4;
dst_t * y = yy + i*QK_K + 64*il + n*ir; dst_t * y = yy + i*QK_K + 64*il + n*ir;
@ -254,7 +254,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
y[l +32] = d2 * (q[l] >> 4) - m2; y[l +32] = d2 * (q[l] >> 4) - m2;
} }
#else #else
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const uint8_t * q = x[i].qs; const uint8_t * q = x[i].qs;
dst_t * y = yy + i*QK_K; dst_t * y = yy + i*QK_K;
const float d = (float)x[i].dm[0]; const float d = (float)x[i].dm[0];
@ -268,14 +268,14 @@ template<typename dst_t>
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q5_K * x = (const block_q5_K *) vx; const block_q5_K * x = (const block_q5_K *) vx;
const int i = blockIdx.x; const int64_t i = blockIdx.x;
#if QK_K == 256 #if QK_K == 256
// assume 64 threads - this is very slightly better than the one below // assume 64 threads - this is very slightly better than the one below
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int il = tid/16; // il is in 0...3 const int64_t il = tid/16; // il is in 0...3
const int ir = tid%16; // ir is in 0...15 const int64_t ir = tid%16; // ir is in 0...15
const int is = 2*il; // is is in 0...6 const int64_t is = 2*il; // is is in 0...6
dst_t * y = yy + i*QK_K + 64*il + 2*ir; dst_t * y = yy + i*QK_K + 64*il + 2*ir;
@ -298,11 +298,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
#else #else
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const uint8_t q = x[i].qs[tid]; const uint8_t q = x[i].qs[tid];
const int im = tid/8; // 0...3 const int64_t im = tid/8; // 0...3
const int in = tid%8; // 0...7 const int64_t in = tid%8; // 0...7
const int is = tid/16; // 0 or 1 const int64_t is = tid/16; // 0 or 1
const uint8_t h = x[i].qh[in] >> im; const uint8_t h = x[i].qh[in] >> im;
const float d = x[i].d; const float d = x[i].d;
dst_t * y = yy + i*QK_K + tid; dst_t * y = yy + i*QK_K + tid;
@ -359,13 +359,13 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq2_xxs * x = (const block_iq2_xxs *) vx; const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint16_t * q2 = x[i].qs + 4*ib; const uint16_t * q2 = x[i].qs + 4*ib;
const uint8_t * aux8 = (const uint8_t *)q2; const uint8_t * aux8 = (const uint8_t *)q2;
@ -383,13 +383,13 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq2_xs * x = (const block_iq2_xs *) vx; const block_iq2_xs * x = (const block_iq2_xs *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint16_t * q2 = x[i].qs + 4*ib; const uint16_t * q2 = x[i].qs + 4*ib;
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
@ -405,13 +405,13 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq2_s * x = (const block_iq2_s *) vx; const block_iq2_s * x = (const block_iq2_s *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
@ -426,13 +426,13 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq3_xxs * x = (const block_iq3_xxs *) vx; const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * q3 = x[i].qs + 8*ib; const uint8_t * q3 = x[i].qs + 8*ib;
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib; const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
@ -454,13 +454,13 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq3_s * x = (const block_iq3_s *) vx; const block_iq3_s * x = (const block_iq3_s *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * qs = x[i].qs + 8*ib; const uint8_t * qs = x[i].qs + 8*ib;
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
@ -480,13 +480,13 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq1_s * x = (const block_iq1_s *) vx; const block_iq1_s * x = (const block_iq1_s *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA; const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1); const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
@ -506,18 +506,18 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq1_m * x = (const block_iq1_m *) vx; const block_iq1_m * x = (const block_iq1_m *) vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
#if QK_K == 256 #if QK_K == 256
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint16_t * sc = (const uint16_t *)x[i].scales; const uint16_t * sc = (const uint16_t *)x[i].scales;
iq1m_scale_t scale; iq1m_scale_t scale;
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4); const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1); const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA; const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
@ -537,12 +537,12 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 4*il; dst_t * y = yy + i*QK_K + 32*ib + 4*il;
const uint8_t * q4 = x[ib].qs + 4*il; const uint8_t * q4 = x[ib].qs + 4*il;
const float d = (float)x[ib].d; const float d = (float)x[ib].d;
@ -556,12 +556,12 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
#if QK_K != 64 #if QK_K != 64
template<typename dst_t> template<typename dst_t>
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x; const int64_t i = blockIdx.x;
const block_iq4_xs * x = (const block_iq4_xs *)vx; const block_iq4_xs * x = (const block_iq4_xs *)vx;
const int tid = threadIdx.x; const int64_t tid = threadIdx.x;
const int il = tid/8; // 0...3 const int64_t il = tid/8; // 0...3
const int ib = tid%8; // 0...7 const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 4*il; dst_t * y = yy + i*QK_K + 32*ib + 4*il;
const uint8_t * q4 = x[i].qs + 16*ib + 4*il; const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
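The int to int64_t changes in the dequantize kernels above are index-width fixes: products such as i*QK_K or blockDim.x*blockIdx.x can exceed INT_MAX once tensors get large enough, so the arithmetic has to be carried out in 64 bits rather than merely stored in a 64-bit variable afterwards. A small host-side sketch with hypothetical sizes (not part of the patch) shows the failure mode:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t QK_K     = 256;
    const int64_t n_blocks = 9000000;            // ~9M super-blocks of 256 weights each
    const int64_t idx64    = n_blocks * QK_K;    // 2304000000: needs more than 31 bits
    const int32_t idx32    = (int32_t) idx64;    // what a 32-bit index would end up holding
    printf("64-bit index: %lld\n", (long long) idx64);
    printf("truncated to 32 bits: %d\n", idx32); // wraps to a negative offset
    return 0;
}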

944
ggml-cuda/fattn.cu Normal file
View file

@ -0,0 +1,944 @@
#include "common.cuh"
#include "fattn.cuh"
#include <cstdint>
#if FP16_MMA_AVAILABLE
#include <mma.h>
#endif
#define FATTN_KQ_STRIDE 256
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use the negative of this instead of -INFINITY to initialize KQ max values in order to avoid NaN upon subtraction.
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exponentials of values smaller than this are flushed to zero to avoid NaNs.
template<int D, int parallel_blocks> // D == head size
__launch_bounds__(((D + WARP_SIZE - 1) / WARP_SIZE)*WARP_SIZE, 1)
static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const int ne00,
const int ne01,
const int ne02,
const int ne03,
const int ne10,
const int ne11,
const int ne12,
const int ne13,
const int ne31,
const int nb31,
const int nb01,
const int nb02,
const int nb03,
const int nb11,
const int nb12,
const int nb13,
const int ne0,
const int ne1,
const int ne2,
const int ne3) {
#if FP16_AVAILABLE
// In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic = blockIdx.x / parallel_blocks; // Index of the Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic;
const int stride_KV = nb11 / sizeof(half);
const int stride_KV2 = nb11 / sizeof(half2);
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < nwarps*WARP_SIZE);
__shared__ half KQ[nwarps*WARP_SIZE];
KQ[tid] = -INFINITY;
half2 * KQ2 = (half2 *) KQ;
half kqmax = -HALF_MAX_HALF;
half kqsum = 0.0f;
__shared__ half kqmax_shared[WARP_SIZE];
__shared__ half kqsum_shared[WARP_SIZE];
if (threadIdx.y == 0) {
kqmax_shared[threadIdx.x] = -HALF_MAX_HALF;
kqsum_shared[threadIdx.x] = 0.0f;
}
__syncthreads();
// Convert Q to half2 and store in registers:
half2 Q_h2[(D/2 + WARP_SIZE - 1) / WARP_SIZE];
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
if (i0 + WARP_SIZE > D/2 && i >= D/2) {
break;
}
Q_h2[i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(Q_f2[i].x, Q_f2[i].y);
}
half2 VKQ = make_half2(0.0f, 0.0f); // Each thread calculates a single VKQ value.
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
half kqmax_new = kqmax;
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
break;
}
half2 sum2 = make_half2(0.0f, 0.0f);
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
const int k_KQ = k_KQ_0 + threadIdx.x;
if (k_KQ_0 + WARP_SIZE > D/2 && k_KQ >= D/2) {
break;
}
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE];
}
sum2 = warp_reduce_sum(sum2);
half sum = __low2half(sum2) + __high2half(sum2);
sum += mask ? maskh[k_VKQ_0 + i_KQ] : __float2half(0.0f);
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
if (threadIdx.x == 0) {
KQ[i_KQ] = sum;
}
}
kqmax_new = warp_reduce_max(kqmax_new);
if (threadIdx.x == 0) {
kqmax_shared[threadIdx.y] = kqmax_new;
}
__syncthreads();
kqmax_new = kqmax_shared[threadIdx.x];
kqmax_new = warp_reduce_max(kqmax_new);
const half KQ_max_scale = hexp(kqmax - kqmax_new);
kqmax = kqmax_new;
const half val = hexp(KQ[tid] - kqmax);
kqsum = kqsum*KQ_max_scale + val;
KQ[tid] = val;
VKQ *= __half2half2(KQ_max_scale);
__syncthreads();
if (tid < D) {
#pragma unroll
for (int k0 = 0; k0 < D; k0 += 2) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
break;
}
half2 V_k;
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
VKQ += V_k*KQ2[k0/2];
}
}
__syncthreads();
}
if (tid >= D) {
kqsum = 0.0f;
}
kqsum = warp_reduce_sum(kqsum);
if (threadIdx.x == 0) {
kqsum_shared[threadIdx.y] = kqsum;
}
__syncthreads();
kqsum = kqsum_shared[threadIdx.x];
kqsum = warp_reduce_sum(kqsum);
if (tid >= D) {
return;
}
half dst_val = (__low2half(VKQ) + __high2half(VKQ));
if (parallel_blocks == 1) {
dst_val /= kqsum;
}
dst[D*gridDim.y*blockIdx.x + D*blockIdx.y + tid] = dst_val;
if (parallel_blocks == 1 || tid != 0) {
return;
}
dst_meta[ic*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax, kqsum);
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
__launch_bounds__(nwarps*WARP_SIZE, 1)
static __global__ void flash_attn_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const int ne00,
const int ne01,
const int ne02,
const int ne03,
const int ne10,
const int ne11,
const int ne12,
const int ne13,
const int ne31,
const int nb31,
const int nb01,
const int nb02,
const int nb03,
const int nb11,
const int nb12,
const int nb13,
const int ne0,
const int ne1,
const int ne2,
const int ne3) {
#if FP16_MMA_AVAILABLE
// In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
static_assert(D <= FATTN_KQ_STRIDE, "D must be <= FATTN_KQ_STRIDE.");
static_assert(ncols == 8 || ncols % 16 == 0, "ncols must be 8 or a multiple of 16.");
constexpr int frag_m = ncols == 8 ? 32 : 16;
constexpr int frag_n = ncols == 8 ? 8 : 16;
static_assert(D % frag_m == 0, "If ncols == 8 then D % frag_m must be 0.");
typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, frag_m, frag_n, 16, half, nvcuda::wmma::row_major> frag_a_K;
typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, frag_m, frag_n, 16, half, nvcuda::wmma::col_major> frag_a_V;
typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, frag_m, frag_n, 16, half, nvcuda::wmma::col_major> frag_b;
typedef nvcuda::wmma::fragment<nvcuda::wmma::accumulator, frag_m, frag_n, 16, KQ_acc_t> frag_c_KQ;
typedef nvcuda::wmma::fragment<nvcuda::wmma::accumulator, frag_m, frag_n, 16, half> frag_c_VKQ;
constexpr int KQ_stride_tc = nwarps*frag_m; // Number of KQ rows calculated in parallel.
constexpr int VKQ_ratio = KQ_stride_tc/VKQ_stride; // Number of parallel VKQ accumulators needed to keep all warps busy.
static_assert(VKQ_ratio <= nwarps, "VKQ_ratio must be <= nwarps.");
// Pad internal representation of KQ, KQV to reduce shared memory bank conflicts:
constexpr int D_padded = D + 8;
constexpr int kqs_padded = FATTN_KQ_STRIDE + 8;
constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float * Q_f = (const float *) (Q + nb02* blockIdx.y + nb01*ic0);
const half * K_h = (const half *) (K + nb12*(blockIdx.y / gqa_ratio));
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + (nb31/sizeof(half))* ic0;
const half2 * mask2 = (const half2 *) mask + (nb31/sizeof(half))*(ic0/2);
const int stride_Q = nb01 / sizeof(float);
const int stride_KV = nb11 / sizeof(half);
frag_b Q_b[D/16][ncols/frag_n];
// A single buffer for temporarily holding tiles of KQ and VKQ parts:
constexpr int mem_KQ = ncols*kqs_padded*kqar;
constexpr int mem_VKQ_parts = VKQ_ratio*ncols*D_padded;
__shared__ half KQ[mem_KQ >= mem_VKQ_parts ? mem_KQ : mem_VKQ_parts];
float * KQ_f = (float *) KQ;
half2 * KQ2 = (half2 *) KQ;
float KQ_rowsum_f[ncols/nwarps] = {0.0f};
float KQ_max_f[ncols/nwarps];
float KQ_max_scale_f[ncols/nwarps] = {0.0f};
#pragma unroll
for (int j = 0; j < ncols/nwarps; ++j) {
KQ_max_f[j] = -FLT_MAX/2.0f;
}
half2 KQ_rowsum_h2[ncols/nwarps] = {{0.0f, 0.0f}};
half2 KQ_max_h2[ncols/nwarps];
half2 KQ_max_scale_h2[ncols/nwarps] = {{0.0f, 0.0f}};
#pragma unroll
for (int j = 0; j < ncols/nwarps; ++j) {
KQ_max_h2[j] = make_half2(-HALF_MAX_HALF, -HALF_MAX_HALF);
}
__shared__ half VKQ[ncols*D_padded]; // Accumulator for final VKQ slice.
half2 * VKQ2 = (half2 *) VKQ;
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
if (i0 + WARP_SIZE > D/2 && i >= D/2) {
break;
}
VKQ2[j*(D_padded/2) + i] = make_half2(0.0f, 0.0f);
}
}
// Convert Q to half and apply scale, temporarily store in KQ:
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
if (i0 + WARP_SIZE > D && i >= D) {
break;
}
KQ[j*D_padded + i] = ic0 + j < ne01 ? Q_f[j*stride_Q + i] * scale : 0.0f;
}
}
__syncthreads();
// Load Q into tensor core fragments/registers since it will be used frequently:
#pragma unroll
for (int i0 = 0; i0 < D; i0 += 16) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += frag_n) {
nvcuda::wmma::load_matrix_sync(Q_b[i0/16][j0/frag_n], KQ + j0*D_padded + i0, D_padded);
}
}
__syncthreads();
// Iterate over ne11 == previous tokens:
for (int k_VKQ_0 = ip*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE) {
// Calculate tile of KQ:
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE; i_KQ_0 += KQ_stride_tc) {
frag_c_KQ KQ_c[ncols/frag_n];
#pragma unroll
for (int j = 0; j < ncols/frag_n; ++j) {
nvcuda::wmma::fill_fragment(KQ_c[j], 0.0f);
}
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) {
frag_a_K K_a;
nvcuda::wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
#pragma unroll
for (int j = 0; j < ncols/frag_n; ++j) {
nvcuda::wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]);
}
}
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += frag_n) {
nvcuda::wmma::store_matrix_sync((KQ_acc_t *) KQ + j0*kqs_padded + i_KQ_0 + frag_m*threadIdx.y, KQ_c[j0/frag_n], kqs_padded, nvcuda::wmma::mem_col_major);
}
}
__syncthreads();
// Calculate softmax for each KQ column using the current max. value.
// The divisor is stored in KQ_rowsum and will be applied at the end.
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (std::is_same<KQ_acc_t, float>::value) {
float KQ_f_tmp[FATTN_KQ_STRIDE / WARP_SIZE];
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
const int k = k0 + threadIdx.x;
KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k];
}
float KQ_max_new = KQ_max_f[j0/nwarps];
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
const int k = k0 + threadIdx.x;
KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]);
}
KQ_max_new = warp_reduce_max(KQ_max_new);
const float diff = KQ_max_f[j0/nwarps] - KQ_max_new;
KQ_max_scale_f[j0/nwarps] = expf(diff);
if (diff <= SOFTMAX_FTZ_THRESHOLD) {
KQ_max_scale_f[j0/nwarps] = 0.0f;
}
KQ_max_f[j0/nwarps] = KQ_max_new;
float KQ_rowsum_add = 0.0f;
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
const int k = k0 + threadIdx.x;
const float diff = KQ_f_tmp[k0/WARP_SIZE] - KQ_max_f[j0/nwarps];
KQ_f_tmp[k0/WARP_SIZE] = expf(diff);
if (diff <= SOFTMAX_FTZ_THRESHOLD) {
KQ_f_tmp[k0/WARP_SIZE] = 0.0f;
}
KQ_rowsum_add += KQ_f_tmp[k0/WARP_SIZE];
KQ[j*(kqar*kqs_padded) + k] = KQ_f_tmp[k0/WARP_SIZE];
}
KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add);
// Scale previous KQ_rowsum to account for a potential increase in KQ_max:
KQ_rowsum_f[j0/nwarps] = KQ_max_scale_f[j0/nwarps]*KQ_rowsum_f[j0/nwarps] + KQ_rowsum_add;
} else {
half2 KQ2_tmp[FATTN_KQ_STRIDE/(2*WARP_SIZE)];
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
const int k = k0 + threadIdx.x;
KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k];
}
half2 KQ_max_new = KQ_max_h2[j0/nwarps];
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
const int k = k0 + threadIdx.x;
KQ2_tmp[k0/WARP_SIZE] += mask ? mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
}
KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
const half2 diff = KQ_max_h2[j0/nwarps] - KQ_max_new;
KQ_max_scale_h2[j0/nwarps] = h2exp(diff);
const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD));
*((uint32_t *) &KQ_max_scale_h2[j0/nwarps]) &= ftz_mask;
KQ_max_h2[j0/nwarps] = KQ_max_new;
half2 KQ_rowsum_add = make_half2(0.0f, 0.0f);
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
const int k = k0 + threadIdx.x;
const half2 diff = KQ2_tmp[k0/WARP_SIZE] - KQ_max_h2[j0/nwarps];
KQ2_tmp[k0/WARP_SIZE] = h2exp(diff);
const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD));
*((uint32_t *) &KQ2_tmp[k0/WARP_SIZE]) &= ftz_mask;
KQ_rowsum_add += KQ2_tmp[k0/WARP_SIZE];
KQ2[j*(kqs_padded/2) + k] = KQ2_tmp[k0/WARP_SIZE];
}
KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add);
// Scale previous KQ_rowsum to account for a potential increase in KQ_max:
KQ_rowsum_h2[j0/nwarps] = KQ_max_scale_h2[j0/nwarps]*KQ_rowsum_h2[j0/nwarps] + KQ_rowsum_add;
}
}
__syncthreads();
frag_b KQ_b[FATTN_KQ_STRIDE/(VKQ_ratio*16)][ncols/frag_n];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += frag_n) {
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) {
const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
nvcuda::wmma::load_matrix_sync(
KQ_b[k0/(VKQ_ratio*16)][j0/frag_n],
KQ + j0*(kqar*kqs_padded) + k,
kqar*kqs_padded);
}
}
frag_c_VKQ VKQ_c[D/VKQ_stride][ncols/frag_n];
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D; i_VKQ_0 += VKQ_stride) {
#pragma unroll
for (int j = 0; j < ncols/frag_n; ++j) {
nvcuda::wmma::fill_fragment(VKQ_c[i_VKQ_0/VKQ_stride][j], 0.0f);
}
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) {
const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
frag_a_V v_a;
nvcuda::wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
#pragma unroll
for (int j = 0; j < ncols/frag_n; ++j) {
nvcuda::wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);
}
}
}
__syncthreads();
const int offset_k = (threadIdx.y % VKQ_ratio) * (ncols*D_padded);
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += VKQ_stride) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += frag_n) {
nvcuda::wmma::store_matrix_sync(
KQ + offset_k + j0*D_padded + i_KQ_0 + frag_m*(threadIdx.y/VKQ_ratio),
VKQ_c[i_KQ_0/VKQ_stride][j0/frag_n],
D_padded, nvcuda::wmma::mem_col_major);
}
}
__syncthreads();
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
half2 VKQ_scale;
if (std::is_same<KQ_acc_t, float>::value) {
VKQ_scale = make_half2(KQ_max_scale_f[j0/nwarps], KQ_max_scale_f[j0/nwarps]);
} else {
VKQ_scale = KQ_max_scale_h2[j0/nwarps];
}
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
if (i0 + WARP_SIZE > D/2 && i >= D/2) {
break;
}
half2 VKQ_add = make_half2(0.0f, 0.0f);
#pragma unroll
for (int l = 0; l < VKQ_ratio; ++l) {
VKQ_add += KQ2[l*(ncols*D_padded/2) + j*(D_padded/2) + i];
}
VKQ2[j*(D_padded/2) + i] = VKQ_scale*VKQ2[j*(D_padded/2) + i] + VKQ_add;
}
}
__syncthreads();
}
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j_VKQ = j0 + threadIdx.y;
if (ic0 + j_VKQ >= ne01) {
return;
}
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
float KQ_rowsum_j;
if (std::is_same<KQ_acc_t, float>::value) {
KQ_rowsum_j = KQ_rowsum_f[j0/nwarps];
} else {
KQ_rowsum_j = __low2float(KQ_rowsum_h2[j0/nwarps]) + __high2float(KQ_rowsum_h2[j0/nwarps]);
}
#pragma unroll
for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
if (i0 + WARP_SIZE > D && i >= D) {
break;
}
float dst_val = VKQ[j_VKQ*D_padded + i];
if (parallel_blocks == 1) {
dst_val /= KQ_rowsum_j;
}
dst[j_dst*gridDim.y*D + blockIdx.y*D + i] = dst_val;
}
if (parallel_blocks == 1 || threadIdx.x != 0) {
continue;
}
float2 dst_meta_val;
if (std::is_same<KQ_acc_t, float>::value) {
dst_meta_val.x = KQ_max_f[j0/nwarps];
} else {
dst_meta_val.x = __low2float(KQ_max_h2[j0/nwarps]);
}
dst_meta_val.y = KQ_rowsum_j;
dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = dst_meta_val;
}
#else
NO_DEVICE_CODE;
#endif // FP16_MMA_AVAILABLE
}
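Both kernels above (flash_attn_vec_ext_f16 with kqmax/kqsum/VKQ and flash_attn_ext_f16 with KQ_max/KQ_rowsum/VKQ2) perform the same online-softmax bookkeeping: when a new KQ tile raises the running row maximum, the accumulated row sum and the partial V-weighted accumulator are rescaled by the same factor before the new tile is added. As a sketch of the underlying recurrence for one row, with scores s_k from the current tile and value rows v_k:

$$
\begin{aligned}
m_{\text{new}} &= \max\big(m_{\text{old}},\ \max_k s_k\big) \\
p_k &= \exp(s_k - m_{\text{new}}) \quad \text{(flushed to 0 when } s_k - m_{\text{new}} \le \texttt{SOFTMAX\_FTZ\_THRESHOLD}\text{)} \\
r_{\text{new}} &= e^{m_{\text{old}} - m_{\text{new}}}\, r_{\text{old}} + \textstyle\sum_k p_k \\
\mathrm{acc}_{\text{new}} &= e^{m_{\text{old}} - m_{\text{new}}}\, \mathrm{acc}_{\text{old}} + \textstyle\sum_k p_k\, v_k
\end{aligned}
$$

The final output row is acc / r, applied either at the end of the kernel (parallel_blocks == 1) or later in flash_attn_combine_results.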
template<int D, int parallel_blocks> // D == head size
__launch_bounds__(D, 1)
static __global__ void flash_attn_combine_results(
const float * __restrict__ VKQ_parts,
const float2 * __restrict__ VKQ_meta,
float * __restrict__ dst) {
#if FP16_AVAILABLE
VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
VKQ_meta += parallel_blocks * gridDim.y*blockIdx.x;
dst += D * gridDim.y*blockIdx.x;
const int tid = threadIdx.x;
__builtin_assume(tid < D);
__shared__ float2 meta[parallel_blocks];
if (tid < 2*parallel_blocks) {
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
}
__syncthreads();
float kqmax = meta[0].x;
#pragma unroll
for (int l = 1; l < parallel_blocks; ++l) {
kqmax = max(kqmax, meta[l].x);
}
float VKQ_numerator = 0.0f;
float VKQ_denominator = 0.0f;
#pragma unroll
for (int l = 0; l < parallel_blocks; ++l) {
const float diff = meta[l].x - kqmax;
const float KQ_max_scale = expf(diff);
const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
*((uint32_t *) &KQ_max_scale) &= ftz_mask;
VKQ_numerator += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
VKQ_denominator += KQ_max_scale * meta[l].y;
}
dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
constexpr int get_max_power_of_2(int x) {
return x % 2 == 0 ? 2*get_max_power_of_2(x/2) : 1;
}
static_assert(get_max_power_of_2(1) == 1, "Test failed.");
static_assert(get_max_power_of_2(2) == 2, "Test failed.");
static_assert(get_max_power_of_2(4) == 4, "Test failed.");
static_assert(get_max_power_of_2(6) == 2, "Test failed.");
// Number of VKQ rows calculated in parallel:
constexpr int get_VKQ_stride(int D, int nwarps, int frag_m) {
return (get_max_power_of_2(D/frag_m) < nwarps ? get_max_power_of_2(D/frag_m) : nwarps)*frag_m;
}
static_assert(get_VKQ_stride(128, 1, 32) == 32, "Test failed.");
static_assert(get_VKQ_stride(128, 2, 32) == 64, "Test failed.");
static_assert(get_VKQ_stride(128, 4, 32) == 128, "Test failed.");
static_assert(get_VKQ_stride( 64, 1, 32) == 32, "Test failed.");
static_assert(get_VKQ_stride( 64, 2, 32) == 64, "Test failed.");
static_assert(get_VKQ_stride( 64, 4, 32) == 64, "Test failed.");
static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed.");
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
template <int D, int parallel_blocks> void launch_fattn_vec_f16(
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
ggml_cuda_pool & pool, cudaStream_t main_stream
) {
ggml_cuda_pool_alloc<float> dst_tmp(pool);
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
if (parallel_blocks > 1) {
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
}
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
const dim3 block_dim(WARP_SIZE, nwarps, 1);
const dim3 blocks_num(parallel_blocks*Q->ne[1], Q->ne[2], Q->ne[3]);
const int shmem = 0;
float scale;
memcpy(&scale, KQV->op_params, sizeof(float));
flash_attn_vec_ext_f16<D, parallel_blocks>
<<<blocks_num, block_dim, shmem, main_stream>>> (
(const char *) Q->data,
(const char *) K->data,
(const char *) V->data,
mask ? ((const char *) mask->data) : nullptr,
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
scale,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
Q->nb[1], Q->nb[2], Q->nb[3],
K->nb[1], K->nb[2], K->nb[3],
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
);
CUDA_CHECK(cudaGetLastError());
if (parallel_blocks == 1) {
return;
}
const dim3 block_dim_combine(D, 1, 1);
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
const int shmem_combine = 0;
flash_attn_combine_results<D, parallel_blocks>
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
CUDA_CHECK(cudaGetLastError());
}
template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename KQ_acc_t> void launch_fattn_f16_impl(
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
ggml_cuda_pool & pool, cudaStream_t main_stream
) {
ggml_cuda_pool_alloc<float> dst_tmp(pool);
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
if (parallel_blocks > 1) {
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
}
constexpr int frag_m = (cols_per_block) == 8 && (D) % 32 == 0 ? 32 : 16;
const dim3 block_dim(WARP_SIZE, nwarps, 1);
const dim3 blocks_num(parallel_blocks*(Q->ne[1] + cols_per_block - 1) / cols_per_block, Q->ne[2], Q->ne[3]);
const int shmem = 0;
float scale;
memcpy(&scale, KQV->op_params, sizeof(float));
flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>
<<<blocks_num, block_dim, shmem, main_stream>>> (
(const char *) Q->data,
(const char *) K->data,
(const char *) V->data,
mask ? ((const char *) mask->data) : nullptr,
(parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
scale,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
Q->nb[1], Q->nb[2], Q->nb[3],
K->nb[1], K->nb[2], K->nb[3],
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
);
CUDA_CHECK(cudaGetLastError());
if ((parallel_blocks) == 1) {
return;
}
const dim3 block_dim_combine(D, 1, 1);
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
const int shmem_combine = 0;
flash_attn_combine_results<D, parallel_blocks>
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
CUDA_CHECK(cudaGetLastError());
}
template <int D, int cols_per_block, int nwarps, typename KQ_acc_t> void launch_fattn_f16(
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
const int nsm, ggml_cuda_pool & pool, cudaStream_t main_stream
) {
const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
if (4*blocks_num_pb1 < 2*nsm) {
launch_fattn_f16_impl<D, cols_per_block, nwarps, 4, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream);
return;
}
if (2*blocks_num_pb1 < 2*nsm) {
launch_fattn_f16_impl<D, cols_per_block, nwarps, 2, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream);
return;
}
launch_fattn_f16_impl<D, cols_per_block, nwarps, 1, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream);
}
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const ggml_tensor * mask = dst->src[3];
ggml_tensor * KQV = dst;
GGML_ASSERT(Q->type == GGML_TYPE_F32);
GGML_ASSERT(K->type == GGML_TYPE_F16);
GGML_ASSERT(V->type == GGML_TYPE_F16);
GGML_ASSERT(KQV->type == GGML_TYPE_F32);
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
ggml_cuda_set_device(ctx.device);
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
const int32_t precision = KQV->op_params[1];
if (precision != GGML_PREC_DEFAULT) {
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
constexpr int cols_per_block = 16;
constexpr int nwarps = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_f16< 64, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 80:
launch_fattn_f16< 80, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 96:
launch_fattn_f16< 96, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 112:
launch_fattn_f16<112, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_f16<128, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 256:
launch_fattn_f16<256, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
} else {
constexpr int cols_per_block = 32;
constexpr int nwarps = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_f16< 64, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 80:
launch_fattn_f16< 80, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 96:
launch_fattn_f16< 96, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 112:
launch_fattn_f16<112, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_f16<128, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
// case 256:
// launch_fattn_f16<256, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
// break;
default:
GGML_ASSERT(false);
break;
}
}
return;
}
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 256:
launch_fattn_vec_f16<256, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 8 && Q->ne[0] % WARP_SIZE == 0) {
constexpr int cols_per_block = 8;
constexpr int nwarps = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 96:
launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 256:
launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 32) {
constexpr int cols_per_block = 16;
constexpr int nwarps = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 80:
launch_fattn_f16< 80, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 96:
launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 112:
launch_fattn_f16<112, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 256:
launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
constexpr int cols_per_block = 32;
constexpr int nwarps = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 80:
launch_fattn_f16< 80, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 96:
launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 112:
launch_fattn_f16<112, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
case 256:
launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}

3
ggml-cuda/fattn.cuh Normal file
View file

@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -1,7 +1,17 @@
#include "softmax.cuh" #include "softmax.cuh"
template <bool vals_smem, int ncols_template, int block_size_template> template <typename T>
static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) { static __device__ __forceinline__ float t2f32(T val) {
return (float) val;
}
template <>
__device__ float __forceinline__ t2f32<half>(half val) {
return __half2float(val);
}
template <bool vals_smem, int ncols_template, int block_size_template, typename T>
static __global__ void soft_max_f32(const float * x, const T * mask, const T * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
const int ncols = ncols_template == 0 ? ncols_par : ncols_template; const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
const int tid = threadIdx.x; const int tid = threadIdx.x;
@ -28,7 +38,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
extern __shared__ float data_soft_max_f32[]; extern __shared__ float data_soft_max_f32[];
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
// shared memory buffer to cache values between iterations: // shared memory buffer to cache values between iterations:
float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols; float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;
float max_val = -INFINITY; float max_val = -INFINITY;
@ -40,10 +50,10 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
break; break;
} }
const int ix = rowx*ncols + col; const int64_t ix = (int64_t)rowx*ncols + col;
const int iy = rowy*ncols + col; const int64_t iy = (int64_t)rowy*ncols + col;
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f); const float val = x[ix]*scale + (mask ? t2f32(mask[iy]) : 0.0f) + (pos ? slope*t2f32(pos[col]) : 0.0f);
vals[col] = val; vals[col] = val;
max_val = max(max_val, val); max_val = max(max_val, val);
@ -109,12 +119,13 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
return; return;
} }
const int idst = rowx*ncols + col; const int64_t idst = (int64_t)rowx*ncols + col;
dst[idst] = vals[col] * inv_sum; dst[idst] = vals[col] * inv_sum;
} }
} }
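The int64_t casts introduced in this hunk matter once the soft-max rows get large: before the change the flat index rowx*ncols is formed in 32-bit arithmetic and can overflow. A small self-contained illustration with assumed (not measured) shapes:

#include <cstdint>
#include <cstdio>

// Why the indices above were widened: with a wide enough row, rowx*ncols no longer fits in
// a 32-bit int. The shapes below are illustrative assumptions, not taken from a real model.
int main() {
    const int64_t rowx  = 70000;  // e.g. batch * heads * rows processed by the kernel
    const int64_t ncols = 65536;  // e.g. a long KV width per row
    const int64_t idx   = rowx * ncols;
    printf("flat index = %lld, INT32_MAX = %d -> 32-bit overflow: %s\n",
           (long long) idx, INT32_MAX, idx > INT32_MAX ? "yes" : "no");
    return 0;
}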
static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) { template<typename T>
static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
int nth = WARP_SIZE; int nth = WARP_SIZE;
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
const dim3 block_dims(nth, 1, 1); const dim3 block_dims(nth, 1, 1);
@ -167,15 +178,19 @@ static void soft_max_f32_cuda(const float * x, const float * mask, const float *
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * src2 = dst->src[2];
const float * src0_d = (const float *)src0->data; const float * src0_d = (const float *)src0->data;
const float * src1_d = src1 ? (const float *)src1->data : nullptr; const void * src1_d = src1 ? (const void *)src1->data : nullptr;
float * dst_d = (float *)dst->data; float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t nrows_x = ggml_nrows(src0); const int64_t nrows_x = ggml_nrows(src0);
@ -188,14 +203,25 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
// positions tensor // positions tensor
float * src2_dd = nullptr; void * src2_d = nullptr;
ggml_tensor * src2 = dst->src[2];
const bool use_src2 = src2 != nullptr; const bool use_src2 = src2 != nullptr;
if (use_src2) { if (use_src2) {
src2_dd = (float *)src2->data; src2_d = (void *)src2->data;
} }
soft_max_f32_cuda(src0_d, src1_d, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
if (use_f16) {
const half * src1_dd = (const half *)src1_d;
const half * src2_dd = (const half *)src2_d;
soft_max_f32_cuda(src0_d, src1_dd, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
} else {
const float * src1_dd = (const float *)src1_d;
const float * src2_dd = (const float *)src2_d;
soft_max_f32_cuda(src0_d, src1_dd, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
}
} }

View file

@ -313,7 +313,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
#endif // defined(__ARM_NEON) #endif // defined(__ARM_NEON)
#if defined(__ARM_NEON) && !defined(__MSC_VER) #if defined(__ARM_NEON) && !defined(_MSC_VER)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

View file

@ -1427,6 +1427,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
for (int i = node_start; i < node_end; ++i) { for (int i = node_start; i < node_end; ++i) {
struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src0 = gf->nodes[i]->src[0];
struct ggml_tensor * src1 = gf->nodes[i]->src[1]; struct ggml_tensor * src1 = gf->nodes[i]->src[1];
struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2);
struct ggml_tensor * dst = gf->nodes[i]; struct ggml_tensor * dst = gf->nodes[i];
GGML_ASSERT(dst->data != nullptr); GGML_ASSERT(dst->data != nullptr);
@ -1559,6 +1560,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
{ {
float scale; float scale;
memcpy(&scale, dst->op_params, sizeof(float)); memcpy(&scale, dst->op_params, sizeof(float));
#pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
GGML_ASSERT(src2 == nullptr);
ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale); ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
} break; } break;
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:

View file

@ -46,8 +46,10 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,
GGML_METAL_KERNEL_TYPE_SILU, GGML_METAL_KERNEL_TYPE_SILU,
GGML_METAL_KERNEL_TYPE_SILU_4, GGML_METAL_KERNEL_TYPE_SILU_4,
GGML_METAL_KERNEL_TYPE_SOFT_MAX, GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,
GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4,
GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,
GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,
GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,
@ -177,6 +179,14 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,
GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
@ -443,7 +453,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
} }
/* /*
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ GGML_METAL_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
(int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
(int) kernel->pipeline.threadExecutionWidth); \ (int) kernel->pipeline.threadExecutionWidth); \
*/ */
@ -459,7 +469,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
return NULL; \ return NULL; \
} \ } \
} else { \ } else { \
GGML_METAL_LOG_WARN("%s: skipping %-32s (not supported)\n", __func__, "kernel_"#name); \ GGML_METAL_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
} }
// simd_sum and simd_max requires MTLGPUFamilyApple7 // simd_sum and simd_max requires MTLGPUFamilyApple7
@ -481,8 +491,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX, soft_max, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, soft_max_4, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4, soft_max_f32_4, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, diag_mask_inf_8, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, diag_mask_inf_8, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, get_rows_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, get_rows_f32, true);
@ -612,6 +624,14 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
@ -743,6 +763,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
case GGML_OP_FLASH_ATTN_EXT:
return true; return true;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
@ -1326,20 +1347,33 @@ static enum ggml_status ggml_metal_graph_compute(
} break; } break;
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
{ {
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32);
int nth = 32; // SIMD width int nth = 32; // SIMD width
id<MTLComputePipelineState> pipeline = nil; id<MTLComputePipelineState> pipeline = nil;
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
if (ne00%4 == 0) { if (ne00%4 == 0) {
while (nth < ne00/4 && nth < 256) { while (nth < ne00/4 && nth < 256) {
nth *= 2; nth *= 2;
} }
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline; if (use_f16) {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4].pipeline;
} else {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
}
} else { } else {
while (nth < ne00 && nth < 1024) { while (nth < ne00 && nth < 1024) {
nth *= 2; nth *= 2;
} }
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline; if (use_f16) {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16].pipeline;
} else {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32].pipeline;
}
} }
float scale; float scale;
@ -2503,6 +2537,161 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_OP_FLASH_ATTN_EXT:
{
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(src0->type == GGML_TYPE_F32);
struct ggml_tensor * src3 = gf->nodes[i]->src[3];
GGML_ASSERT(ggml_are_same_shape(src1, src2));
GGML_ASSERT(src3);
size_t offs_src3 = 0;
id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
GGML_ASSERT(!src3 || src3->type == GGML_TYPE_F16);
GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
const int64_t ne31 = src3 ? src3->ne[1] : 0;
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33);
const uint64_t nb30 = src3 ? src3->nb[0] : 0; GGML_UNUSED(nb30);
const uint64_t nb31 = src3 ? src3->nb[1] : 0;
const uint64_t nb32 = src3 ? src3->nb[2] : 0; GGML_UNUSED(nb32);
const uint64_t nb33 = src3 ? src3->nb[3] : 0; GGML_UNUSED(nb33);
const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
float scale;
memcpy(&scale, dst->op_params, sizeof(float));
id<MTLComputePipelineState> pipeline = nil;
bool use_vec_kernel = false;
if (ne01 >= 4 || (ne00%128 != 0)) {
switch (ne00) {
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break;
case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
default:
{
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
GGML_METAL_LOG_ERROR("add template specialization for this size\n");
GGML_ASSERT(false && "add template specialization for this size");
}
}
} else {
use_vec_kernel = true;
switch (ne00) {
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
default:
{
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
GGML_METAL_LOG_ERROR("add template specialization for this size\n");
GGML_ASSERT(false && "add template specialization for this size");
}
}
}
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
[encoder setBuffer:id_dst offset:offs_dst atIndex:4];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:6];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:7];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:8];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:10];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:11];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:12];
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:14];
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:15];
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:16];
[encoder setBytes:&nb10 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18];
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19];
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20];
[encoder setBytes:&ne31 length:sizeof( int64_t) atIndex:21];
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:22];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:23];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:24];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:25];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26];
[encoder setBytes:&scale length:sizeof( float) atIndex:27];
if (!use_vec_kernel) {
// half8x8 kernel
const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !!
const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
GGML_ASSERT(nqptg <= 32);
GGML_ASSERT(nqptg % 8 == 0);
GGML_ASSERT(ncpsg % 32 == 0);
int64_t nsgmax = 2;
while (true) {
const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2);
if (smem > ctx->device.maxThreadgroupMemoryLength) {
break;
}
nsgmax *= 2;
}
nsgmax /= 2;
// simdgroups per threadgroup (a.k.a. warps)
const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2);
//printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength);
GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
[encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
} else {
// half1x4 kernel
const int64_t nqptg = 1; // queries per threadgroup !! sync with kernel template arguments !!
const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
GGML_ASSERT(nqptg <= 32);
GGML_ASSERT(nqptg % 1 == 0);
GGML_ASSERT(ncpsg % 32 == 0);
// simdgroups per threadgroup (a.k.a. warps)
const int64_t nsgt = MAX(2, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32));
int64_t nsg = 1;
while (nsg <= nsgt) {
nsg *= 2;
}
nsg /= 2;
const size_t smem = (nqptg*(ne00 + 2*nsg*(ncpsg + nqptg)) + nsg*ne00)*(sizeof(float)/2);
//printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength);
GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
[encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
}
} break;
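To make the threadgroup-memory formula in the branch above concrete, here is the arithmetic for one assumed configuration of the half8x8 path (head size 128, 8 queries per threadgroup, 32 cache values per simdgroup, 4 simdgroups); the numbers are illustrative, and the actual maxThreadgroupMemoryLength depends on the device:

#include <cstdio>

int main() {
    // assumed configuration: D = 128, nqptg = 8, ncpsg = 32, nsg = 4 simdgroups
    const long D = 128, nqptg = 8, ncpsg = 32, nsg = 4;
    const size_t smem = nqptg*(D + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2); // buffer holds half values (2 bytes each)
    printf("threadgroup memory = %zu bytes\n", smem); // 7168 bytes, comfortably below a 32 KiB threadgroup limit
    return 0;
}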
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_CONT: case GGML_OP_CONT:
@ -2590,6 +2779,11 @@ static enum ggml_status ggml_metal_graph_compute(
MTLCommandBufferStatus status = [command_buffer status]; MTLCommandBufferStatus status = [command_buffer status];
if (status != MTLCommandBufferStatusCompleted) { if (status != MTLCommandBufferStatusCompleted) {
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
if (status == MTLCommandBufferStatusError) {
NSString * error_code = [command_buffer error].localizedDescription;
GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
}
return GGML_STATUS_FAILED; return GGML_STATUS_FAILED;
} }
} }
@ -2706,10 +2900,13 @@ GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backe
UNUSED(buft); UNUSED(buft);
} }
static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device) { static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
#ifndef GGML_METAL_NDEBUG
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
if (@available(macOS 10.12, iOS 16.0, *)) { if (@available(macOS 10.12, iOS 16.0, *)) {
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
__func__,
size_aligned / 1024.0 / 1024.0,
device.currentAllocatedSize / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0,
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@ -2719,10 +2916,15 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device) {
GGML_METAL_LOG_INFO("\n"); GGML_METAL_LOG_INFO("\n");
} }
} else { } else {
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
__func__,
size_aligned / 1024.0 / 1024.0,
device.currentAllocatedSize / 1024.0 / 1024.0);
} }
#endif
#endif #endif
UNUSED(device); UNUSED(device);
UNUSED(size_aligned);
} }
GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@ -2756,8 +2958,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
return NULL; return NULL;
} }
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); //ggml_backend_metal_log_allocated_size(device, size_aligned);
ggml_backend_metal_log_allocated_size(device);
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
} }
@ -2844,7 +3045,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
return false; return false;
} }
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); ggml_backend_metal_log_allocated_size(device, size_aligned);
++ctx->n_buffers; ++ctx->n_buffers;
} else { } else {
@ -2867,7 +3068,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
return false; return false;
} }
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i); ggml_backend_metal_log_allocated_size(device, size_step_aligned);
if (i + size_step < size) { if (i + size_step < size) {
GGML_METAL_LOG_INFO("\n"); GGML_METAL_LOG_INFO("\n");
} }
@ -2876,8 +3078,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
} }
} }
ggml_backend_metal_log_allocated_size(device);
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
} }

View file

@ -352,11 +352,12 @@ kernel void kernel_sum_rows(
dst_row[0] = row_sum; dst_row[0] = row_sum;
} }
template<typename T>
kernel void kernel_soft_max( kernel void kernel_soft_max(
device const float * src0, device const char * src0,
device const float * src1, device const char * src1,
device const float * src2, device const char * src2,
device float * dst, device char * dst,
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
constant int64_t & ne02, constant int64_t & ne02,
@ -375,10 +376,10 @@ kernel void kernel_soft_max(
const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr; device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00 : nullptr;
device const float * ppos = src2 != src0 ? src2 : nullptr; device const T * ppos = src2 != src0 ? (device const T *) src2 : nullptr;
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device float * pdst = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
float slope = 0.0f; float slope = 0.0f;
@ -456,11 +457,12 @@ kernel void kernel_soft_max(
} }
} }
template<typename T>
kernel void kernel_soft_max_4( kernel void kernel_soft_max_4(
device const float * src0, device const char * src0,
device const float * src1, device const char * src1,
device const float * src2, device const char * src2,
device float * dst, device char * dst,
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
constant int64_t & ne02, constant int64_t & ne02,
@ -479,10 +481,10 @@ kernel void kernel_soft_max_4(
const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 + i01*ne00) : nullptr; device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00/4 : nullptr;
device const float4 * ppos = src2 != src0 ? (device const float4 *)(src2) : nullptr; device const T * ppos = src2 != src0 ? (device const T *) src2 : nullptr;
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
float slope = 0.0f; float slope = 0.0f;
@ -499,7 +501,7 @@ kernel void kernel_soft_max_4(
float4 lmax4 = -INFINITY; float4 lmax4 = -INFINITY;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)); lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)));
} }
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@ -525,7 +527,7 @@ kernel void kernel_soft_max_4(
// parallel sum // parallel sum
float4 lsum4 = 0.0f; float4 lsum4 = 0.0f;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val); const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f))) - max_val);
lsum4 += exp_psrc4; lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4; pdst4[i00] = exp_psrc4;
} }
@ -562,6 +564,14 @@ kernel void kernel_soft_max_4(
} }
} }
typedef decltype(kernel_soft_max<float>) kernel_soft_max_t;
typedef decltype(kernel_soft_max_4<float4>) kernel_soft_max_4_t;
template [[host_name("kernel_soft_max_f16")]] kernel kernel_soft_max_t kernel_soft_max<half>;
template [[host_name("kernel_soft_max_f32")]] kernel kernel_soft_max_t kernel_soft_max<float>;
template [[host_name("kernel_soft_max_f16_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4<half4>;
template [[host_name("kernel_soft_max_f32_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4<float4>;
kernel void kernel_diag_mask_inf( kernel void kernel_diag_mask_inf(
device const float * src0, device const float * src0,
device float * dst, device float * dst,
@ -2084,6 +2094,632 @@ kernel void kernel_leaky_relu_f32(
dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope; dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope;
} }
typedef void (flash_attn_ext_f16_t)(
device const char * q,
device const char * k,
device const char * v,
device const char * mask,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne31,
constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale,
threadgroup half * shared,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]);
// ref: https://arxiv.org/pdf/2307.08691.pdf
template<int64_t D, int64_t Q = 8, int64_t C = 32> // head size, queries per threadgroup, cache items per threadgroup
kernel void kernel_flash_attn_ext_f16(
device const char * q,
device const char * k,
device const char * v,
device const char * mask,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne31,
constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale,
threadgroup half * shared [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
const short nsg = ntg.y; // number of simdgroups
const short iq3 = tgpig[2];
const short iq2 = tgpig[1];
const short iq1 = tgpig[0]*Q;
const short D4 = D/4;
const short D8 = D/8;
const short Q8 = Q/8;
const short NW = N_SIMDWIDTH;
const short SH = (C + Q); // shared memory per simdgroup in (half)
const short T = D + 2*nsg*SH; // shared memory size per query in (half)
const short TF = T/2; // shared memory size per query in (float)
const short T4 = T/4; // shared memory size per query in (half4)
threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data
threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4
threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
// store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
simdgroup_half8x8 lo[D8];
// load heads from Q to shared memory
for (short j = sgitg; j < Q; j += nsg) {
device const float4 * q4 = (device const float4 *) ((device const char *) q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03));
for (short i = tiisg; i < D4; i += NW) {
if (iq1 + j < ne01) {
sq4[j*T4 + i] = (half4) q4[i];
} else {
sq4[j*T4 + i] = 0.0h;
}
}
}
// zero out lo
for (short i = 0; i < D8; ++i) {
lo[i] = make_filled_simdgroup_matrix<half, 8>(0.0h);
}
// zero out shared memory SH
for (short j = 0; j < Q; ++j) {
for (short i = tiisg; i < SH; i += NW) {
ss[j*TF + i] = 0.0f;
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
{
float S[Q] = { [0 ... Q-1] = 0.0h };
float M[Q] = { [0 ... Q-1] = -FLT_MAX/2 };
// assume K and V are same shape
const short ne22 = ne12;
const short ne23 = ne13;
const uint nb21 = nb11;
const uint nb22 = nb12;
const uint nb23 = nb13;
// broadcast
const short rk2 = ne02/ne12;
const short rk3 = ne03/ne13;
const short rv2 = ne02/ne22;
const short rv3 = ne03/ne23;
// k indices
const short ik2 = iq2/rk2;
const short ik3 = iq3/rk3;
// v indices
const short iv2 = iq2/rv2;
const short iv3 = iq3/rv3;
// load the queries from shared memory into local memory
simdgroup_half8x8 mq[D8];
for (short i = 0; i < D8; ++i) {
simdgroup_load(mq[i], sq + i*8, T);
}
// pointer to the mask
device const half * mp = (device const half *) (mask + iq1*nb31);
// prepare diagonal scale matrix
simdgroup_float8x8 mscale(scale);
// loop over the KV cache
// each simdgroup handles blocks of Q rows and C columns
for (int ic0 = 0; ic0 < ne11; ic0 += C*nsg) {
const int ic = ic0 + C*sgitg;
if (ic >= ne11) {
break;
}
// Q*K^T
{
for (short cc = 0; cc < C/8; ++cc) {
simdgroup_float8x8 mqk = make_filled_simdgroup_matrix<float, 8>(0.h);
device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13));
for (short i = 0; i < D8; ++i) {
simdgroup_half8x8 mk;
simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
}
// mqk = mqk*scale + mask
simdgroup_half8x8 mm;
simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
}
}
// used to detect blocks full of -INF
float smax = -INFINITY;
// online softmax
{
float ms[Q];
for (short j = 0; j < Q; ++j) {
const short p = tiisg;
const float m = M[j];
const float s = ss[j*TF + p];
smax = simd_max(max(smax, s));
M[j] = simd_max(max(M[j], s));
ms[j] = exp(m - M[j]);
const float vs = exp(s - M[j]);
S[j] = S[j]*ms[j] + simd_sum(vs);
// the P matrix from the paper (Q rows, C columns)
ss[j*TF + p] = vs;
}
// create a QxQ diagonal matrix for rescaling the output
if (tiisg < Q) {
ss[tiisg*TF + C + tiisg] = ms[tiisg];
}
}
// skip -INF blocks
if (smax == -INFINITY) {
continue;
}
// O = diag(ms)*O
{
simdgroup_float8x8 mm;
simdgroup_load(mm, ss + C, TF, 0, false);
for (short i = 0; i < D8; ++i) {
simdgroup_multiply(lo[i], mm, lo[i]);
}
}
// O = O + (Q*K^T)*V
{
for (short cc = 0; cc < C/8; ++cc) {
device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23));
for (short i = 0; i < D8; ++i) {
simdgroup_half8x8 mk;
simdgroup_load(mk, pv + i*8, nb21/sizeof(half), 0, false);
simdgroup_float8x8 mv;
simdgroup_load(mv, ss + 8*cc, TF, 0, false);
simdgroup_multiply_accumulate(lo[i], mv, mk, lo[i]);
}
}
}
}
// these are needed for reducing the results from the simdgroups (reuse the ss buffer)
for (short j = 0; j < Q; ++j) {
if (tiisg == 0) {
ss[j*TF + 0] = S[j];
ss[j*TF + 1] = M[j];
}
}
}
// reduce the warps sequentially
for (short sg = 1; sg < nsg; ++sg) {
float S = { 0.0h };
float M = { -FLT_MAX/2 };
threadgroup_barrier(mem_flags::mem_threadgroup);
// each simdgroup stores its output to shared memory, reusing sq
if (sgitg == sg) {
for (short i = 0; i < D8; ++i) {
simdgroup_store(lo[i], sq + i*8, T, 0, false);
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// the first simdgroup accumulates the results from the other simdgroups
if (sgitg == 0) {
for (short j = 0; j < Q; ++j) {
const float S0 = ss[j*TF + 0];
const float S1 = ss[j*TF + sg*SH + 0];
const float M0 = ss[j*TF + 1];
const float M1 = ss[j*TF + sg*SH + 1];
M = max(M0, M1);
const float ms0 = exp(M0 - M);
const float ms1 = exp(M1 - M);
S = S0*ms0 + S1*ms1;
if (tiisg == 0) {
ss[j*TF + 0] = S;
ss[j*TF + 1] = M;
ss[j*TF + C + j ] = ms0;
ss[j*TF + C + j + sg*SH] = ms1;
}
}
// O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
{
simdgroup_half8x8 t;
simdgroup_float8x8 ms0;
simdgroup_float8x8 ms1;
simdgroup_load(ms0, ss + C, TF, 0, false);
simdgroup_load(ms1, ss + C + sg*SH, TF, 0, false);
for (short i = 0; i < D8; ++i) {
simdgroup_load (t, sq + i*8, T, 0, false);
simdgroup_multiply(t, ms1, t);
simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t);
}
}
}
}
// store result to shared memory (reuse sq)
if (sgitg == 0) {
for (short i = 0; i < D8; ++i) {
simdgroup_store(lo[i], sq + i*8, T, 0, false);
}
}
device float4 * dst4 = (device float4 *) dst;
// final rescale with 1/S and store to global memory
if (sgitg == 0) {
for (short j = 0; j < Q && iq1 + j < ne01; ++j) {
const float S = ss[j*TF + 0];
for (short i = tiisg; i < D4; i += NW) {
dst4[(iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + i] = (float4) sq4[j*T4 + i]/S;
}
}
}
}
template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<64>;
template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<80>;
template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96>;
template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112>;
template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>;
template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>;
template<int64_t D, int64_t Q = 1, int64_t C = 32> // head size, queries per threadgroup, cache items per threadgroup
kernel void kernel_flash_attn_ext_vec_f16(
device const char * q,
device const char * k,
device const char * v,
device const char * mask,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne31,
constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale,
threadgroup half * shared [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
const short nsg = ntg.y; // number of simdgroups
const short iq3 = tgpig[2];
const short iq2 = tgpig[1];
const short iq1 = tgpig[0];
const short D4 = D/4;
const short NW = N_SIMDWIDTH;
const short SH = (C + Q); // shared memory per simdgroup in (half)
const short T = D + 2*nsg*SH; // shared memory size per query in (half)
//threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data
threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4
threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
threadgroup float4 * ss4 = (threadgroup float4 *) (shared + 2*sgitg*SH + 1*D); // same as above but in float4
threadgroup half4 * sr4 = (threadgroup half4 *) (shared + sgitg*D + 1*T); // scratch buffer for the results
// store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
half4 lo[D4/NW];
// load heads from Q to shared memory
device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*nb01 + iq2*nb02 + iq3*nb03));
for (short i = tiisg; i < D4; i += NW) {
if (iq1 < ne01) {
sq4[i] = (half4) q4[i];
} else {
sq4[i] = 0.0h;
}
}
// zero out lo
for (short i = tiisg; i < D4; i += NW) {
lo[i/NW] = 0.0h;
}
// zero out shared memory SH
for (short i = tiisg; i < SH/4; i += NW) {
ss4[i] = 0.0h;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
{
float S = { 0.0h };
float M = { -FLT_MAX/2 };
// assume K and V are same shape
const short ne22 = ne12;
const short ne23 = ne13;
const uint nb21 = nb11;
const uint nb22 = nb12;
const uint nb23 = nb13;
// broadcast
const short rk2 = ne02/ne12;
const short rk3 = ne03/ne13;
const short rv2 = ne02/ne22;
const short rv3 = ne03/ne23;
// k indices
const short ik2 = iq2 / rk2;
const short ik3 = iq3 / rk3;
// v indices
const short iv2 = iq2 / rv2;
const short iv3 = iq3 / rv3;
// load the queries from shared memory into local memory
half4 mq[D4];
for (short ii = 0; ii < D4; ii += NW) {
short i = ii + tiisg;
mq[i] = sq4[i];
}
// pointer to the mask
device const half4 * mp4 = (device const half4 *) (mask + iq1*nb31);
// loop over the KV cache
// each simdgroup handles blocks of Q rows and C columns
for (int ic0 = 0; ic0 < ne11; ic0 += C*nsg) {
const int ic = ic0 + C*sgitg;
if (ic >= ne11) {
break;
}
// Q*K^T
{
#pragma unroll
for (short cc = 0; cc < C/4; ++cc) {
float4 mqk = { 0.0h };
device const half4 * pk4 = (device const half4 *) ((device const char *) k + ((ic + 4*cc)*nb11 + ik2*nb12 + ik3*nb13));
#pragma unroll
for (short ii = 0; ii < D4; ii += NW) {
const short i = ii + tiisg;
half4x4 mk;
mk[0] = pk4[i + 0*(nb11/8)];
mk[1] = pk4[i + 1*(nb11/8)];
mk[2] = pk4[i + 2*(nb11/8)];
mk[3] = pk4[i + 3*(nb11/8)];
mqk += (float4) (mq[i] * mk);
}
// reduce the results from the threads in the simdgroup
mqk += simd_shuffle_down(mqk, 16);
mqk += simd_shuffle_down(mqk, 8);
mqk += simd_shuffle_down(mqk, 4);
mqk += simd_shuffle_down(mqk, 2);
mqk += simd_shuffle_down(mqk, 1);
// mqk = mqk*scale + mask
if (tiisg == 0) {
float4 mm = (float4) mp4[ic/4 + cc];
mqk = mqk*scale + mm;
ss4[cc] = mqk;
}
}
}
// online softmax
{
const short p = tiisg;
const float m = M;
const float s = ss[p];
M = simd_max(max(M, s));
const float ms = exp(m - M);
const float vs = exp(s - M);
S = S*ms + simd_sum(vs);
// the P matrix from the paper (Q rows, C columns)
ss[p] = vs;
// O = diag(ms)*O
#pragma unroll
for (short ii = 0; ii < D4; ii += NW) {
const short i = ii + tiisg;
lo[i/NW] *= ms;
}
}
// O = O + (Q*K^T)*V
{
#pragma unroll
for (short cc = 0; cc < C/4; ++cc) {
device const half4 * pv4 = (device const half4 *) ((device const char *) v + ((ic + 4*cc)*nb21 + iv2*nb22 + iv3*nb23));
#pragma unroll
for (short ii = 0; ii < D4; ii += NW) {
const short i = ii + tiisg;
lo[i/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0];
lo[i/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1];
lo[i/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2];
lo[i/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3];
}
}
}
}
// these are needed for reducing the results from the simdgroups (reuse the ss buffer)
if (tiisg == 0) {
ss[0] = S;
ss[1] = M;
}
}
// store results to shared memory
for (short ii = 0; ii < D4; ii += NW) {
short i = ii + tiisg;
sr4[i] = lo[ii/NW];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// parallel reduce
for (short r = nsg/2; r > 0; r >>= 1) {
if (sgitg < r) {
const float S0 = ss[ 0];
const float S1 = ss[r*SH + 0];
const float M0 = ss[ 1];
const float M1 = ss[r*SH + 1];
const float M = max(M0, M1);
const float ms0 = exp(M0 - M);
const float ms1 = exp(M1 - M);
const float S = S0*ms0 + S1*ms1;
if (tiisg == 0) {
ss[0] = S;
ss[1] = M;
}
// O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
for (short ii = 0; ii < D4; ii += NW) {
short i = ii + tiisg;
sr4[i] = sr4[i]*ms0 + sr4[i + r*D4]*ms1;
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
device float4 * dst4 = (device float4 *) dst;
// final rescale with 1/S and store to global memory
if (sgitg == 0) {
const float S = ss[0];
for (short ii = 0; ii < D4; ii += NW) {
short i = ii + tiisg;
dst4[(iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D4 + i] = (float4) sr4[i]/S;
}
}
}
template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>;
template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>;
kernel void kernel_cpy_f16_f16( kernel void kernel_cpy_f16_f16(
device const half * src0, device const half * src0,
device half * dst, device half * dst,

View file

@ -12383,3 +12383,287 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k)
block_iq2_s * restrict y = vy; block_iq2_s * restrict y = vy;
quantize_row_iq2_s_reference(x, y, k); quantize_row_iq2_s_reference(x, y, k);
} }
static bool validate_float(float f, size_t i) {
if (isinf(f)) {
fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
return false;
}
if (isnan(f)) {
fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
return false;
}
return true;
}
static bool isinf_fp16(ggml_fp16_t f) {
return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) == 0;
}
static bool isnan_fp16(ggml_fp16_t f) {
return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0;
}
static bool validate_fp16(ggml_fp16_t f, size_t i) {
if (isinf_fp16(f)) {
fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
return false;
}
if (isnan_fp16(f)) {
fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
return false;
}
return true;
}
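For reference, ggml_fp16_t is treated here as a raw IEEE-754 half-precision bit pattern: bits 10-14 hold the exponent and bits 0-9 the mantissa, so an all-ones exponent (mask 0x7c00) means infinity when the mantissa is zero and NaN otherwise. A small standalone check over a few hand-picked bit patterns (the constants are illustrative):

#include <cstdint>
#include <cstdio>

// Same classification as isinf_fp16/isnan_fp16 above, applied to known fp16 bit patterns.
static bool is_inf_fp16(uint16_t f) { return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) == 0; }
static bool is_nan_fp16(uint16_t f) { return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0; }

int main() {
    const uint16_t pos_inf = 0x7c00; // exponent all ones, mantissa zero
    const uint16_t neg_inf = 0xfc00; // same, with the sign bit set
    const uint16_t a_nan   = 0x7e01; // exponent all ones, mantissa non-zero
    const uint16_t one     = 0x3c00; // 1.0 in half precision
    printf("inf? %d %d %d %d\n", is_inf_fp16(pos_inf), is_inf_fp16(neg_inf), is_inf_fp16(a_nan), is_inf_fp16(one));
    printf("nan? %d %d %d %d\n", is_nan_fp16(pos_inf), is_nan_fp16(neg_inf), is_nan_fp16(a_nan), is_nan_fp16(one));
    return 0;
}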
#define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \
const type * q = (const type *) (data); \
for (size_t i = 0; i < (nb); ++i) { \
if (!validate_fp16(q[i].d, i)) { \
return false; \
} \
}
#define VALIDATE_ROW_DATA_DM_F16_IMPL(type, data, nb, d, m) \
const type * q = (const type *) (data); \
for (size_t i = 0; i < (nb); ++i) { \
if (!validate_fp16(q[i].d, i) || !validate_fp16(q[i].m, i)) { \
return false; \
} \
}
bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
if (type < 0 || type >= GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid type %d\n", __func__, type);
return false;
}
if (nbytes % ggml_type_size(type) != 0) {
fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type);
return false;
}
const size_t nb = nbytes/ggml_type_size(type);
switch (type) {
case GGML_TYPE_F16:
{
const ggml_fp16_t * f = (const ggml_fp16_t *) data;
size_t i = 0;
#if defined(__AVX2__)
for (; i + 15 < nb; i += 16) {
__m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
__m256i vexp = _mm256_and_si256(v, _mm256_set1_epi16(0x7c00));
__m256i cmp = _mm256_cmpeq_epi16(vexp, _mm256_set1_epi16(0x7c00));
int mask = _mm256_movemask_epi8(cmp);
if (mask) {
for (size_t j = 0; j < 16; ++j) {
if (!validate_fp16(f[i + j], i + j)) {
return false;
}
}
GGML_UNREACHABLE();
}
}
#elif defined(__ARM_NEON)
for (; i + 7 < nb; i += 8) {
uint16x8_t v = vld1q_u16(f + i);
uint16x8_t vexp = vandq_u16(v, vdupq_n_u16(0x7c00));
uint16x8_t cmp = vceqq_u16(vexp, vdupq_n_u16(0x7c00));
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(cmp, 4)), 0);
if (mask) {
for (size_t j = 0; j < 8; ++j) {
if (!validate_fp16(f[i + j], i + j)) {
return false;
}
}
GGML_UNREACHABLE();
}
}
#endif
for (; i < nb; ++i) {
if (!validate_fp16(f[i], i)) {
return false;
}
}
} break;
case GGML_TYPE_F32:
{
const float * f = (const float *) data;
size_t i = 0;
#if defined(__AVX2__)
for (; i + 7 < nb; i += 8) {
__m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
__m256i vexp = _mm256_and_si256(v, _mm256_set1_epi32(0x7f800000));
__m256i cmp = _mm256_cmpeq_epi32(vexp, _mm256_set1_epi32(0x7f800000));
int mask = _mm256_movemask_epi8(cmp);
if (mask) {
for (size_t j = 0; j < 8; ++j) {
if (!validate_float(f[i + j], i + j)) {
return false;
}
}
GGML_UNREACHABLE();
}
}
#elif defined(__ARM_NEON)
for (; i + 3 < nb; i += 4) {
uint32x4_t v = vld1q_u32((const uint32_t *)f + i);
uint32x4_t vexp = vandq_u32(v, vdupq_n_u32(0x7f800000));
uint32x4_t cmp = vceqq_u32(vexp, vdupq_n_u32(0x7f800000));
uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(cmp, 8)), 0);
if (mask) {
for (size_t j = 0; j < 4; ++j) {
if (!validate_float(f[i + j], i + j)) {
return false;
}
}
GGML_UNREACHABLE();
}
}
#endif
for (; i < nb; ++i) {
if (!validate_float(f[i], i)) {
return false;
}
}
} break;
case GGML_TYPE_F64:
{
const double * f = (const double *) data;
for (size_t i = 0; i < nb; ++i) {
if (!validate_float(f[i], i)) {
return false;
}
}
} break;
case GGML_TYPE_Q4_0:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb);
} break;
case GGML_TYPE_Q4_1:
{
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_1, data, nb, d, m);
} break;
case GGML_TYPE_Q5_0:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_0, data, nb);
} break;
case GGML_TYPE_Q5_1:
{
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_1, data, nb, d, m);
} break;
case GGML_TYPE_Q8_0:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_q8_0, data, nb);
} break;
case GGML_TYPE_Q2_K:
{
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);
} break;
case GGML_TYPE_Q3_K:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_K, data, nb);
} break;
case GGML_TYPE_Q4_K:
{
#ifdef GGML_QKK_64
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d[0], d[1]);
#else
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d, dmin);
#endif
} break;
case GGML_TYPE_Q5_K:
{
#ifdef GGML_QKK_64
VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_K, data, nb);
#else
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_K, data, nb, d, dmin);
#endif
} break;
case GGML_TYPE_Q6_K:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_q6_K, data, nb);
} break;
case GGML_TYPE_Q8_K:
{
const block_q8_K * q = (const block_q8_K *) data;
for (size_t i = 0; i < nb; ++i) {
if (!validate_float(q[i].d, i)) {
return false;
}
}
} break;
case GGML_TYPE_IQ1_S:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
} break;
case GGML_TYPE_IQ1_M:
{
const block_iq1_m * q = (const block_iq1_m *) data;
for (size_t i = 0; i < nb; ++i) {
#if QK_K == 64
if (!validate_fp16(q[i].d, i)) {
return false;
}
#else
iq1m_scale_t scale;
const uint16_t * sc = (const uint16_t *)q[i].scales;
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
if (!validate_fp16(scale.f16, i)) {
return false;
}
#endif
}
} break;
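The iq1_m branch above rebuilds its fp16 super-block scale from the top nibble of each of the four packed 16-bit scale words. A tiny standalone example of that unpacking, using an arbitrary assumed bit pattern:

#include <cstdint>
#include <cstdio>

// Reassemble a 16-bit value from the top nibbles of four packed scale words,
// mirroring the expression used in the IQ1_M case. The input values are made up.
int main() {
    const uint16_t sc[4] = { 0x1234, 0x5678, 0x9abc, 0xdef0 };
    const uint16_t u16 = (uint16_t)((sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000));
    printf("packed nibbles -> 0x%04x\n", u16); // top nibbles 1, 5, 9, d -> 0xd951
    return 0;
}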
case GGML_TYPE_IQ2_XXS:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xxs, data, nb);
} break;
case GGML_TYPE_IQ2_XS:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xs, data, nb);
} break;
case GGML_TYPE_IQ2_S:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_s, data, nb);
} break;
case GGML_TYPE_IQ3_XXS:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_xxs, data, nb);
} break;
case GGML_TYPE_IQ3_S:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_s, data, nb);
} break;
case GGML_TYPE_IQ4_XS:
#if QK_K != 64
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_xs, data, nb);
} break;
#endif
// with QK_K == 64, iq4_xs is iq4_nl
case GGML_TYPE_IQ4_NL:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
} break;
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_I64:
// nothing to validate
break;
default:
{
fprintf(stderr, "%s: invalid type %d\n", __func__, type);
return false;
}
}
return true;
}
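For reference, a minimal sketch of how a caller might use the new validation entry point (only ggml_validate_row_data() itself comes from this change; the helper name and the surrounding loading code are assumptions for illustration):

#include <stdbool.h>
#include <stdio.h>
#include "ggml.h"

// Hypothetical helper: reject a tensor buffer whose rows contain NaN/Inf or
// otherwise malformed block data before it is handed to the graph.
static bool check_tensor_data(enum ggml_type type, const void * data, size_t nbytes, const char * name) {
    if (!ggml_validate_row_data(type, data, nbytes)) {
        fprintf(stderr, "tensor '%s' failed validation\n", name);
        return false;
    }
    return true;
}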


@ -13416,11 +13416,16 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
    version += std::to_string(prop.get_minor_version());

    device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
+   std::string name = std::string(prop.get_name());
+   name = std::regex_replace(name, std::regex("\\(R\\)"), "");
+   name = std::regex_replace(name, std::regex("\\(TM\\)"), "");

-   fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
-           prop.get_name(), version.c_str(), prop.get_max_compute_units(),
+   auto global_mem_size = prop.get_global_mem_size()/1000000;
+   fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
+           name.c_str(), version.c_str(), prop.get_max_compute_units(),
            prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
-           prop.get_global_mem_size());
+           global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
}

void ggml_backend_sycl_print_sycl_devices() {
@ -13428,9 +13433,10 @@ void ggml_backend_sycl_print_sycl_devices() {
    int device_count = dpct::dev_mgr::instance().device_count();
    std::map<std::string, size_t> DeviceNums;
    fprintf(stderr, "found %d SYCL devices:\n", device_count);
-   fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
-   fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
-   fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
+   fprintf(stderr, "| | | | |Max | |Max |Global | |\n");
+   fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n");
+   fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n");
+   fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
    for (int id = 0; id < device_count; ++id) {
        sycl::device device = dpct::dev_mgr::instance().get_device(id);
        sycl::backend backend = device.get_backend();
@ -14738,7 +14744,12 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+   const ggml_tensor * src2 = dst->src[2];
+#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+   GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional

    const int64_t ne00 = src0->ne[0];
    const int64_t nrows_x = ggml_nrows(src0);
@ -14754,7 +14765,6 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
    float * src2_dd = nullptr;
    sycl_pool_alloc<float> src2_f;

-   ggml_tensor * src2 = dst->src[2];
    const bool use_src2 = src2 != nullptr;

    if (use_src2) {


@ -3178,6 +3178,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
        }
        return nullptr;
    case GGML_OP_SOFT_MAX:
+#pragma message("TODO: add ggml_vk_soft_max() F16 src1 and src2 support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
+       GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32);
+       GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32);
+
        if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && (src2 == nullptr || src2->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_soft_max_f32;
        }

ggml.c

@ -951,7 +951,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i])
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
@ -977,7 +977,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
@ -1046,7 +1046,7 @@ do { \
// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
// so F16C guard isn't required
-#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
@ -1144,7 +1144,7 @@ do { \
#if defined(__F16C__)
// the _mm256_cvt intrinsics require F16C
-#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
+#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
#else
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
@ -1662,6 +1662,37 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
#endif
}
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
#if defined(GGML_SIMD)
const int np = (n & ~(GGML_F16_STEP - 1));
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
GGML_F16_VEC ax[GGML_F16_ARR];
GGML_F16_VEC ay[GGML_F16_ARR];
for (int i = 0; i < np; i += GGML_F16_STEP) {
for (int j = 0; j < GGML_F16_ARR; j++) {
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
}
}
// leftovers
for (int i = np; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#endif
}
// xs and vs are byte strides of x and v
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
@ -1746,6 +1777,35 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#endif
}
inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
#if defined(GGML_SIMD)
const int np = (n & ~(GGML_F16_STEP - 1));
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
GGML_F16_VEC ay[GGML_F16_ARR];
for (int i = 0; i < np; i += GGML_F16_STEP) {
for (int j = 0; j < GGML_F16_ARR; j++) {
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
}
}
// leftovers
for (int i = np; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
}
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
}
#endif
}
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
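For reference, the two F16 helpers added above implement the following elementwise updates on F16-stored vectors (the scalar fallback converts through F32 and rounds back to F16):

\[
\texttt{ggml\_vec\_mad\_f16}: \; y_i \leftarrow y_i + v\,x_i,
\qquad
\texttt{ggml\_vec\_scale\_f16}: \; y_i \leftarrow v\,y_i .
\]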
@ -2000,6 +2060,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "LEAKY_RELU",
    "FLASH_ATTN",
+   "FLASH_ATTN_EXT",
    "FLASH_FF",
    "FLASH_ATTN_BACK",
    "SSM_CONV",
@ -2026,7 +2087,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "CROSS_ENTROPY_LOSS_BACK",
};

-static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
+static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@ -2090,6 +2151,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "leaky_relu(x)",
    "flash_attn(x)",
+   "flash_attn_ext(x)",
    "flash_ff(x)",
    "flash_attn_back(x)",
    "ssm_conv(x)",
@ -2116,7 +2178,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "cross_entropy_loss_back(x,y)",
};

-static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
+static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -4559,6 +4621,8 @@ struct ggml_tensor * ggml_mul_mat(
void ggml_mul_mat_set_prec(
        struct ggml_tensor * a,
        enum ggml_prec prec) {
+   GGML_ASSERT(a->op == GGML_OP_MUL_MAT);
+
    const int32_t prec_i32 = (int32_t) prec;

    ggml_set_op_params_i32(a, 0, prec_i32);
@ -5397,17 +5461,23 @@ static struct ggml_tensor * ggml_soft_max_impl(
    GGML_ASSERT(ggml_is_contiguous(a));

    if (mask) {
+       GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32);
        GGML_ASSERT(ggml_is_contiguous(mask));
        GGML_ASSERT(ggml_is_matrix(mask));
-       GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+       GGML_ASSERT(mask->ne[0] == a->ne[0]);
+       GGML_ASSERT(mask->ne[1] >= a->ne[1]);
    }

    if (pos) {
        GGML_ASSERT(ggml_is_vector(pos));
-       GGML_ASSERT(pos->type == GGML_TYPE_F32);
+       GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
        GGML_ASSERT(pos->ne[0] == a->ne[0]);
    }

+   if (pos && mask) {
+       GGML_ASSERT(pos->type == mask->type);
+   }
+
    if (max_bias > 0.0f) {
        GGML_ASSERT(pos);
    }
@ -6216,6 +6286,59 @@ struct ggml_tensor * ggml_flash_attn(
    return result;
}
// ggml_flash_attn_ext
struct ggml_tensor * ggml_flash_attn_ext(
struct ggml_context * ctx,
struct ggml_tensor * q,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * mask,
float scale) {
GGML_ASSERT(ggml_can_mul_mat(k, q));
// TODO: check if vT can be multiplied by (k*qT)
if (mask) {
GGML_ASSERT(ggml_is_contiguous(mask));
GGML_ASSERT(mask->ne[2] == 1);
GGML_ASSERT(mask->ne[3] == 1);
GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
"the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
//GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
}
bool is_node = false;
if (q->grad || k->grad || v->grad) {
is_node = true;
}
// permute(0, 2, 1, 3)
int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
float params[] = { scale };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_FLASH_ATTN_EXT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = q;
result->src[1] = k;
result->src[2] = v;
result->src[3] = mask;
return result;
}
void ggml_flash_attn_ext_set_prec(
struct ggml_tensor * a,
enum ggml_prec prec) {
GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
const int32_t prec_i32 = (int32_t) prec;
ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
}
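A hypothetical usage sketch for the new fused attention op follows. The tensor shapes mirror the comments added to ggml.h in this change; the context `ctx`, the dimension variables and the tensor creation are assumptions made only for illustration:

// Assumed dimensions and an existing struct ggml_context * ctx; sqrtf() needs <math.h>.
const int64_t head_dim  = 128;
const int64_t n_head    = 32;
const int64_t n_head_kv = 32;
const int64_t n_batch   = 8;
const int64_t n_kv      = 1024;

// q is F32; k/v are F16 (as required by the CPU kernel above); the mask is padded to GGML_KQ_MASK_PAD
struct ggml_tensor * q    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_batch, n_head,    1);
struct ggml_tensor * k    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, head_dim, n_kv,    n_head_kv, 1);
struct ggml_tensor * v    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, head_dim, n_kv,    n_head_kv, 1); // not transposed
struct ggml_tensor * mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_kv, GGML_PAD(n_batch, GGML_KQ_MASK_PAD));

// result: [head_dim, n_head, n_batch, 1], i.e. already permuted
struct ggml_tensor * kqv = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf((float) head_dim));
ggml_flash_attn_ext_set_prec(kqv, GGML_PREC_F32);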
// ggml_flash_ff
struct ggml_tensor * ggml_flash_ff(
@ -12255,7 +12378,7 @@ static void ggml_compute_forward_soft_max_f32(

    GGML_TENSOR_UNARY_OP_LOCALS

-   const int64_t ne11 = src1 ? src1->ne[1] : 1;
+   //const int64_t ne11 = src1 ? src1->ne[1] : 1;

    // TODO: is this supposed to be ceil instead of floor?
    //   https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
@ -12278,19 +12401,31 @@ static void ggml_compute_forward_soft_max_f32(
    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;

    // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
-   float * pos = src2 ? (float *) src2->data : src0->data;
+   ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
+   float       * pos_f32 = src2 ? (float       *) src2->data : src0->data;
+
+   const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);

    for (int i1 = ir0; i1 < ir1; i1++) {
        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);

        // broadcast the mask across rows
-       float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
+       ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
+       float       * mp_f32 = src1 ? (float       *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;

        ggml_vec_cpy_f32  (nc, wp, sp);
        ggml_vec_scale_f32(nc, wp, scale);
-       if (mp) {
-           ggml_vec_acc_f32(nc, wp, mp);
+       if (mp_f32) {
+           if (use_f16) {
+               for (int i = 0; i < nc; ++i) {
+                   wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
+               }
+           } else {
+               for (int i = 0; i < nc; ++i) {
+                   wp[i] += mp_f32[i];
+               }
+           }
        }

        // ALiBi bias
@ -12298,8 +12433,14 @@ static void ggml_compute_forward_soft_max_f32(
            const uint32_t h = (i1/ne01)%ne02; // head
            const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);

-           for (int i = 0; i < nc; i++) {
-               wp[i] = wp[i] + slope*pos[i];
+           if (use_f16) {
+               for (int i = 0; i < nc; ++i) {
+                   wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
+               }
+           } else {
+               for (int i = 0; i < nc; ++i) {
+                   wp[i] += slope*pos_f32[i];
+               }
            }
        }
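In both precision paths the kernel assembles each softmax input row as

\[ w_i = \mathrm{scale}\cdot s_i + m_i + \mathrm{slope}\cdot p_i , \]

where \(m_i\) is the optional mask row, \(p_i\) the optional ALiBi position vector (converted from F16 when needed), and the per-head slope is \(m_0^{\,h+1}\) for \(h < n\_head\_log2\) and \(m_1^{\,2(h - n\_head\_log2)+1}\) otherwise, with \(m_0\) and \(m_1\) the ALiBi base factors computed earlier in the function (not shown in this hunk).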
@ -14569,6 +14710,198 @@ static void ggml_compute_forward_flash_attn(
} }
} }
// ggml_compute_forward_flash_attn_ext
static void ggml_compute_forward_flash_attn_ext_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * q,
const struct ggml_tensor * k,
const struct ggml_tensor * v,
const struct ggml_tensor * mask,
struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith;
const int nth = params->nth;
const int64_t D = neq0;
const int64_t N = neq1;
GGML_ASSERT(ne0 == D);
GGML_ASSERT(ne2 == N);
GGML_ASSERT(nbq0 == sizeof(float));
GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
GGML_ASSERT(neq0 == D);
GGML_ASSERT(nek0 == D);
GGML_ASSERT(nev0 == D);
GGML_ASSERT(neq1 == N);
GGML_ASSERT(nev0 == D);
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
// broadcast factors
const int64_t rk2 = neq2/nek2;
const int64_t rk3 = neq3/nek3;
const int64_t rv2 = neq2/nev2;
const int64_t rv3 = neq3/nev3;
if (params->type == GGML_TASK_TYPE_INIT) {
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// parallelize by q rows using ggml_vec_dot_f32
// total rows in q
const int nr = neq1*neq2*neq3;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
float scale = 1.0f;
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
// loop over n_batch and n_head
for (int ir = ir0; ir < ir1; ++ir) {
// q indices
const int iq3 = ir/(neq2*neq1);
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
float S = 0.0f;
float M = -INFINITY;
float * V32 = (float *) params->wdata + ith*(2*D + CACHE_LINE_SIZE_F32);
ggml_fp16_t * Q16 = (ggml_fp16_t *) (V32); // reuse memory
ggml_fp16_t * V16 = (ggml_fp16_t *) (V32 + D);
memset(V16, 0, D*sizeof(ggml_fp16_t));
const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
// k indices
const int ik3 = iq3 / rk3;
const int ik2 = iq2 / rk2;
// v indices
const int iv3 = iq3 / rv3;
const int iv2 = iq2 / rv2;
// online softmax / attention
// loop over n_kv and n_head_kv
// ref: https://arxiv.org/pdf/2112.05682.pdf
for (int64_t ic = 0; ic < nek1; ++ic) {
const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
if (mv == -INFINITY) {
continue;
}
float s;
// convert Q to F16 in V32
{
const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
for (int64_t d = 0; d < D; ++d) {
Q16[d] = GGML_FP32_TO_FP16(pq[d]);
}
}
ggml_vec_dot_f16(D,
&s, 0,
(ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
Q16, 0, 1);
s = s*scale + mv;
const float Mold = M;
float ms = 1.0f;
float vs = 1.0f;
if (s > M) {
M = s;
ms = expf(Mold - M);
// V = V*expf(Mold - M)
ggml_vec_scale_f16(D, V16, ms);
} else {
vs = expf(s - M);
}
const ggml_fp16_t * v16 = (const ggml_fp16_t *) ((char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
// V += v*expf(s - M)
ggml_vec_mad_f16(D, V16, v16, vs);
S = S*ms + vs;
}
// V /= S
for (int64_t d = 0; d < D; ++d) {
V32[d] = GGML_FP16_TO_FP32(V16[d])/S;
}
// dst indices
const int i1 = iq1;
const int i2 = iq2;
const int i3 = iq3;
// original
//memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
// permute(0, 2, 1, 3)
memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, V32, nb1);
}
}
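The kernel above follows the streaming ("online") softmax from the referenced paper: for each key score \(s\) it keeps a running maximum \(M\), an unnormalized accumulator \(V\) (V16) and a normalizer \(S\), updated as

\[
M' = \max(M, s), \qquad
V \leftarrow V\,e^{M-M'} + v\,e^{s-M'}, \qquad
S \leftarrow S\,e^{M-M'} + e^{s-M'},
\]

and the final row output is \(V/S\) once all keys have been processed.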
static void ggml_compute_forward_flash_attn_ext(
const struct ggml_compute_params * params,
const struct ggml_tensor * q,
const struct ggml_tensor * k,
const struct ggml_tensor * v,
const struct ggml_tensor * mask,
struct ggml_tensor * dst) {
switch (dst->op_params[1]) {
case GGML_PREC_DEFAULT:
case GGML_PREC_F32:
{
// uses F32 accumulators
ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_flash_ff // ggml_compute_forward_flash_ff
static void ggml_compute_forward_flash_ff_f16( static void ggml_compute_forward_flash_ff_f16(
@ -16376,6 +16709,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
const bool masked = t != 0; const bool masked = t != 0;
ggml_compute_forward_flash_attn(params, masked, tensor); ggml_compute_forward_flash_attn(params, masked, tensor);
} break; } break;
case GGML_OP_FLASH_ATTN_EXT:
{
ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
} break;
case GGML_OP_FLASH_FF: case GGML_OP_FLASH_FF:
{ {
ggml_compute_forward_flash_ff(params, tensor); ggml_compute_forward_flash_ff(params, tensor);
@ -17388,6 +17725,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
GGML_ASSERT(false); // TODO: not implemented GGML_ASSERT(false); // TODO: not implemented
} break; } break;
case GGML_OP_FLASH_ATTN: case GGML_OP_FLASH_ATTN:
case GGML_OP_FLASH_ATTN_EXT:
{ {
struct ggml_tensor * flash_grad = NULL; struct ggml_tensor * flash_grad = NULL;
if (src0->grad || src1->grad || tensor->src[2]->grad) { if (src0->grad || src1->grad || tensor->src[2]->grad) {
@ -18160,6 +18498,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
n_tasks = n_threads; n_tasks = n_threads;
} break; } break;
case GGML_OP_FLASH_ATTN: case GGML_OP_FLASH_ATTN:
case GGML_OP_FLASH_ATTN_EXT:
{ {
n_tasks = n_threads; n_tasks = n_threads;
} break; } break;
@ -18563,6 +18902,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
} }
} break; } break;
case GGML_OP_FLASH_ATTN_EXT:
{
const int64_t ne00 = node->src[0]->ne[0]; // D
cur = 2*sizeof(float)*ne00*n_tasks; // 2x head size
} break;
case GGML_OP_FLASH_FF: case GGML_OP_FLASH_FF:
{ {
if (node->src[1]->type == GGML_TYPE_F32) { if (node->src[1]->type == GGML_TYPE_F32) {
@ -20614,7 +20959,7 @@ static void gguf_free_kv(struct gguf_kv * kv) {
}

struct gguf_context * gguf_init_empty(void) {
-   struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+   struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));

    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
    ctx->header.version = GGUF_VERSION;
@ -20659,7 +21004,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    bool ok = true;

-   struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+   struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));

    // read the header
    {
@ -20696,9 +21041,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // read the kv pairs
    {
-       ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
-       for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
+       const uint64_t n_kv = ctx->header.n_kv;
+
+       // header.n_kv will hold the actual value of pairs that were successfully read in the loop below
+       ctx->header.n_kv = 0;
+       ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv));
+
+       for (uint64_t i = 0; i < n_kv; ++i) {
            struct gguf_kv * kv = &ctx->kv[i];

            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@ -20747,7 +21096,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
return NULL; return NULL;
} }
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type)); kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset); ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
} break; } break;
@ -20761,7 +21110,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
return NULL; return NULL;
} }
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str)); kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str));
for (uint64_t j = 0; j < kv->value.arr.n; ++j) { for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
@ -20777,6 +21126,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
if (!ok) { if (!ok) {
break; break;
} }
ctx->header.n_kv++;
} }
if (!ok) { if (!ok) {
@ -20788,8 +21139,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    }

    // read the tensor infos
-   {
-       ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+   if (ctx->header.n_tensors > 0) {
+       ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];
@ -20810,8 +21161,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
// TODO: return an error instead of crashing with GGML_ASSERT
gguf_tensor_info_sanitize(info); gguf_tensor_info_sanitize(info);
// make sure there is no duplicated tensor names
for (uint64_t j = 0; j < i; ++j) {
if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
ok = false;
}
}
if (!ok) { if (!ok) {
fprintf(stderr, "%s: failed to read tensor info\n", __func__); fprintf(stderr, "%s: failed to read tensor info\n", __func__);
fclose(file); fclose(file);
@ -20980,7 +21340,7 @@ void gguf_free(struct gguf_context * ctx) {
GGML_FREE(ctx->infos); GGML_FREE(ctx->infos);
} }
GGML_ALIGNED_FREE(ctx); GGML_FREE(ctx);
} }
const char * gguf_type_name(enum gguf_type type) { const char * gguf_type_name(enum gguf_type type) {
@ -21291,7 +21651,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
ctx->kv[idx].type = GGUF_TYPE_ARRAY; ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = type; ctx->kv[idx].value.arr.type = type;
ctx->kv[idx].value.arr.n = n; ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type)); ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type)); memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
} }
@ -21301,7 +21661,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
ctx->kv[idx].type = GGUF_TYPE_ARRAY; ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING; ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
ctx->kv[idx].value.arr.n = n; ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str)); ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i]; struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
str->n = strlen(data[i]); str->n = strlen(data[i]);
@ -21328,7 +21688,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
case GGUF_TYPE_ARRAY: case GGUF_TYPE_ARRAY:
{ {
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *)); const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
} }
@ -21348,6 +21708,10 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
void gguf_add_tensor( void gguf_add_tensor(
struct gguf_context * ctx, struct gguf_context * ctx,
const struct ggml_tensor * tensor) { const struct ggml_tensor * tensor) {
if (gguf_find_tensor(ctx, tensor->name) != -1) {
GGML_ASSERT(false && "duplicated tensor name");
}
const int idx = ctx->header.n_tensors; const int idx = ctx->header.n_tensors;
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
@ -21416,7 +21780,7 @@ struct gguf_buf {
static struct gguf_buf gguf_buf_init(size_t size) { static struct gguf_buf gguf_buf_init(size_t size) {
struct gguf_buf buf = { struct gguf_buf buf = {
/*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size), /*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
/*buf.size =*/ size, /*buf.size =*/ size,
/*buf.offset =*/ 0, /*buf.offset =*/ 0,
}; };

ggml.h

@ -475,6 +475,7 @@ extern "C" {
GGML_OP_LEAKY_RELU, GGML_OP_LEAKY_RELU,
GGML_OP_FLASH_ATTN, GGML_OP_FLASH_ATTN,
GGML_OP_FLASH_ATTN_EXT,
GGML_OP_FLASH_FF, GGML_OP_FLASH_FF,
GGML_OP_FLASH_ATTN_BACK, GGML_OP_FLASH_ATTN_BACK,
GGML_OP_SSM_CONV, GGML_OP_SSM_CONV,
@ -762,6 +763,8 @@ extern "C" {
// use this to compute the memory overhead of a tensor // use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void); GGML_API size_t ggml_tensor_overhead(void);
GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
// main // main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@ -1720,6 +1723,25 @@ extern "C" {
struct ggml_tensor * v, struct ggml_tensor * v,
bool masked); bool masked);
#define GGML_KQ_MASK_PAD 32
// q: [n_embd, n_batch, n_head, 1]
// k: [n_embd, n_kv, n_head_kv, 1]
// v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
// res: [n_embd, n_head, n_batch, 1] !! permuted !!
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
struct ggml_context * ctx,
struct ggml_tensor * q,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * mask,
float scale);
GGML_API void ggml_flash_attn_ext_set_prec(
struct ggml_tensor * a,
enum ggml_prec prec);
GGML_API struct ggml_tensor * ggml_flash_attn_back( GGML_API struct ggml_tensor * ggml_flash_attn_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * q, struct ggml_tensor * q,


@ -1,11 +1,14 @@
#!/usr/bin/env python #!/usr/bin/env python
import logging
import argparse import argparse
import asyncio import asyncio
import os import os
import sys import sys
from tempfile import gettempdir, NamedTemporaryFile from tempfile import gettempdir, NamedTemporaryFile
logger = logging.getLogger("ggml-vk-generate-shaders")
shader_f32 = """ shader_f32 = """
#define FLOAT_TYPE float #define FLOAT_TYPE float
""" """
@ -2498,7 +2501,7 @@ async def string_to_spv(name, code, defines, fp16=True):
stdout, stderr = await proc.communicate() stdout, stderr = await proc.communicate()
print(" ".join(cmd)) logger.info(" ".join(cmd))
if proc.returncode: if proc.returncode:
raise RuntimeError(f"{name=} {f.name=} {stdout=} {stderr=}") raise RuntimeError(f"{name=} {f.name=} {stdout=} {stderr=}")
@ -2507,7 +2510,7 @@ async def string_to_spv(name, code, defines, fp16=True):
cmd.extend([f"-D{key}={value}" for key, value in defines.items()]) cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
code_with_lines = "\n".join([f"{i + 1}: {line}" for i, line in enumerate(preprocessed_code.splitlines())]) code_with_lines = "\n".join([f"{i + 1}: {line}" for i, line in enumerate(preprocessed_code.splitlines())])
print(f"ERROR compiling {name}\n\n{code_with_lines}\n\n{error}") logger.error(f"cannot compile {name}\n\n{code_with_lines}\n\n{error}")
f.close() f.close()
os.remove(f.name) os.remove(f.name)
sys.exit(proc.returncode) sys.exit(proc.returncode)
@ -2520,7 +2523,7 @@ async def string_to_spv(name, code, defines, fp16=True):
async def main(): async def main():
print("ggml_vulkan: Generating and compiling shaders to SPIR-V") logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V")
tasks = [] tasks = []
@ -2768,9 +2771,12 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator") parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator")
parser.add_argument("--glslc", help="Path to glslc") parser.add_argument("--glslc", help="Path to glslc")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args() args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
if args.glslc: if args.glslc:
GLSLC = args.glslc GLSLC = args.glslc


@ -1,8 +1,10 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import logging
import sys import sys
from pathlib import Path from pathlib import Path
from gguf.gguf_reader import GGUFReader from gguf.gguf_reader import GGUFReader
logger = logging.getLogger("reader")
sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).parent.parent))
@ -18,28 +20,28 @@ def read_gguf_file(gguf_file_path):
reader = GGUFReader(gguf_file_path) reader = GGUFReader(gguf_file_path)
# List all key-value pairs in a columnized format # List all key-value pairs in a columnized format
print("Key-Value Pairs:") print("Key-Value Pairs:") # noqa: NP100
max_key_length = max(len(key) for key in reader.fields.keys()) max_key_length = max(len(key) for key in reader.fields.keys())
for key, field in reader.fields.items(): for key, field in reader.fields.items():
value = field.parts[field.data[0]] value = field.parts[field.data[0]]
print(f"{key:{max_key_length}} : {value}") print(f"{key:{max_key_length}} : {value}") # noqa: NP100
print("----") print("----") # noqa: NP100
# List all tensors # List all tensors
print("Tensors:") print("Tensors:") # noqa: NP100
tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}" tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100
print("-" * 80) print("-" * 80) # noqa: NP100
for tensor in reader.tensors: for tensor in reader.tensors:
shape_str = "x".join(map(str, tensor.shape)) shape_str = "x".join(map(str, tensor.shape))
size_str = str(tensor.n_elements) size_str = str(tensor.n_elements)
quantization_str = tensor.tensor_type.name quantization_str = tensor.tensor_type.name
print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100
if __name__ == '__main__': if __name__ == '__main__':
if len(sys.argv) < 2: if len(sys.argv) < 2:
print("Usage: reader.py <path_to_gguf_file>") logger.info("Usage: reader.py <path_to_gguf_file>")
sys.exit(1) sys.exit(1)
gguf_file_path = sys.argv[1] gguf_file_path = sys.argv[1]
read_gguf_file(gguf_file_path) read_gguf_file(gguf_file_path)


@ -1,6 +1,5 @@
from __future__ import annotations from __future__ import annotations
import sys
from enum import Enum, IntEnum, auto from enum import Enum, IntEnum, auto
from typing import Any from typing import Any
@ -72,6 +71,7 @@ class Keys:
class Tokenizer: class Tokenizer:
MODEL = "tokenizer.ggml.model" MODEL = "tokenizer.ggml.model"
PRE = "tokenizer.ggml.pre"
LIST = "tokenizer.ggml.tokens" LIST = "tokenizer.ggml.tokens"
TOKEN_TYPE = "tokenizer.ggml.token_type" TOKEN_TYPE = "tokenizer.ggml.token_type"
TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types
@ -853,8 +853,7 @@ class GGUFValueType(IntEnum):
return GGUFValueType.INT32 return GGUFValueType.INT32
# TODO: need help with 64-bit types in Python # TODO: need help with 64-bit types in Python
else: else:
print("Unknown type:", type(val)) raise ValueError(f"Unknown type: {type(val)}")
sys.exit()
# Note: Does not support GGML_QKK_64 # Note: Does not support GGML_QKK_64
@ -940,6 +939,7 @@ KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
# tokenization # tokenization
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST
KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES


@ -4,6 +4,7 @@
# #
from __future__ import annotations from __future__ import annotations
import logging
import os import os
from collections import OrderedDict from collections import OrderedDict
from typing import Any, Literal, NamedTuple, TypeVar, Union from typing import Any, Literal, NamedTuple, TypeVar, Union
@ -27,6 +28,7 @@ from gguf.constants import (
GGUFValueType, GGUFValueType,
) )
logger = logging.getLogger(__name__)
READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION] READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION]
@ -139,7 +141,12 @@ class GGUFReader:
    def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
        if field.name in self.fields:
-           raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
-       self.fields[field.name] = field
+           # TODO: add option to generate error on duplicate keys
+           # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
+           logger.warning(f'Duplicate key {field.name} at offset {field.offset}')
+           self.fields[field.name + '_{}'.format(field.offset)] = field
+       else:
+           self.fields[field.name] = field
        return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts)
@ -234,8 +241,14 @@ class GGUFReader:
def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
tensors = [] tensors = []
tensor_names = set() # keep track of name to prevent duplicated tensors
for field in fields: for field in fields:
_name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts _name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts
# check if there's any tensor having same name already in the list
tensor_name = str(bytes(name_data), encoding = 'utf-8')
if tensor_name in tensor_names:
raise ValueError(f'Found duplicated tensor with name {tensor_name}')
tensor_names.add(tensor_name)
ggml_type = GGMLQuantizationType(raw_dtype[0]) ggml_type = GGMLQuantizationType(raw_dtype[0])
n_elems = np.prod(dims) n_elems = np.prod(dims)
block_size, type_size = GGML_QUANT_SIZES[ggml_type] block_size, type_size = GGML_QUANT_SIZES[ggml_type]
@ -267,7 +280,7 @@ class GGUFReader:
item_count = n_bytes item_count = n_bytes
item_type = np.uint8 item_type = np.uint8
tensors.append(ReaderTensor( tensors.append(ReaderTensor(
name = str(bytes(name_data), encoding = 'utf-8'), name = tensor_name,
tensor_type = ggml_type, tensor_type = ggml_type,
shape = dims, shape = dims,
n_elements = n_elems, n_elements = n_elems,


@ -1,5 +1,6 @@
from __future__ import annotations from __future__ import annotations
import logging
import os import os
import shutil import shutil
import struct import struct
@ -24,6 +25,8 @@ from .constants import (
TokenType, TokenType,
) )
logger = logging.getLogger(__name__)
class WriterState(Enum): class WriterState(Enum):
EMPTY = auto() EMPTY = auto()
@ -63,10 +66,11 @@ class GGUFWriter:
self.kv_data_count = 0 self.kv_data_count = 0
self.ti_data = bytearray() self.ti_data = bytearray()
self.ti_data_count = 0 self.ti_data_count = 0
self.ti_names = set()
self.use_temp_file = use_temp_file self.use_temp_file = use_temp_file
self.temp_file = None self.temp_file = None
self.tensors = [] self.tensors = []
print("gguf: This GGUF file is for {0} Endian only".format( logger.info("gguf: This GGUF file is for {0} Endian only".format(
"Big" if self.endianess == GGUFEndian.BIG else "Little", "Big" if self.endianess == GGUFEndian.BIG else "Little",
)) ))
self.state = WriterState.EMPTY self.state = WriterState.EMPTY
@ -197,6 +201,10 @@ class GGUFWriter:
if self.state is not WriterState.EMPTY: if self.state is not WriterState.EMPTY:
raise ValueError(f'Expected output file to be empty, got {self.state}') raise ValueError(f'Expected output file to be empty, got {self.state}')
if name in self.ti_names:
raise ValueError(f'Duplicated tensor name {name}')
self.ti_names.add(name)
encoded_name = name.encode("utf8") encoded_name = name.encode("utf8")
self.ti_data += self._pack("Q", len(encoded_name)) self.ti_data += self._pack("Q", len(encoded_name))
self.ti_data += encoded_name self.ti_data += encoded_name
@ -422,6 +430,9 @@ class GGUFWriter:
def add_tokenizer_model(self, model: str) -> None: def add_tokenizer_model(self, model: str) -> None:
self.add_string(Keys.Tokenizer.MODEL, model) self.add_string(Keys.Tokenizer.MODEL, model)
def add_tokenizer_pre(self, pre: str) -> None:
self.add_string(Keys.Tokenizer.PRE, pre)
def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
self.add_array(Keys.Tokenizer.LIST, tokens) self.add_array(Keys.Tokenizer.LIST, tokens)


@ -1,13 +1,15 @@
from __future__ import annotations from __future__ import annotations
import logging
import json import json
import os import os
import sys
from pathlib import Path from pathlib import Path
from typing import Any, Callable from typing import Any, Callable
from .gguf_writer import GGUFWriter from .gguf_writer import GGUFWriter
logger = logging.getLogger(__name__)
class SpecialVocab: class SpecialVocab:
merges: list[str] merges: list[str]
@ -40,38 +42,29 @@ class SpecialVocab:
def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
if self.merges: if self.merges:
if not quiet: if not quiet:
print(f'gguf: Adding {len(self.merges)} merge(s).') logger.info(f'Adding {len(self.merges)} merge(s).')
gw.add_token_merges(self.merges) gw.add_token_merges(self.merges)
elif self.load_merges: elif self.load_merges:
print( logger.warning('Adding merges requested but no merges found, output may be non-functional.')
'gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.',
file = sys.stderr,
)
for typ, tokid in self.special_token_ids.items(): for typ, tokid in self.special_token_ids.items():
id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None) id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
if id_handler is None: if id_handler is None:
print( logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping')
f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping',
file = sys.stderr,
)
continue continue
if not quiet: if not quiet:
print(f'gguf: Setting special token type {typ} to {tokid}') logger.info(f'Setting special token type {typ} to {tokid}')
id_handler(tokid) id_handler(tokid)
for typ, value in self.add_special_token.items(): for typ, value in self.add_special_token.items():
add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None) add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
if add_handler is None: if add_handler is None:
print( logger.warning(f'No handler for add_{typ}_token with value {value} - skipping')
f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping',
file = sys.stderr,
)
continue continue
if not quiet: if not quiet:
print(f'gguf: Setting add_{typ}_token to {value}') logger.info(f'Setting add_{typ}_token to {value}')
add_handler(value) add_handler(value)
if self.chat_template is not None: if self.chat_template is not None:
if not quiet: if not quiet:
print(f'gguf: Setting chat_template to {self.chat_template}') logger.info(f'Setting chat_template to {self.chat_template}')
gw.add_chat_template(self.chat_template) gw.add_chat_template(self.chat_template)
def _load(self, path: Path) -> None: def _load(self, path: Path) -> None:
@ -99,10 +92,7 @@ class SpecialVocab:
continue continue
parts = line.split(None, 3) parts = line.split(None, 3)
if len(parts) != 2: if len(parts) != 2:
print( logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring')
f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring',
file = sys.stderr,
)
continue continue
merges.append(f'{parts[0]} {parts[1]}') merges.append(f'{parts[0]} {parts[1]}')
self.merges = merges self.merges = merges
@ -118,10 +108,7 @@ class SpecialVocab:
return return
self.special_token_ids[typ] = tid self.special_token_ids[typ] = tid
return return
print( logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
file = sys.stderr,
)
def _try_load_from_tokenizer_json(self, path: Path) -> bool: def _try_load_from_tokenizer_json(self, path: Path) -> bool:
tokenizer_file = path / 'tokenizer.json' tokenizer_file = path / 'tokenizer.json'
@ -144,10 +131,7 @@ class SpecialVocab:
if chat_template is None or isinstance(chat_template, (str, list)): if chat_template is None or isinstance(chat_template, (str, list)):
self.chat_template = chat_template self.chat_template = chat_template
else: else:
print( logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring')
f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring',
file = sys.stderr
)
for typ in self.special_token_types: for typ in self.special_token_types:
add_entry = tokenizer_config.get(f'add_{typ}_token') add_entry = tokenizer_config.get(f'add_{typ}_token')
if isinstance(add_entry, bool): if isinstance(add_entry, bool):


@ -1,9 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from __future__ import annotations from __future__ import annotations
import logging
import argparse import argparse
import os import os
import sys import sys
from tqdm import tqdm
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
@ -14,6 +16,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /
import gguf import gguf
logger = logging.getLogger("gguf-convert-endian")
def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None: def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None:
if np.uint32(1) == np.uint32(1).newbyteorder("<"): if np.uint32(1) == np.uint32(1).newbyteorder("<"):
@ -29,11 +33,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
else: else:
file_endian = host_endian file_endian = host_endian
order = host_endian if args.order == "native" else args.order order = host_endian if args.order == "native" else args.order
print(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian") logger.info(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian")
if file_endian == order: if file_endian == order:
print(f"* File is already {order.upper()} endian. Nothing to do.") logger.info(f"* File is already {order.upper()} endian. Nothing to do.")
sys.exit(0) sys.exit(0)
print("* Checking tensors for conversion compatibility") logger.info("* Checking tensors for conversion compatibility")
for tensor in reader.tensors: for tensor in reader.tensors:
if tensor.tensor_type not in ( if tensor.tensor_type not in (
gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F32,
@ -41,51 +45,64 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
gguf.GGMLQuantizationType.Q8_0, gguf.GGMLQuantizationType.Q8_0,
): ):
raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}") raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
print(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}") logger.info(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}")
if args.dry_run: if args.dry_run:
return return
print("\n*** Warning *** Warning *** Warning **") logger.warning("*** Warning *** Warning *** Warning **")
print("* This conversion process may damage the file. Ensure you have a backup.") logger.warning("* This conversion process may damage the file. Ensure you have a backup.")
if order != host_endian: if order != host_endian:
print("* Requested endian differs from host, you will not be able to load the model on this machine.") logger.warning("* Requested endian differs from host, you will not be able to load the model on this machine.")
print("* The file will be modified immediately, so if conversion fails or is interrupted") logger.warning("* The file will be modified immediately, so if conversion fails or is interrupted")
print("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:") logger.warning("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:")
response = input("YES, I am sure> ") response = input("YES, I am sure> ")
if response != "YES": if response != "YES":
print("You didn't enter YES. Okay then, see ya!") logger.warning("You didn't enter YES. Okay then, see ya!")
sys.exit(0) sys.exit(0)
print(f"\n* Converting fields ({len(reader.fields)})") logger.info(f"* Converting fields ({len(reader.fields)})")
for idx, field in enumerate(reader.fields.values()): for idx, field in enumerate(reader.fields.values()):
print(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}") logger.info(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}")
for part in field.parts: for part in field.parts:
part.byteswap(inplace=True) part.byteswap(inplace=True)
print(f"\n* Converting tensors ({len(reader.tensors)})") logger.info(f"* Converting tensors ({len(reader.tensors)})")
-   for idx, tensor in enumerate(reader.tensors):
-       print(
-           f" - {idx:4}: Converting tensor {repr(tensor.name)}, type={tensor.tensor_type.name}, "
-           f"elements={tensor.n_elements}... ",
-           end="",
-       )
-       tensor_type = tensor.tensor_type
-       for part in tensor.field.parts:
-           part.byteswap(inplace=True)
-       if tensor_type != gguf.GGMLQuantizationType.Q8_0:
-           tensor.data.byteswap(inplace=True)
-           print()
-           continue
-       # A Q8_0 block consists of a f16 delta followed by 32 int8 quants, so 34 bytes
-       block_size = 34
-       n_blocks = len(tensor.data) // block_size
-       for block_num in range(n_blocks):
-           block_offs = block_num * block_size
-           # I know I said f16, but it doesn't matter here - any simple 16 bit type works.
-           delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
-           delta.byteswap(inplace=True)
-           if block_num % 100000 == 0:
-               print(f"[{(n_blocks - block_num) // 1000}K]", end="")
-               sys.stdout.flush()
-       print()
-   print("* Completion")
+   for idx, tensor in enumerate(pbar := tqdm(reader.tensors, desc="Converting tensor")):
+       log_message = (
+           f"Converting tensor {repr(tensor.name)}, "
+           f"type={tensor.tensor_type.name}, "
+           f"elements={tensor.n_elements} "
+       )
+
+       # Byte-swap each part of the tensor's field
+       for part in tensor.field.parts:
+           part.byteswap(inplace=True)
+
+       # Byte-swap tensor data if necessary
+       if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0:
+           # Handle Q8_0 tensor blocks (block_q8_0)
+           # Specific handling of block_q8_0 is required.
+           # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.
+           block_size = 34  # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>
+           n_blocks = len(tensor.data) // block_size
+           for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
+               block_offs = block_num * block_size
+
+               # Byte-Swap f16 sized delta field
+               delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
+               delta.byteswap(inplace=True)
+
+               # Byte-Swap Q8 weights
+               if block_num % 100000 == 0:
+                   inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
+
+       else:
+           # Handle other tensor types
+           tensor.data.byteswap(inplace=True)
+
+       pbar.set_description(log_message)
+
+   logger.info("* Completion")
def main() -> None: def main() -> None:
@ -102,8 +119,13 @@ def main() -> None:
        "--dry-run", action="store_true",
        help="Don't actually change anything",
    )
+   parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
+
    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
-   print(f'* Loading: {args.model}')
+
+   logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+   logger.info(f'* Loading: {args.model}')
    reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+')
    convert_byteorder(reader, args)
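For reference, the 34-byte block layout that the byte-swapping loop above assumes corresponds to ggml's Q8_0 block. A sketch of the equivalent C definition follows (shown only to make the layout explicit; field names follow ggml's conventions, and the 16-bit scale is modelled here as a plain uint16_t):

#include <stdint.h>

#define QK8_0 32

// Only the 16-bit scale is endian-sensitive; the 32 int8 quants are single bytes.
typedef struct {
    uint16_t d;          // f16 delta (scaling factor)
    int8_t   qs[QK8_0];  // quantized values
} block_q8_0;            // 2 + 32 = 34 bytes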


@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 from __future__ import annotations

+import logging
 import argparse
 import os
 import sys
@ -15,6 +16,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /
 from gguf import GGUFReader, GGUFValueType  # noqa: E402

+logger = logging.getLogger("gguf-dump")
+

 def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
     host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG'
@ -29,8 +32,8 @@ def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
 # please see the comments in the modify_gguf.py example.
 def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
     host_endian, file_endian = get_file_host_endian(reader)
-    print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.')
-    print(f'\n* Dumping {len(reader.fields)} key/value pair(s)')
+    print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.')  # noqa: NP100
+    print(f'* Dumping {len(reader.fields)} key/value pair(s)')  # noqa: NP100
     for n, field in enumerate(reader.fields.values(), 1):
         if not field.types:
             pretty_type = 'N/A'
@ -39,20 +42,21 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
             pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count
         else:
             pretty_type = str(field.types[-1].name)
-        print(f'  {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '')
+
+        log_message = f'  {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}'
         if len(field.types) == 1:
             curr_type = field.types[0]
             if curr_type == GGUFValueType.STRING:
-                print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '')
+                log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60]))
             elif field.types[0] in reader.gguf_scalar_to_np:
-                print(' = {0}'.format(field.parts[-1][0]), end = '')
-        print()
+                log_message += ' = {0}'.format(field.parts[-1][0])
+        print(log_message)  # noqa: NP100
     if args.no_tensors:
         return
-    print(f'\n* Dumping {len(reader.tensors)} tensor(s)')
+    print(f'* Dumping {len(reader.tensors)} tensor(s)')  # noqa: NP100
     for n, tensor in enumerate(reader.tensors, 1):
         prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape)))
-        print(f'  {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}')
+        print(f'  {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}')  # noqa: NP100

 def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
@ -103,10 +107,17 @@ def main() -> None:
     parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
     parser.add_argument("--json", action="store_true", help="Produce JSON output")
     parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
+
     args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
     if not args.json:
-        print(f'* Loading: {args.model}')
+        logger.info(f'* Loading: {args.model}')
+
     reader = GGUFReader(args.model, 'r')

     if args.json:
         dump_metadata_json(reader, args)
     else:
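
Note: the pattern applied across these gguf scripts is a module-level logger plus a --verbose flag that selects the basicConfig level, with the remaining print() calls marked # noqa: NP100 where raw stdout output is intentional. A minimal standalone sketch of that setup (the tool name below is a placeholder, not an actual script):

# Minimal sketch of the --verbose logging setup used by these scripts.
import argparse
import logging

logger = logging.getLogger("example-tool")

def main() -> None:
    parser = argparse.ArgumentParser(description="demo of the --verbose logging setup")
    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    logger.info("info-level messages replace the old print() calls")
    logger.debug("debug-level messages only appear with --verbose")

if __name__ == "__main__":
    main()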

View file

@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import logging
 import argparse
 import os
 import sys
@ -10,6 +11,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /
 from gguf import GGUFReader  # noqa: E402

+logger = logging.getLogger("gguf-set-metadata")
+

 def minimal_example(filename: str) -> None:
     reader = GGUFReader(filename, 'r+')
@ -41,36 +44,33 @@ def minimal_example(filename: str) -> None:
 def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
     field = reader.get_field(args.key)
     if field is None:
-        print(f'! Field {repr(args.key)} not found', file = sys.stderr)
+        logger.error(f'! Field {repr(args.key)} not found')
         sys.exit(1)
     # Note that field.types is a list of types. This is because the GGUF
     # format supports arrays. For example, an array of UINT32 would
     # look like [GGUFValueType.ARRAY, GGUFValueType.UINT32]
     handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None
     if handler is None:
-        print(
-            f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}',
-            file = sys.stderr,
-        )
+        logger.error(f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}')
         sys.exit(1)
     current_value = field.parts[field.data[0]][0]
     new_value = handler(args.value)
-    print(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}')
+    logger.info(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}')
     if current_value == new_value:
-        print(f'- Key {repr(args.key)} already set to requested value {current_value}')
+        logger.info(f'- Key {repr(args.key)} already set to requested value {current_value}')
         sys.exit(0)
     if args.dry_run:
         sys.exit(0)
     if not args.force:
-        print('*** Warning *** Warning *** Warning **')
-        print('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.')
-        print('* Enter exactly YES if you are positive you want to proceed:')
+        logger.warning('*** Warning *** Warning *** Warning **')
+        logger.warning('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.')
+        logger.warning('* Enter exactly YES if you are positive you want to proceed:')
         response = input('YES, I am sure> ')
         if response != 'YES':
-            print("You didn't enter YES. Okay then, see ya!")
+            logger.info("You didn't enter YES. Okay then, see ya!")
             sys.exit(0)
     field.parts[field.data[0]][0] = new_value
-    print('* Field changed. Successful completion.')
+    logger.info('* Field changed. Successful completion.')

 def main() -> None:
@ -80,8 +80,13 @@ def main() -> None:
     parser.add_argument("value", type=str, help="Metadata value to set")
     parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything")
     parser.add_argument("--force", action="store_true", help="Change the field without confirmation")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
+
     args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
-    print(f'* Loading: {args.model}')
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    logger.info(f'* Loading: {args.model}')
     reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+')
     set_metadata(reader, args)
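
Note: for scripting the same edit without the CLI, the flow in set_metadata() boils down to locating the field, mapping its first type to a numpy scalar constructor via reader.gguf_scalar_to_np, and writing the new value through field.parts. A minimal sketch under those assumptions (the file path and key below are hypothetical examples):

# Minimal sketch of the simple-field edit flow outside the CLI.
from gguf import GGUFReader

reader = GGUFReader("model.gguf", "r+")  # 'r+' so the change is written back to the file
field = reader.get_field("tokenizer.ggml.bos_token_id")  # hypothetical key
if field is None or not field.types:
    raise SystemExit("field not found or not a simple scalar")

handler = reader.gguf_scalar_to_np.get(field.types[0])
if handler is None:
    raise SystemExit("only simple scalar values can be changed this way")

# field.data[0] indexes the part that holds the scalar payload.
field.parts[field.data[0]][0] = handler(1)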

848
llama.cpp

File diff suppressed because it is too large

34
llama.h
View file

@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 5
+#define LLAMA_SESSION_VERSION 6

 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
 #define LLAMA_STATE_SEQ_VERSION 1
@ -69,6 +69,20 @@ extern "C" {
         LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
     };

+    // pre-tokenization types
+    enum llama_vocab_pre_type {
+        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
+        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
+        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
+        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
+        LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
+    };
+
     // note: these values should be synchronized with ggml_rope
     // TODO: maybe move this enum to ggml.h (ggml_rope_type)
     enum llama_rope_type {
@ -159,7 +173,7 @@ extern "C" {
         bool sorted;
     } llama_token_data_array;

-    typedef bool (*llama_progress_callback)(float progress, void * ctx);
+    typedef bool (*llama_progress_callback)(float progress, void * user_data);

     // Input data for llama_decode
     // A llama_batch object can contain input about one or many sequences
@ -195,15 +209,19 @@ extern "C" {
         LLAMA_KV_OVERRIDE_TYPE_INT,
         LLAMA_KV_OVERRIDE_TYPE_FLOAT,
         LLAMA_KV_OVERRIDE_TYPE_BOOL,
+        LLAMA_KV_OVERRIDE_TYPE_STR,
     };

     struct llama_model_kv_override {
-        char key[128];
         enum llama_model_kv_override_type tag;
+        char key[128];

         union {
-            int64_t int_value;
-            double  float_value;
-            bool    bool_value;
+            int64_t val_i64;
+            double  val_f64;
+            bool    val_bool;
+            char    val_str[128];
         };
     };

@ -235,6 +253,7 @@ extern "C" {
         bool vocab_only;    // only load the vocabulary, no weights
         bool use_mmap;      // use mmap if possible
         bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
     };

     struct llama_context_params {
@ -270,6 +289,7 @@ extern "C" {
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention

         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@ -525,7 +545,7 @@ extern "C" {
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
     LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);

-    // Clear the KV cache
+    // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_cache_clear(
             struct llama_context * ctx);

Binary file not shown.

View file

@ -0,0 +1,106 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

View file

@ -0,0 +1,43 @@
29464 2094 1018 1092 2706
11865 17875
7592 2088
7592 2088
7592 2088
7592 2088
7592 2088 999
7592 1010 2088 999
7592 1010 2088 999
2023 2003 100 1012 18133 2361
1059 2692 18139 1021 8525 28418 2243 16233 20952 6979
1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325
100
100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 1006 2069 7861 29147 2072 2008 2038 2049 2219 19204 1007
7592
7592
7592
7592
7592
7592 7592
1006
1027
1005 3690
7592 1010 1061 1005 2035 999 2129 2024 2017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995
1017
3943
21211
21211 2509
21211 22394
21211 22394 2509
21211 22394 22394
21211 22394 22394 2509
21211 22394 22394 22394
100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 100 1017 3943 21211 21211 2509 21211 22394 21211 22394 2509 21211 22394 22394 21211 22394 22394 2509 1017 1012 1017 1017 1012 1012 1017 1017 1012 1012 1012 1017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995 1011 1011 1011 1011 1011 1011 1027 1027 1027 1027 1027 1027 1027 1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325 1005 1005 1005 1005 1005 1005 1036 1036 1036 1036 1036 1036 1036 1000 1000 1000 1000 1012 1012 1012 1012 1012 1012 999 999 999 999 999 999 1029 1029 1029 1029 1029 1029 1045 1005 2310 2042 1005 2409 2002 1005 1055 2045 1010 1005 2128 2017 2469 1029 1005 1049 2025 2469 1045 1005 2222 2191 2009 1010 1005 1040 2017 2066 2070 5572 1029 2057 1005 2310 1037 1005 2222
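
Note: the added tokenizer test fixtures come in pairs: a .inp file with test strings separated by __ggml_vocab_test__ lines and a matching .out file with one line of expected token ids per case. An approximate sketch of how such a pair can be read back (file names are placeholders; the exact parsing in the test harness may differ):

# Approximate sketch: pair up test strings with their expected token ids.
from pathlib import Path

SEPARATOR = "__ggml_vocab_test__\n"

def load_vocab_tests(inp_path: str, out_path: str) -> list[tuple[str, list[int]]]:
    # Each test string in the .inp file is terminated by a separator line;
    # the .out file holds one whitespace-separated list of token ids per test.
    chunks = Path(inp_path).read_text(encoding="utf-8").split(SEPARATOR)
    texts = [c.removesuffix("\n") for c in chunks[:-1]]  # drop the tail after the last separator
    expected = [
        [int(tok) for tok in line.split()]
        for line in Path(out_path).read_text(encoding="utf-8").splitlines()
    ]
    return list(zip(texts, expected))

for text, ids in load_vocab_tests("ggml-vocab-example.gguf.inp", "ggml-vocab-example.gguf.out"):
    print(repr(text), "->", ids)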

Binary file not shown.

View file

@ -0,0 +1,106 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

View file

@ -0,0 +1,43 @@
2536 228 27 228 22957 6983
45 193433
228
1667
1742
205
206
2126
11516
34777
28339 3845
46609 3845
28339 3930
46609 3930
46609 3930 8
28339 19 3845 8
46609 19 3845 8
2075 1801 11254 107 255 21 19317
94 23 27 31 228 30 21213 20752 39267 6405 9980
4929 40071 2196 3236 8750 1764 37097 41168
38111 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 38111 231 38111 257 38111 235 165 24629 38111 239
2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 1737 10203 109160 1875 2222 2517 3342 12523 16
28339
46609
228 46609
1667 46609
1742 46609
1742 46609 1856 46609
1737
206 1857
14 4515
28339 19 1770 14 1954 8 4070 1955 1933 80503 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372
26
26 26
26 26 26
26 26 26 26
26 26 26 26 26
26 26 26 26 26 26
26 26 26 26 26 26 26
26 26 26 26 26 26 26 26
26 26 26 26 26 26 26 26 26
127731 51628 205 57788 18494 97469 126134 206 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 11254 107 255 2226 107 255 228 26 228 26 26 228 26 26 26 228 26 26 26 26 228 26 26 26 26 26 228 26 26 26 26 26 26 228 26 26 26 26 26 26 26 228 26 26 26 26 26 26 26 26 228 26 21 26 228 26 2271 26 228 26 3834 26 182018 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 188568 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372 8391 158343 3512 40071 2196 3236 8750 1764 37097 41168 29721 32797 25646 3802 4975 4975 116167 57178 10251 154048 27292 1767 5125 2632 2155 91 2378 1919 1914 2782 19 2155 3354 1933 5470 38 2155 52 2068 5470 1767 4961 3059 1894 19 2155 43 1933 3026 2725 23186 38 2930 14 20676 1671 14 83 51

Binary file not shown.

View file

@ -0,0 +1,106 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

View file

@ -0,0 +1,43 @@
1050 207 19 207 19192 4217
37 32009 71 6247
207
243
315
184
185
185 185
185 185 185
184 185
17535 1835
414 9489 1835
17535 5414
414 9489 5414
414 9489 5414 0
17535 11 1835 0
414 9489 11 1835 0
437 317 12394 99 234 13 14789
86 15 19 23 207 22 83 3963 27659 26078 3934 14072
1593 6478 616 2251 14994
155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 155 239 210 155 239 236 155 239 214 155 240 210 155 239 218
10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 334 5950 992 78 12896 344 638 891 1372 10736 8
17535
414 9489
207 414 9489
243 414 9489
315 414 9489
315 414 9489 185 315 414 9489
334
185 405
6 2895
17535 11 320 6 435 0 1717 417 340 12394 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239
18
18 18
18 18 18
18 18 18 18
18 18 18 18 18
18 18 18 18 18 18
18 18 18 18 18 18 18
18 18 18 18 18 18 18 18
18 18 18 18 18 18 18 18 18
185 207 185 185 207 185 185 185 207 12405 459 22758 185 243 185 315 185 251 185 730 185 10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 12394 99 234 10047 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 524 18 207 18 1202 18 207 155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 10047 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239 18155 374 17194 28 2861 6478 616 2251 14994 31269 4191 6 4686 4686 10252 3358 3358 3409 524 15330 3023 15031 5668 303 6 312 798 651 83 839 362 6 82 741 11 651 1369 340 2037 30 651 44 441 2037 303 6 642 1098 359 11 651 35 340 833 738 10860 30 998 6 10709 245 6 75 43

Binary file not shown.

View file

@ -0,0 +1,106 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

View file

@ -0,0 +1,43 @@
1052 207 19 207 19109 4223
37 100014 71 6245
207
243
300
184
185
185 185
185 185 185
184 185
17464 1843
37727 1843
17464 5427
37727 5427
37727 5427 0
17464 11 1843 0
37727 11 1843 0
437 317 12356 99 234 13 14743
86 15 19 23 207 22 83 3970 27519 26016 3944 14025
1603 6476 620 91754
71374 209 71374 114 71374 228 155 240 220 71374 224 155 240 211 71374 231 71374 115 71374 240 155 240 210 71374 240 71374 95 71374 114 71374 214 71374 210 71374 236 71374 214 155 240 210 71374 218
10044 95300 334 8754 8 33701 114 350 222 10044 221 104 46713 334 34732 996 24250 262 80923 8 207 37103 214 334 5956 89213 344 643 895 1377 10728 8
17464
37727
207 37727
243 37727
300 37727
300 37727 185 300 37727
334
185 403
6 2906
17464 11 320 6 436 0 1724 418 340 33701 210 3025 19017 612 9407 2681 16 18 16 19 16 20 16 1398 68940 239
18
18 18
18 18 18
18 18 18 18
18 18 18 18 18
18 18 18 18 18 18
18 18 18 18 18 18 18
18 18 18 18 18 18 18 18
18 18 18 18 18 18 18 18 18
185 207 185 185 207 185 185 185 207 11969 486 22504 185 243 185 300 185 251 185 663 185 10044 95300 334 8754 8 33701 114 350 222 10044 221 104 46713 334 34732 996 24250 262 80923 8 207 37103 214 12356 99 234 10044 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 526 18 207 18 1204 18 207 71374 209 71374 114 71374 228 155 240 220 71374 224 155 240 211 71374 231 71374 115 71374 240 155 240 210 71374 240 71374 95 71374 114 71374 214 71899 210 3025 19017 612 9407 2681 16 18 16 19 16 20 16 1398 68940 239 78827 55170 76659 620 91754 31116 36804 4885 4885 10897 4390 4390 41047 15278 3033 14986 5675 304 6 313 803 655 33326 362 6 82 745 11 655 1374 340 2049 30 655 44 441 2049 304 6 647 1099 359 11 655 35 340 837 742 10842 30 1003 6 10699 245 6 75 43

Binary file not shown.

View file

@ -0,0 +1,106 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

View file

@ -0,0 +1,43 @@
878 204 31 3068 133 2137
28611 132 30042
204
258
466
192
193
1001
11331
19125
9856 1079
23090 1079
9856 2889
23090 2889
23090 2889 12
9856 23 1079 12
23090 23 1079 12
414 304 3346 111 231 25 29247
98 55866 204 34 16682 7149 36190 6869 11481
150 133 6207 151 215 150 134 5052 133 6279 5052 223 151 216 49679 123 53110 47043 7795
38154 206 38154 126 38154 225 167 237 217 38154 221 167 237 208 38154 228 38154 127 38154 237 167 237 207 38154 237 38154 107 38154 126 38154 211 38154 207 38154 233 38154 211 167 237 207 38154 215
2571 232 206 204 19 11003 20 8196 126 283 219 48778 116 13392 204 19 51831 732 63209 1741 7955 522 20 22438 211 204 19 7927 53360 325 504 701 946 10930 20
9856
23090
204 23090
258 23090
466 23090
466 23090 742 23090
204 19
1212 40
18 4932
9856 23 291 18 436 12 1265 362 299 8196 207 204 42 50087 123 2727 20300 32022 133 234 17419 30137 28 7858 181 133 236
30
3138
22287
22287 30
22287 3138
22287 22287
22287 22287 30
22287 22287 3138
22287 22287 22287
1212 4824 1001 1212 192 204 663 49453 2069 742 561 1501 193 2571 232 206 204 19 11003 20 8196 126 283 219 48778 116 13392 204 19 51831 732 63209 1741 7955 522 20 22438 211 3346 111 231 2571 111 231 204 30 204 3138 204 22287 204 22287 30 204 22287 3138 204 22287 22287 204 22287 22287 30 204 22287 22287 3138 204 30 25 30 204 30 513 30 204 30 951 30 27171 236 206 38154 126 38154 225 167 237 217 38154 221 167 237 208 38154 228 38154 127 38154 237 167 237 207 38154 237 38154 107 38154 126 38154 211 20589 207 204 42 50087 123 2727 20300 32022 133 234 17419 30137 28 7858 181 133 236 204 37057 2228 10666 5052 133 6207 151 215 150 134 5052 133 6279 5052 223 151 216 49679 123 53110 47043 7795 204 7544 7544 7544 8543 8543 17593 3513 3513 12844 51520 17664 4247 295 18 298 650 204 18 95 693 332 18 94 629 23 204 18 1553 299 1310 42 204 18 56 416 1310 295 18 567 717 334 23 204 18 47 299 606 596 6696 42 703 18 16139 241 18 87 55

Binary file not shown.

View file

@ -0,0 +1,106 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

View file

@ -0,0 +1,43 @@
798 604 25208 1933
37 9116 71 11751
220
220 220
220 220 220
197
198
628
628 198
197 198
15496 995
18435 995
15496 2159
18435 2159
18435 2159 0
15496 11 995 0
18435 11 995 0
428 318 12520 99 247 13 20322
86 47202 767 28047 45961 288 82 7568 13415
22177 16843 141 231 15166 12466 121 16142 12466 239 141 232 30143 140 111 16142 21169 21727 31583 18849
157 252 222 157 252 114 157 252 241 157 253 233 157 252 237 157 253 224 157 252 244 157 252 115 157 252 253 157 253 223 157 252 253 157 252 95 157 252 114 157 252 227 157 252 223 157 252 249 157 252 227 157 253 223 157 252 231
8582 248 222 357 11265 8 30325 114 447 235 8582 234 104 37929 357 48101 795 13210 271 1673 36686 515 8 14519 227 357 8807 44805 326 468 663 898 11241 8
15496
18435
220 18435
220 220 18435
220 220 220 18435
220 220 220 18435 198 220 220 220 18435
357
198 796
6 6980
15496 11 331 6 439 0 1374 389 345 30325 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252
18
2091
20370
24840
2091 20370
24840 2091
24840 20370
24840 24840
24840 2091 20370
198 220 628 220 628 198 220 197 220 197 197 220 197 198 220 220 198 220 220 220 198 220 220 220 220 198 220 220 220 220 220 198 8582 248 222 357 11265 8 30325 114 447 235 8582 234 104 37929 357 48101 795 13210 271 1673 36686 515 8 14519 227 12520 99 247 8582 99 247 513 4747 23460 513 20370 23460 2091 23460 20370 23460 24840 23460 2091 20370 513 13 18 513 492 18 513 986 18 28053 252 222 157 252 114 157 252 241 157 253 233 157 252 237 157 253 224 157 252 244 157 252 115 157 252 253 157 253 223 157 252 253 157 252 95 157 252 114 157 252 227 47249 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252 40103 1421 18604 12466 121 16843 141 231 15166 12466 121 16142 12466 239 141 232 30143 140 111 16142 21169 21727 31583 18849 705 39115 6 33153 15506 63 15931 15931 16317 13896 3228 9805 3548 314 1053 587 705 44040 339 338 612 11 705 2200 345 1654 30 705 44 407 1654 314 1183 787 340 11 705 35 345 588 617 8887 30 775 6 26979 257 6 75 43

Binary file not shown.

View file

@ -0,0 +1,106 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

View file

@ -0,0 +1,43 @@
1142 220 19 220 27154 4038
37 51853 261
220
256
262
197
198
271
1432
1602
9906 1917
22691 1917
9906 4435
22691 4435
22691 4435 0
9906 11 1917 0
22691 11 1917 0
420 374 11410 99 247 13 11055
86 23904 220 22 83 2005 42908 11729 3013 17156
79862 102118 13373 64571 34694 3114 112203 80112
21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 21549 223 21549 249 21549 227 45358 223 21549 231
9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 320 3323 43465 430 706 1202 1866 4037 8
9906
22691
220 22691
256 22691
262 22691
262 22691 198 262 22691
320
198 284
6 11639
9906 11 379 65948 0 2650 527 499 27623 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909
18
1644
8765
8765 18
8765 1644
8765 8765
8765 8765 18
8765 8765 1644
8765 8765 8765
198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43

View file

@ -0,0 +1,106 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

Some files were not shown because too many files have changed in this diff