Merge branch 'ggerganov:gguf' into gguf

This commit is contained in:
goerch 2023-08-19 13:36:05 +02:00 committed by GitHub
commit c16ea8e193
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
67 changed files with 7635 additions and 4330 deletions

185
.github/ISSUE_TEMPLATE/custom.md vendored Normal file
View file

@ -0,0 +1,185 @@
---
name: Issue and enhancement template
about: Used to report issues and request enhancements for llama.cpp
title: "[User] Insert summary of your issue or enhancement.."
labels: ''
assignees: ''
---
# Prerequisites
Please answer the following questions for yourself before submitting an issue.
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
# Expected Behavior
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.
# Current Behavior
Please provide a detailed written description of what `llama.cpp` did, instead.
# Environment and Context
Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except under certain specific conditions.
* Physical (or virtual) hardware you are using, e.g. for Linux:
`$ lscpu`
* Operating System, e.g. for Linux:
`$ uname -a`
* SDK version, e.g. for Linux:
```
$ python3 --version
$ make --version
$ g++ --version
```
# Failure Information (for bugs)
Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
# Steps to Reproduce
Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
1. step 1
2. step 2
3. step 3
4. etc.
# Failure Logs
Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [GitHub's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
Example environment info:
```
llama.cpp$ git log | head -1
commit 2af23d30434a677c6416812eea52ccc0af65119c
llama.cpp$ lscpu | egrep "AMD|Flags"
Vendor ID: AuthenticAMD
Model name: AMD Ryzen Threadripper 1950X 16-Core Processor
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev
Virtualization: AMD-V
llama.cpp$ python3 --version
Python 3.10.9
llama.cpp$ pip list | egrep "torch|numpy|sentencepiece"
numpy 1.24.2
numpydoc 1.5.0
sentencepiece 0.1.97
torch 1.13.1
torchvision 0.14.1
llama.cpp$ make --version | head -1
GNU Make 4.3
$ md5sum ./models/65B/ggml-model-q4_0.bin
dbdd682cce80e2d6e93cefc7449df487 ./models/65B/ggml-model-q4_0.bin
```
Example run with the Linux command [perf](https://www.brendangregg.com/perf.html)
```
llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered."
main: seed = 1679149377
llama_model_load: loading model from './models/65B/ggml-model-q4_0.bin' - please wait ...
llama_model_load: n_vocab = 32000
llama_model_load: n_ctx = 512
llama_model_load: n_embd = 8192
llama_model_load: n_mult = 256
llama_model_load: n_head = 64
llama_model_load: n_layer = 80
llama_model_load: n_rot = 128
llama_model_load: f16 = 2
llama_model_load: n_ff = 22016
llama_model_load: n_parts = 8
llama_model_load: ggml ctx size = 41477.73 MB
llama_model_load: memory_size = 2560.00 MB, n_mem = 40960
llama_model_load: loading model part 1/8 from './models/65B/ggml-model-q4_0.bin'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 2/8 from './models/65B/ggml-model-q4_0.bin.1'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 3/8 from './models/65B/ggml-model-q4_0.bin.2'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 4/8 from './models/65B/ggml-model-q4_0.bin.3'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 5/8 from './models/65B/ggml-model-q4_0.bin.4'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 6/8 from './models/65B/ggml-model-q4_0.bin.5'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 7/8 from './models/65B/ggml-model-q4_0.bin.6'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.7'
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
main: prompt: 'Please close your issue when it has been answered.'
main: number of tokens in prompt = 11
1 -> ''
12148 -> 'Please'
3802 -> ' close'
596 -> ' your'
2228 -> ' issue'
746 -> ' when'
372 -> ' it'
756 -> ' has'
1063 -> ' been'
7699 -> ' answered'
29889 -> '.'
sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
Please close your issue when it has been answered.
@duncan-donut: I'm trying to figure out what kind of "support" you need for this script and why, exactly? Is there a question about how the code works that hasn't already been addressed in one or more comments below this ticket, or are we talking something else entirely like some sorta bugfixing job because your server setup is different from mine??
I can understand if your site needs to be running smoothly and you need help with a fix of sorts but there should really be nothing wrong here that the code itself could not handle. And given that I'm getting reports about how it works perfectly well on some other servers, what exactly are we talking? A detailed report will do wonders in helping us get this resolved for ya quickly so please take your time and describe the issue(s) you see as clearly & concisely as possible!!
@duncan-donut: I'm not sure if you have access to cPanel but you could try these instructions. It is worth a shot! Let me know how it goes (or what error message, exactly!) when/if ya give that code a go? [end of text]
main: mem per token = 71159620 bytes
main: load time = 19309.95 ms
main: sample time = 168.62 ms
main: predict time = 223895.61 ms / 888.47 ms per token
main: total time = 246406.42 ms
Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
3636882.89 msec task-clock # 14.677 CPUs utilized
13509 context-switches # 3.714 /sec
2436 cpu-migrations # 0.670 /sec
10476679 page-faults # 2.881 K/sec
13133115082869 cycles # 3.611 GHz (16.77%)
29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%)
10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%)
23479217109614 instructions # 1.79 insn per cycle
# 0.44 stalled cycles per insn (16.76%)
2353072268027 branches # 647.002 M/sec (16.77%)
1998682780 branch-misses # 0.08% of all branches (16.76%)
247.802177522 seconds time elapsed
3618.573072000 seconds user
18.491698000 seconds sys
```

632
.github/workflows/build.yml vendored Normal file
View file

@ -0,0 +1,632 @@
name: CI
on:
workflow_dispatch: # allows manual triggering
inputs:
create_release:
description: 'Create new release'
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_NITER: 1
GGML_N_THREADS: 1
jobs:
ubuntu-focal-make:
runs-on: ubuntu-20.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8
- name: Build
id: make_build
run: |
CC=gcc-8 make
ubuntu-latest-cmake:
runs-on: ubuntu-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake ..
cmake --build . --config Release
- name: Test
id: cmake_test
run: |
cd build
ctest --verbose --timeout 900
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
continue-on-error: true
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }}
- name: Test
id: cmake_test
run: |
cd build
ctest --verbose --timeout 900
ubuntu-latest-cmake-mpi:
runs-on: ubuntu-latest
continue-on-error: true
strategy:
matrix:
mpi_library: [mpich, libopenmpi-dev]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential ${{ matrix.mpi_library }}
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake -DLLAMA_MPI=ON ..
cmake --build . --config Release
- name: Test
id: cmake_test
run: |
cd build
ctest --verbose
macOS-latest-make:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: make_build
run: |
make
macOS-latest-cmake:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
cmake --build . --config Release
- name: Test
id: cmake_test
run: |
cd build
ctest --verbose --timeout 900
windows-latest-cmake:
runs-on: windows-latest
env:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0
strategy:
matrix:
include:
- build: 'noavx'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
- build: 'avx2'
defines: '-DLLAMA_BUILD_SERVER=ON'
- build: 'avx'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
- build: 'avx512'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Download OpenCL SDK
id: get_opencl
if: ${{ matrix.build == 'clblast' }}
run: |
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
mkdir $env:RUNNER_TEMP/opencl
tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
- name: Download CLBlast
id: get_clblast
if: ${{ matrix.build == 'clblast' }}
run: |
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
$txt = Get-Content -Path $f -Raw
$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
}
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas' }}
run: |
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
mkdir $env:RUNNER_TEMP/openblas
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
- name: Add clblast.dll
id: add_clblast_dll
if: ${{ matrix.build == 'clblast' }}
run: |
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
- name: Add libopenblas.dll
id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas' }}
run: |
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
- name: Check AVX512F support
id: check_avx512f
if: ${{ matrix.build == 'avx512' }}
continue-on-error: true
run: |
cd build
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
$cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
- name: Test
id: cmake_test
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
run: |
cd build
ctest -C Release --verbose --timeout 900
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-cublas:
runs-on: windows-latest
strategy:
matrix:
cuda: ['12.1.0', '11.7.1']
build: ['cublas']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- uses: Jimver/cuda-toolkit@v0.2.10
id: cuda-toolkit
with:
cuda: ${{ matrix.cuda }}
# TODO(green-sky): _dev seems to fail, and the non-dev packages are not enough
#sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
cmake --build . --config Release
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime
if: ${{ matrix.cuda == '12.1.0' }}
# TODO(green-sky): paths are cuda 12 specific
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
mkdir '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
- name: Copy and pack Cuda runtime
if: ${{ matrix.cuda == '11.7.1' }}
# TODO(green-sky): paths are cuda 11 specific
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
mkdir '.\build\bin\cudart\'
ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: ubuntu-latest
needs:
- ubuntu-focal-make
- ubuntu-latest-cmake
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-cublas
steps:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v3
- name: Get commit hash
id: commit
uses: pr-mpt/actions-commit-hash@v2
- name: Create release
id: create_release
uses: anzz1/action-create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
- name: Upload release
id: upload_release
uses: actions/github-script@v3
with:
github-token: ${{secrets.GITHUB_TOKEN}}
script: |
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./artifact')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: release_id,
name: file,
data: await fs.readFileSync(`./artifact/${file}`)
});
}
}
# ubuntu-latest-gcc:
# runs-on: ubuntu-latest
#
# strategy:
# matrix:
# build: [Debug, Release]
#
# steps:
# - name: Clone
# uses: actions/checkout@v1
#
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
# sudo apt-get install cmake
#
# - name: Configure
# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
#
# - name: Build
# run: |
# make
#
# ubuntu-latest-clang:
# runs-on: ubuntu-latest
#
# strategy:
# matrix:
# build: [Debug, Release]
#
# steps:
# - name: Clone
# uses: actions/checkout@v1
#
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
# sudo apt-get install cmake
#
# - name: Configure
# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
#
# - name: Build
# run: |
# make
#
# ubuntu-latest-gcc-sanitized:
# runs-on: ubuntu-latest
#
# strategy:
# matrix:
# sanitizer: [ADDRESS, THREAD, UNDEFINED]
#
# steps:
# - name: Clone
# uses: actions/checkout@v1
#
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
# sudo apt-get install cmake
#
# - name: Configure
# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
#
# - name: Build
# run: |
# make
#
# windows:
# runs-on: windows-latest
#
# strategy:
# matrix:
# build: [Release]
# arch: [Win32, x64]
# include:
# - arch: Win32
# s2arc: x86
# - arch: x64
# s2arc: x64
#
# steps:
# - name: Clone
# uses: actions/checkout@v1
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
#
# - name: Configure
# run: >
# cmake -S . -B ./build -A ${{ matrix.arch }}
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
#
# - name: Build
# run: |
# cd ./build
# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
#
# - name: Upload binaries
# uses: actions/upload-artifact@v1
# with:
# name: llama-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }}
#
# windows-blas:
# runs-on: windows-latest
#
# strategy:
# matrix:
# build: [Release]
# arch: [Win32, x64]
# blas: [ON]
# include:
# - arch: Win32
# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
# s2arc: x86
# - arch: x64
# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
# s2arc: x64
#
# steps:
# - name: Clone
# uses: actions/checkout@v1
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
#
# - name: Fetch OpenBLAS
# if: matrix.blas == 'ON'
# run: |
# C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
# 7z x blas.zip -oblas -y
# copy blas/include/cblas.h .
# copy blas/include/openblas_config.h .
# echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
#
# - name: Configure
# run: >
# cmake -S . -B ./build -A ${{ matrix.arch }}
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
# -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
#
# - name: Build
# run: |
# cd ./build
# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
#
# - name: Copy libopenblas.dll
# if: matrix.blas == 'ON'
# run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
#
# - name: Upload binaries
# if: matrix.blas == 'ON'
# uses: actions/upload-artifact@v1
# with:
# name: llama-blas-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }}
#
# emscripten:
# runs-on: ubuntu-latest
#
# strategy:
# matrix:
# build: [Release]
#
# steps:
# - name: Clone
# uses: actions/checkout@v1
#
# - name: Dependencies
# run: |
# wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
# tar -xvf master.tar.gz
# emsdk-master/emsdk update
# emsdk-master/emsdk install latest
# emsdk-master/emsdk activate latest
#
# - name: Configure
# run: echo "tmp"
#
# - name: Build
# run: |
# pushd emsdk-master
# source ./emsdk_env.sh
# popd
# emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
# make

65
.github/workflows/docker.yml vendored Normal file
View file

@ -0,0 +1,65 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
# GitHub recommends pinning actions to a commit SHA.
# To get a newer version, you will need to update the SHA.
# You can also reference a tag or branch, but the action may change without warning.
name: Publish Docker image
on:
pull_request:
push:
branches:
- master
jobs:
push_to_registry:
name: Push Docker image to Docker Hub
if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
env:
COMMIT_SHA: ${{ github.sha }}
strategy:
matrix:
config:
- { tag: "light", dockerfile: ".devops/main.Dockerfile" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile" }
steps:
- name: Check out the repo
uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Log in to Docker Hub
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push Docker image (versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
with:
context: .
push: true
platforms: linux/amd64,linux/arm64
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
uses: docker/build-push-action@v4
with:
context: .
push: ${{ github.event_name == 'push' }}
platforms: linux/amd64,linux/arm64
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}

17
.github/workflows/editorconfig.yml vendored Normal file
View file

@ -0,0 +1,17 @@
name: EditorConfig Checker
on:
push:
branches:
- master
pull_request:
branches:
- master
jobs:
editorconfig:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: editorconfig-checker/action-editorconfig-checker@main
- run: editorconfig-checker

20
.github/workflows/tidy-post.yml vendored Normal file
View file

@ -0,0 +1,20 @@
name: clang-tidy review post comments
on:
workflow_dispatch:
workflows: ["clang-tidy-review"]
types:
- completed
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: ZedThree/clang-tidy-review/post@v0.13.0
# lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
with:
# adjust options as necessary
lgtm_comment_body: ''
annotations: false
max_comments: 25

23
.github/workflows/tidy-review.yml vendored Normal file
View file

@ -0,0 +1,23 @@
name: clang-tidy-review
on:
pull_request:
branches:
- master
jobs:
clang-tidy-review:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ZedThree/clang-tidy-review@v0.13.0
id: review
with:
lgtm_comment_body: ''
build_dir: build
cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
split_workflow: true
- uses: ZedThree/clang-tidy-review/upload@v0.13.0

2
.gitignore vendored
View file

@ -50,8 +50,8 @@ models-mnt
/embd-input-test /embd-input-test
/gguf /gguf
/gguf-llama-simple /gguf-llama-simple
/gptneox-main
/libllama.so /libllama.so
/llama-bench
build-info.h build-info.h
arm_neon.h arm_neon.h
compile_commands.json compile_commands.json

View file

@ -296,7 +296,6 @@ if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED) find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED) find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h) set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
@ -313,7 +312,6 @@ if (LLAMA_METAL)
${FOUNDATION_LIBRARY} ${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK} ${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK} ${METALKIT_FRAMEWORK}
${METALPERFORMANCE_FRAMEWORK}
) )
endif() endif()
@ -499,9 +497,11 @@ else()
endif() endif()
# #
# Build libraries # libraries
# #
# ggml
add_library(ggml OBJECT add_library(ggml OBJECT
ggml.c ggml.c
ggml.h ggml.h
@ -526,6 +526,8 @@ if (BUILD_SHARED_LIBS)
install(TARGETS ggml_shared LIBRARY) install(TARGETS ggml_shared LIBRARY)
endif() endif()
# llama
add_library(llama add_library(llama
llama.cpp llama.cpp
llama.h llama.h
@ -547,6 +549,10 @@ if (BUILD_SHARED_LIBS)
install(TARGETS llama LIBRARY) install(TARGETS llama LIBRARY)
endif() endif()
#
# install
#
include(GNUInstallDirs) include(GNUInstallDirs)
install( install(
FILES convert.py FILES convert.py
@ -570,11 +576,23 @@ install(
WORLD_READ WORLD_READ
WORLD_EXECUTE WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR}) DESTINATION ${CMAKE_INSTALL_BINDIR})
if (LLAMA_METAL)
install(
FILES ggml-metal.metal
PERMISSIONS
OWNER_READ
OWNER_WRITE
GROUP_READ
WORLD_READ
DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
# #
# programs, examples and tests # programs, examples and tests
# #
add_subdirectory(common)
if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest) include(CTest)
add_subdirectory(tests) add_subdirectory(tests)

View file

@ -1,8 +1,8 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf gptneox-main BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench
# Binaries only useful for tests # Binaries only useful for tests
TEST_TARGETS = tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
default: $(BUILD_TARGETS) default: $(BUILD_TARGETS)
@ -46,7 +46,7 @@ else
OPT = -O3 OPT = -O3
endif endif
CFLAGS = -I. $(OPT) -std=c11 -fPIC CFLAGS = -I. $(OPT) -std=c11 -fPIC
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
LDFLAGS = LDFLAGS =
ifdef LLAMA_DEBUG ifdef LLAMA_DEBUG
@ -283,7 +283,7 @@ endif # LLAMA_CLBLAST
ifdef LLAMA_METAL ifdef LLAMA_METAL
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJS += ggml-metal.o OBJS += ggml-metal.o
endif # LLAMA_METAL endif # LLAMA_METAL
@ -332,20 +332,20 @@ OBJS += ggml-alloc.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
common.o: examples/common.cpp examples/common.h common.o: common/common.cpp common/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
console.o: examples/console.cpp examples/console.h console.o: common/console.cpp common/console.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
libllama.so: llama.o ggml.o $(OBJS) libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean: clean:
rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf build-info.h $(TEST_TARGETS) rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS)
# #
# Examples # Examples
@ -388,15 +388,15 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS) gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
gptneox-main: gptneox-main.cpp ggml.o $(OBJS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
build-info.h: $(wildcard .git/index) scripts/build-info.sh build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp @sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \ @if ! cmp -s $@.tmp $@; then \
@ -418,7 +418,10 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp examples/grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)

View file

@ -96,6 +96,7 @@ as the main playground for developing new features for the [ggml](https://github
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node) - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
@ -238,12 +239,17 @@ In order to build llama.cpp you have three different options.
cmake --build . --config Release cmake --build . --config Release
``` ```
- Using `Zig`: - Using `Zig` (version 0.11 or later):
Optimization levels and CPU features (for example AVX2, FMA, F16C) can be selected using standard build arguments.
It's also possible to cross-compile for other operating systems and architectures:
```bash ```bash
zig build -Doptimize=ReleaseFast zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
``` ```
The `zig targets` command will give you valid options to use.
- Using `gmake` (FreeBSD): - Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics) 1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@ -284,7 +290,7 @@ When built with Metal support, you can enable GPU inference with the `--gpu-laye
Any value larger than 0 will offload the computation to the GPU. For example: Any value larger than 0 will offload the computation to the GPU. For example:
```bash ```bash
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
``` ```
### MPI Build ### MPI Build
@ -323,7 +329,7 @@ The above will distribute the computation across 2 processes on the first host a
Finally, you're ready to run a computation using `mpirun`: Finally, you're ready to run a computation using `mpirun`:
```bash ```bash
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
``` ```
### BLAS Build ### BLAS Build
@ -408,7 +414,7 @@ Building the program with BLAS support may lead to some performance improvements
|-------------------------|------------------------|---------|-------------| |-------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
@ -506,10 +512,10 @@ python3 convert.py models/7B/
python convert.py models/7B/ --vocabtype bpe python convert.py models/7B/ --vocabtype bpe
# quantize the model to 4-bits (using q4_0 method) # quantize the model to 4-bits (using q4_0 method)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0 ./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
# run the inference # run the inference
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
``` ```
When running the larger models, make sure you have enough disk space to store all the intermediate files. When running the larger models, make sure you have enough disk space to store all the intermediate files.
@ -565,7 +571,7 @@ Here is an example of a few-shot interaction, invoked with the command
./examples/chat-13B.sh ./examples/chat-13B.sh
# custom arguments using a 13B model # custom arguments using a 13B model
./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt ./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
``` ```
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program. Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
@ -628,6 +634,8 @@ OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It
### Using [GPT4All](https://github.com/nomic-ai/gpt4all) ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
*Note: these instructions are likely obsoleted by the GGUF update*
- Obtain the `tokenizer.model` file from LLaMA model and put it to `models` - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
- Obtain the `added_tokens.json` file from Alpaca model and put it to `models` - Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
- Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B` - Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
@ -703,7 +711,7 @@ If your issue is with model generation quality, then please at least scan the fo
#### How to run #### How to run
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` 2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output: 3. Output:
``` ```
perplexity : calculating perplexity over 655 chunks perplexity : calculating perplexity over 655 chunks
@ -802,13 +810,13 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-
On completion, you are ready to play! On completion, you are ready to play!
```bash ```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
``` ```
or with a light image: or with a light image:
```bash ```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
``` ```
### Docker With CUDA ### Docker With CUDA
@ -839,8 +847,8 @@ The resulting images, are essentially the same as the non-CUDA images:
After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag. After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
```bash ```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
``` ```
### Contributing ### Contributing

View file

@ -1,5 +1,6 @@
// Compatible with Zig Version 0.11.0 // Compatible with Zig Version 0.11.0
const std = @import("std"); const std = @import("std");
const ArrayList = std.ArrayList;
const Compile = std.Build.Step.Compile; const Compile = std.Build.Step.Compile;
const ConfigHeader = std.Build.Step.ConfigHeader; const ConfigHeader = std.Build.Step.ConfigHeader;
const Mode = std.builtin.Mode; const Mode = std.builtin.Mode;
@ -10,11 +11,31 @@ const Maker = struct {
target: CrossTarget, target: CrossTarget,
optimize: Mode, optimize: Mode,
config_header: *ConfigHeader, config_header: *ConfigHeader,
enable_lto: bool,
const cflags = .{"-std=c11"}; include_dirs: ArrayList([]const u8),
const cxxflags = .{"-std=c++11"}; cflags: ArrayList([]const u8),
cxxflags: ArrayList([]const u8),
objs: ArrayList(*Compile),
fn init(builder: *std.build.Builder) Maker { fn addInclude(m: *Maker, dir: []const u8) !void {
try m.include_dirs.append(dir);
}
/// Registers an include directory given as path components relative to the
/// build root: the components are joined onto `builder.build_root` (allocated
/// with the builder's allocator) and the result is passed to `addInclude`.
/// An empty component list yields the build root itself.
fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
}
/// Appends `flag` to the list of flags used when compiling C sources.
/// Fails only if the underlying list cannot grow.
fn addCFlag(m: *Maker, flag: []const u8) !void {
try m.cflags.append(flag);
}
/// Appends `flag` to the list of flags used when compiling C++ sources.
/// Fails only if the underlying list cannot grow.
fn addCxxFlag(m: *Maker, flag: []const u8) !void {
try m.cxxflags.append(flag);
}
/// Appends `flag` to both the C and the C++ flag lists, for options such as
/// preprocessor defines that must be seen by sources of either language.
fn addFlag(m: *Maker, flag: []const u8) !void {
try m.addCFlag(flag);
try m.addCxxFlag(flag);
}
fn init(builder: *std.build.Builder) !Maker {
const commit_hash = @embedFile(".git/refs/heads/master"); const commit_hash = @embedFile(".git/refs/heads/master");
const config_header = builder.addConfigHeader( const config_header = builder.addConfigHeader(
.{ .style = .blank, .include_path = "build-info.h" }, .{ .style = .blank, .include_path = "build-info.h" },
@ -23,58 +44,71 @@ const Maker = struct {
.BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
}, },
); );
return Maker{ var m = Maker{
.builder = builder, .builder = builder,
.target = builder.standardTargetOptions(.{}), .target = builder.standardTargetOptions(.{}),
.optimize = builder.standardOptimizeOption(.{}), .optimize = builder.standardOptimizeOption(.{}),
.config_header = config_header, .config_header = config_header,
.enable_lto = false,
.include_dirs = ArrayList([]const u8).init(builder.allocator),
.cflags = ArrayList([]const u8).init(builder.allocator),
.cxxflags = ArrayList([]const u8).init(builder.allocator),
.objs = ArrayList(*Compile).init(builder.allocator),
}; };
try m.addCFlag("-std=c11");
try m.addCxxFlag("-std=c++11");
try m.addProjectInclude(&.{});
try m.addProjectInclude(&.{"examples"});
return m;
} }
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
if (std.mem.endsWith(u8, src, ".c")) { if (std.mem.endsWith(u8, src, ".c")) {
o.addCSourceFiles(&.{src}, &cflags); o.addCSourceFiles(&.{src}, m.cflags.items);
o.linkLibC(); o.linkLibC();
} else { } else {
o.addCSourceFiles(&.{src}, &cxxflags); o.addCSourceFiles(&.{src}, m.cxxflags.items);
o.linkLibCpp(); o.linkLibCpp();
} }
o.addIncludePath(.{ .path = "." }); for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
o.addIncludePath(.{ .path = "./examples" }); o.want_lto = m.enable_lto;
return o; return o;
} }
fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile { fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize }); const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
e.addIncludePath(.{ .path = "." }); e.addCSourceFiles(&.{src}, m.cxxflags.items);
e.addIncludePath(.{ .path = "./examples" });
e.addCSourceFiles(&.{src}, &cxxflags);
for (deps) |d| e.addObject(d); for (deps) |d| e.addObject(d);
for (m.objs.items) |o| e.addObject(o);
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
e.linkLibC(); e.linkLibC();
e.linkLibCpp(); e.linkLibCpp();
e.addConfigHeader(m.config_header); e.addConfigHeader(m.config_header);
m.builder.installArtifact(e); m.builder.installArtifact(e);
e.want_lto = m.enable_lto;
// Currently a bug is preventing correct linking for optimized builds for Windows:
// https://github.com/ziglang/zig/issues/15958
if (e.target.isWindows()) {
e.want_lto = false;
}
return e; return e;
} }
}; };
pub fn build(b: *std.build.Builder) void { pub fn build(b: *std.build.Builder) !void {
const make = Maker.init(b); var make = try Maker.init(b);
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
try make.addFlag("-DGGML_USE_K_QUANTS");
const k_quants = make.obj("k_quants", "k_quants.c");
try make.objs.append(k_quants);
}
const ggml = make.obj("ggml", "ggml.c"); const ggml = make.obj("ggml", "ggml.c");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const llama = make.obj("llama", "llama.cpp"); const llama = make.obj("llama", "llama.cpp");
const common = make.obj("common", "examples/common.cpp"); const common = make.obj("common", "examples/common.cpp");
// Name the object after its own source file; reusing "common" here would give
// two distinct object steps the same name as the common.cpp object.
const console = make.obj("console", "examples/console.cpp");
const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp"); const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama }); _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common }); _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common }); _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });

View file

@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {
python3 ../convert.py ${path_models} python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.bin" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.bin" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.bin" model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.bin" model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.bin" model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.bin" model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.bin" model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.bin" model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.bin" model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.bin" model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.bin" model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test_60="${path_wiki}/wiki.test-60.raw" wiki_test_60="${path_wiki}/wiki.test-60.raw"
@ -285,17 +285,17 @@ function gg_run_open_llama_7b_v2 {
python3 ../convert.py ${path_models} python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.bin" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.bin" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.bin" model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.bin" model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.bin" model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.bin" model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.bin" model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.bin" model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.bin" model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.bin" model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.bin" model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test="${path_wiki}/wiki.test.raw" wiki_test="${path_wiki}/wiki.test.raw"

20
common/CMakeLists.txt Normal file
View file

@ -0,0 +1,20 @@
# common
# Helper code shared by the example programs, built as an OBJECT library
# (compiled object files, no archive) from the sources listed below.
set(TARGET common)
add_library(${TARGET} OBJECT
common.h
common.cpp
console.h
console.cpp
grammar-parser.h
grammar-parser.cpp
)
# The objects must be position independent when shared libraries are being built.
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
# PUBLIC: targets that use `common` also get this directory on their include
# path and are also required to compile as at least C++11.
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)

View file

@ -262,6 +262,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.cfg_negative_prompt = argv[i]; params.cfg_negative_prompt = argv[i];
} else if (arg == "--cfg-negative-prompt-file") {
    // Read the negative prompt used for guidance from the file named by the
    // next argument; its contents are appended to params.cfg_negative_prompt.
    if (++i >= argc) {
        invalid_param = true;
        break;
    }
    std::ifstream file(argv[i]);
    if (!file) {
        fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
        invalid_param = true;
        break;
    }
    std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
    // Strip one trailing newline. The prompt is still empty if the file was
    // empty, and back() on an empty std::string is undefined behavior, so
    // check for emptiness first.
    if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
        params.cfg_negative_prompt.pop_back();
    }
} else if (arg == "--cfg-scale") { } else if (arg == "--cfg-scale") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -412,7 +427,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
} }
params.hellaswag_tasks = std::stoi(argv[i]); params.hellaswag_tasks = std::stoi(argv[i]);
} else if (arg == "--ignore-eos") { } else if (arg == "--ignore-eos") {
params.logit_bias[llama_token_eos()] = -INFINITY; params.ignore_eos = true;
} else if (arg == "--no-penalize-nl") { } else if (arg == "--no-penalize-nl") {
params.penalize_nl = false; params.penalize_nl = false;
} else if (arg == "-l" || arg == "--logit-bias") { } else if (arg == "-l" || arg == "--logit-bias") {
@ -553,8 +568,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
fprintf(stdout, " --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n"); fprintf(stdout, " --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
fprintf(stdout, " --grammar-file FNAME file to read grammar from\n"); fprintf(stdout, " --grammar-file FNAME file to read grammar from\n");
fprintf(stdout, " --cfg-negative-prompt PROMPT \n"); fprintf(stdout, " --cfg-negative-prompt PROMPT\n");
fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n"); fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
fprintf(stdout, " --cfg-negative-prompt-file FNAME\n");
fprintf(stdout, " negative prompt file to use for guidance. (default: empty)\n");
fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale); fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
@ -619,6 +636,10 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
return "The"; return "The";
} }
//
// Model utils
//
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params(); auto lparams = llama_context_default_params();
@ -641,7 +662,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
return lparams; return lparams;
} }
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) { std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
auto lparams = llama_context_params_from_gpt_params(params); auto lparams = llama_context_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams); llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
@ -670,5 +691,77 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
} }
} }
if (params.ignore_eos) {
params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
}
return std::make_tuple(model, lctx); return std::make_tuple(model, lctx);
} }
//
// Vocab utils
//
std::vector<llama_token> llama_tokenize(
struct llama_context * ctx,
const std::string & text,
bool add_bos) {
// upper limit for the number of tokens
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
return result;
}
// Convenience wrapper around the C API that returns the text of a single
// token as a std::string.
std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
    // start with a small zero-filled buffer; it is resized below to the
    // length reported by the C API
    std::vector<char> piece(8, 0);

    const int n_chars = llama_token_to_str(ctx, token, piece.data(), piece.size());
    if (n_chars >= 0) {
        piece.resize(n_chars);
    } else {
        // a negative result is treated as the negated required length
        piece.resize(-n_chars);
        const int n_retry = llama_token_to_str(ctx, token, piece.data(), piece.size());
        GGML_ASSERT(n_retry == -n_chars);
    }

    return std::string(piece.begin(), piece.end());
}
std::vector<llama_token> llama_tokenize_bpe(
struct llama_context * ctx,
const std::string & text,
bool add_bos) {
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
return result;
}
// Convenience wrapper around the C BPE API that returns the text of a single
// token as a std::string.
std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
    // start with a small zero-filled buffer; it is resized below to the
    // length reported by the C API
    std::vector<char> piece(8, 0);

    const int n_chars = llama_token_to_str_bpe(ctx, token, piece.data(), piece.size());
    if (n_chars >= 0) {
        piece.resize(n_chars);
    } else {
        // a negative result is treated as the negated required length
        piece.resize(-n_chars);
        const int n_retry = llama_token_to_str_bpe(ctx, token, piece.data(), piece.size());
        GGML_ASSERT(n_retry == -n_chars);
    }

    return std::string(piece.begin(), piece.end());
}

View file

@ -2,7 +2,6 @@
#pragma once #pragma once
#define LLAMA_API_CPP // TODO: eliminate me
#include "llama.h" #include "llama.h"
#include <string> #include <string>
@ -33,7 +32,6 @@ struct gpt_params {
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
// sampling parameters // sampling parameters
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
int32_t top_k = 40; // <= 0 to use vocab size int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled float top_p = 0.95f; // 1.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled
@ -47,12 +45,14 @@ struct gpt_params {
float mirostat_tau = 5.00f; // target entropy float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate float mirostat_eta = 0.10f; // learning rate
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
// Classifier-Free Guidance // Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806 // https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // How strong is guidance float cfg_scale = 1.f; // How strong is guidance
std::string model = "models/7B/ggml-model.bin"; // model path std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_alias = "unknown"; // model alias std::string model_alias = "unknown"; // model alias
std::string prompt = ""; std::string prompt = "";
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
@ -82,6 +82,7 @@ struct gpt_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
bool instruct = false; // instruction mode (used for Alpaca models) bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt bool perplexity = false; // compute perplexity over the prompt
@ -103,5 +104,27 @@ std::string gpt_random_prompt(std::mt19937 & rng);
// Model utils // Model utils
// //
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params); std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
//
// Vocab utils
//
// Returns the tokens for `text` as a vector.
// NOTE(review): only the tail of the definition is visible in this chunk;
// presumably it mirrors llama_tokenize_bpe() below -- confirm.
std::vector<llama_token> llama_tokenize(
struct llama_context * ctx,
const std::string & text,
bool add_bos);
// Runs the buffer-based llama_tokenize_bpe() on `text` (forwarding `add_bos`)
// and returns the resulting tokens as a vector sized to the reported count.
std::vector<llama_token> llama_tokenize_bpe(
struct llama_context * ctx,
const std::string & text,
bool add_bos);
// Runs the buffer-based llama_token_to_str() for `token` and returns the bytes
// it produced as a std::string.
std::string llama_token_to_str(
const struct llama_context * ctx,
llama_token token);
// Runs the buffer-based llama_token_to_str_bpe() for `token` and returns the
// bytes it produced as a std::string.
std::string llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token);

View file

@ -0,0 +1,282 @@
# HF falcon--> gguf conversion
import gguf
import os
import sys
import struct
import json
import numpy as np
import torch
from typing import Any, List
from pathlib import Path
from transformers import AutoTokenizer
def bytes_to_unicode():
    """
    Build the reversible byte -> unicode character table used by GPT-2 style BPE.

    Byte values in the ranges '!'..'~', '¡'..'¬' and '®'..'ÿ' map to the
    character with the same code point. Every remaining byte value, taken in
    increasing order, maps to the next code point starting at 256, so each of
    the 256 byte values gets its own distinct character.

    Returns a dict mapping int byte value -> single-character str. The directly
    mapped bytes come first in iteration order, followed by the shifted ones.

    ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
    """
    direct_bytes = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    table = {byte: chr(byte) for byte in direct_bytes}

    # Remaining bytes are assigned consecutive code points above the byte range.
    shifted = 0
    for byte in range(2**8):
        if byte in table:
            continue
        table[byte] = chr(2**8 + shifted)
        shifted += 1
    return table
def count_model_parts(dir_model: str) -> int:
    """Count the entries of `dir_model` whose name starts with "pytorch_model-".

    Prints the count when at least one such entry exists, and returns the count
    (0 when there is none).
    """
    num_parts = sum(
        1 for entry in os.listdir(dir_model) if entry.startswith("pytorch_model-")
    )

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts
# Command line: <script> dir-model ftype
if len(sys.argv) < 3:
    # Print the name the script was actually invoked as, instead of a
    # hard-coded name copied from a different converter.
    print("Usage: " + os.path.basename(sys.argv[0]) + " dir-model ftype\n")
    print(" ftype == 0 -> float32")
    print(" ftype == 1 -> float16")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]
last_dir = os.path.basename(os.path.normpath(dir_model))

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

# The argument count was checked above, so sys.argv[2] always exists here.
# Reject non-numeric values with the same message as out-of-range ones rather
# than letting int() abort with a traceback.
try:
    ftype = int(sys.argv[2])
except ValueError:
    print("Invalid ftype: " + sys.argv[2])
    sys.exit(1)

if ftype < 0 or ftype >= len(ftype_str):
    print("Invalid ftype: " + str(ftype))
    sys.exit(1)

fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".gguf"
print("gguf: loading model "+last_dir)

# Hyperparameters come from the model directory's config.json.
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# Only the "RWForCausalLM" architecture is handled by this script.
if hparams["architectures"][0] != "RWForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    # Exit with a non-zero status so callers can tell the conversion failed.
    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH = gguf.MODEL_ARCH.FALCON
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layer"]

gguf_writer.add_name(last_dir)
gguf_writer.add_context_length(2048)  # not in config.json
gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
# n_head_kv is optional in config.json; it is only written when present.
if "n_head_kv" in hparams:
    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
# TOKENIZATION

print("gguf: get tokenizer metadata")

# Each entry is the raw byte sequence of one vocabulary token.
tokens: List[bytearray] = []
merges: List[str] = []

if Path(dir_model + "/tokenizer.json").is_file():
    # gpt2 tokenizer
    gguf_writer.add_tokenizer_model("gpt2")

    print("gguf: get gpt2 tokenizer merges")

    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
        tokenizer_json = json.load(f)
    merges = tokenizer_json["model"]["merges"]

    gguf_writer.add_token_merges(merges)

    print("gguf: get gpt2 tokenizer vocab")

    vocab_size = len(tokenizer_json["model"]["vocab"])

    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
    tokenizer = AutoTokenizer.from_pretrained(dir_model)

    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
    byte_encoder = bytes_to_unicode()
    # Inverse table: unicode character -> byte value.
    byte_decoder = {v: k for k, v in byte_encoder.items()}

    for i in range(vocab_size):
        if i in reverse_vocab:
            try:
                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
            except KeyError:
                # At least one character is not part of the byte table (e.g. a
                # special token). Decode character by character: mapped
                # characters become their byte, everything else is stored as
                # its UTF-8 encoding. byte_decoder is keyed by characters, so
                # it must be indexed with `c` itself, never with ord(c).
                text = bytearray()
                for c in reverse_vocab[i]:
                    if c in byte_decoder:
                        text.append(byte_decoder[c])
                    else:
                        text.extend(c.encode('utf-8'))
        else:
            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
            pad_token = f"[PAD{i}]".encode("utf8")
            text = bytearray(pad_token)

        tokens.append(text)

    gguf_writer.add_token_list(tokens)

    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
        print("gguf: get special token ids")

        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
            tokenizer_config = json.load(f)

        # find special token ids
        # tokenizer_config key -> name of the writer method that records its id.
        # The method is looked up only when a matching added token is found.
        special_token_setters = (
            ("bos_token", "add_bos_token_id"),
            ("eos_token", "add_eos_token_id"),
            ("unk_token", "add_unk_token_id"),
            ("sep_token", "add_sep_token_id"),
            ("pad_token", "add_pad_token_id"),
        )

        for config_key, setter_name in special_token_setters:
            if config_key not in tokenizer_config:
                continue
            for added_token in tokenizer_json["added_tokens"]:
                if added_token["content"] == tokenizer_config[config_key]:
                    getattr(gguf_writer, setter_name)(added_token["id"])
# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1

head_dim = hparams["hidden_size"] // n_head

# tensor info
print("gguf: get tensor metadata")

# A single-file checkpoint has no "pytorch_model-" shards (num_parts == 0).
if num_parts == 0:
    part_names = ("pytorch_model.bin",)
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

    for name in model_part.keys():
        data = model_part[name]

        # remembered only for the progress line printed below
        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        # QKV tensor transform
        # The original query_key_value tensor contains n_head_kv "kv groups",
        # each consisting of n_head/n_head_kv query weights followed by one key
        # and one value weight (shared by all query heads in the kv group).
        # This layout makes it a big pain to work with in GGML.
        # So we rearrange them here, so that we have n_head query weights
        # followed by n_head_kv key weights followed by n_head_kv value weights,
        # in contiguous fashion.
        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py

        if "query_key_value" in name:
            qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
            q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
            data = torch.cat((q,k,v)).reshape_as(data)

        data = data.squeeze().numpy()

        # map tensor names
        if name.endswith(".weight") and name[:-7] in tensor_map:
            name = tensor_map[name[:-7]] + ".weight"
        elif name.endswith(".bias") and name[:-5] in tensor_map:
            name = tensor_map[name[:-5]] + ".bias"
        else:
            print("Can not map tensor '" + name + "'")
            # Exit with a non-zero status so callers can tell the conversion failed.
            sys.exit(1)

        # Captured before any conversion below, so at most one branch applies.
        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(name, data)
# Emit the output file: each writer call below is announced with a progress
# line, and the writer is closed once all three have run.
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
# Final confirmation naming the file path that was passed to the writer.
print("gguf: model successfully exported to '" + fname_out + "'")
print("")

View file

@ -13,6 +13,8 @@ from pathlib import Path
from transformers import AutoTokenizer from transformers import AutoTokenizer
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode(): def bytes_to_unicode():
""" """
Returns list of utf-8 byte and a corresponding list of unicode strings. Returns list of utf-8 byte and a corresponding list of unicode strings.
@ -34,6 +36,7 @@ def bytes_to_unicode():
cs = [chr(n) for n in cs] cs = [chr(n) for n in cs]
return dict(zip(bs, cs)) return dict(zip(bs, cs))
def count_model_parts(dir_model: str) -> int: def count_model_parts(dir_model: str) -> int:
num_parts = 0 num_parts = 0
for filename in os.listdir(dir_model): for filename in os.listdir(dir_model):
@ -44,6 +47,7 @@ def count_model_parts(dir_model: str) -> int:
print("gguf: found " + str(num_parts) + " model parts") print("gguf: found " + str(num_parts) + " model parts")
return num_parts return num_parts
if len(sys.argv) < 3: if len(sys.argv) < 3:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n") print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
print(" ftype == 0 -> float32") print(" ftype == 0 -> float32")
@ -58,7 +62,7 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
# possible tensor data types # possible tensor data types
# ftype == 0 -> float32 # ftype == 0 -> float32
# ftype == 1 -> float16 # ftype == 1 -> float16
#
# map from ftype to string # map from ftype to string
ftype_str = ["f32", "f16"] ftype_str = ["f32", "f16"]
@ -67,6 +71,7 @@ if len(sys.argv) > 2:
ftype = int(sys.argv[2]) ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1: if ftype < 0 or ftype > 1:
print("Invalid ftype: " + str(ftype)) print("Invalid ftype: " + str(ftype))
sys.exit(1) sys.exit(1)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf" fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
@ -77,30 +82,29 @@ with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
hparams = json.load(f) hparams = json.load(f)
if hparams["architectures"][0] != "GPTNeoXForCausalLM": if hparams["architectures"][0] != "GPTNeoXForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0] ) print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit() sys.exit()
# get number of model parts # get number of model parts
num_parts = count_model_parts(dir_model) num_parts = count_model_parts(dir_model)
gguf_writer = gguf.GGUFWriter.open(fname_out) ARCH=gguf.MODEL_ARCH.GPTNEOX
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata") print("gguf: get model metadata")
llm_arch = "gptneox"
block_count = hparams["num_hidden_layers"] block_count = hparams["num_hidden_layers"]
gguf_writer.add_architecture(llm_arch)
gguf_writer.add_name(last_dir) gguf_writer.add_name(last_dir)
gguf_writer.add_file_type( "All tensors F32" if ftype == 0 else "Most tensors F16, some F32") gguf_writer.add_context_length(hparams["max_position_embeddings"])
gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"]) gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"]) gguf_writer.add_block_count(block_count)
gguf_writer.add_block_count(llm_arch, block_count) gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_feed_forward_length(llm_arch, hparams["intermediate_size"]) gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
gguf_writer.add_rope_dimension_count(llm_arch, int( hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])) ) gguf_writer.add_head_count(hparams["num_attention_heads"])
gguf_writer.add_head_count(llm_arch, hparams["num_attention_heads"]) gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
gguf_writer.add_parallel_residual(llm_arch, hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
gguf_writer.add_layer_norm_eps(llm_arch, hparams["layer_norm_eps"])
# TOKENIZATION # TOKENIZATION
@ -124,14 +128,14 @@ if Path(dir_model + "/tokenizer.json").is_file():
print("gguf: get gpt2 tokenizer vocab") print("gguf: get gpt2 tokenizer vocab")
vocab_size = len( tokenizer_json["model"]["vocab"] ) vocab_size = len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model) tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode() byte_encoder = bytes_to_unicode()
byte_decoder = {v:k for k, v in byte_encoder.items()} byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size): for i in range(vocab_size):
if i in reverse_vocab: if i in reverse_vocab:
@ -146,8 +150,9 @@ if Path(dir_model + "/tokenizer.json").is_file():
text.extend(c.encode('utf-8')) text.extend(c.encode('utf-8'))
else: else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.") print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
padding_token = f"[PAD{i}]".encode("utf8") pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(padding_token) text = bytearray(pad_token)
tokens.append(text) tokens.append(text)
gguf_writer.add_token_list(tokens) gguf_writer.add_token_list(tokens)
@ -188,7 +193,7 @@ if Path(dir_model + "/tokenizer.json").is_file():
# TENSORS # TENSORS
tensor_map = gguf.get_tensor_name_map(block_count) tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info # tensor info
print("gguf: get tensor metadata") print("gguf: get tensor metadata")
@ -201,7 +206,7 @@ else:
) )
for part_name in part_names: for part_name in part_names:
print("gguf: loading model part '"+ part_name + "'") print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys(): for name in model_part.keys():
@ -211,71 +216,8 @@ for part_name in part_names:
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
continue continue
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map:
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print( "Can not map tensor '" + name + "'" )
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
data_dtype = np.float32
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data.dtype == np.float16 and n_dims == 1:
data_dtype = np.float32
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data_dtype = np.float16
data_nbytes = data.size * 2 if data_dtype == np.float16 else data.size * 4
gguf_writer.add_tensor_info(name, data.shape, data_dtype, data_nbytes)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensor metadata")
gguf_writer.write_ti_data_to_file()
# tensor data
print("gguf: convert and write tensor data")
if num_parts == 0:
part_names = ("pytorch_model.bin",)
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
print("gguf: loading model part '"+ part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype old_dtype = data.dtype
# we don't need these
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
continue
# convert any unsupported data types to float32 # convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32: if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32) data = data.to(torch.float32)
@ -288,14 +230,14 @@ for part_name in part_names:
elif name.endswith(".bias") and name[:-5] in tensor_map: elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias" name = tensor_map[name[:-5]] + ".bias"
else: else:
print( "Can not map tensor '" + name + "'" ) print("Can not map tensor '" + name + "'")
sys.exit() sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
# if f32 desired, convert any float16 to float32 # if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16: if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32) data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
@ -306,12 +248,19 @@ for part_name in part_names:
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print( name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype)) print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.write_tensor_to_file(data) gguf_writer.add_tensor(name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close() gguf_writer.close()
print("gguf: model successfully exported to '" + fname_out + "'")
print("gguf: model successfully exported to '" + fname_out + "'" )
print("") print("")

View file

@ -18,6 +18,7 @@ from sentencepiece import SentencePieceProcessor
# compatible with python < 3.9 # compatible with python < 3.9
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]' NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
def count_model_parts(dir_model: str) -> int: def count_model_parts(dir_model: str) -> int:
num_parts = 0 num_parts = 0
for filename in os.listdir(dir_model): for filename in os.listdir(dir_model):
@ -28,10 +29,12 @@ def count_model_parts(dir_model: str) -> int:
print("gguf: found " + str(num_parts) + " model parts") print("gguf: found " + str(num_parts) + " model parts")
return num_parts return num_parts
if len(sys.argv) < 3: if len(sys.argv) < 3:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n") print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
print(" ftype == 0 -> float32") print(" ftype == 0 -> float32")
print(" ftype == 1 -> float16") print(" ftype == 1 -> float16")
sys.exit(1) sys.exit(1)
@ -43,7 +46,7 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
# possible tensor data types # possible tensor data types
# ftype == 0 -> float32 # ftype == 0 -> float32
# ftype == 1 -> float16 # ftype == 1 -> float16
#
# map from ftype to string # map from ftype to string
ftype_str = ["f32", "f16"] ftype_str = ["f32", "f16"]
@ -52,6 +55,7 @@ if len(sys.argv) > 2:
ftype = int(sys.argv[2]) ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1: if ftype < 0 or ftype > 1:
print("Invalid ftype: " + str(ftype)) print("Invalid ftype: " + str(ftype))
sys.exit(1) sys.exit(1)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf" fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
@ -70,14 +74,15 @@ num_parts = count_model_parts(dir_model)
if num_parts > 1: if num_parts > 1:
print("gguf: Only models with a single datafile are supported.") print("gguf: Only models with a single datafile are supported.")
sys.exit() sys.exit()
gguf_writer = gguf.GGUFWriter.open(fname_out) ARCH=gguf.MODEL_ARCH.LLAMA
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata") print("gguf: get model metadata")
llm_arch = "llama"
block_count = hparams["num_hidden_layers"] block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"] head_count = hparams["num_attention_heads"]
@ -89,21 +94,19 @@ else:
if "_name_or_path" in hparams: if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"] hf_repo = hparams["_name_or_path"]
else: else:
hf_repo="" hf_repo = ""
gguf_writer.add_architecture(llm_arch)
gguf_writer.add_name(last_dir) gguf_writer.add_name(last_dir)
gguf_writer.add_file_type( "All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
gguf_writer.add_source_hf_repo(hf_repo) gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout(llm_arch, "Meta AI original pth") gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"]) gguf_writer.add_context_length(hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"]) gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(llm_arch, block_count) gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(llm_arch, hparams["intermediate_size"]) gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(llm_arch, hparams["hidden_size"] // hparams["num_attention_heads"]) gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(llm_arch, head_count) gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(llm_arch, head_count_kv) gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(llm_arch, hparams["rms_norm_eps"]) gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
# TOKENIZATION # TOKENIZATION
@ -129,15 +132,19 @@ if Path(dir_model + "/tokenizer.model").is_file():
score = tokenizer.get_score(i) score = tokenizer.get_score(i)
toktype = 1 # defualt to normal token type toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i): toktype = 2 if tokenizer.is_unknown(i):
if tokenizer.is_control(i): toktype = 3 toktype = 2
if tokenizer.is_control(i):
toktype = 3
# TODO: How to determinate if a token is user defined? # TODO: How to determinate if a token is user defined?
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = 4 # if tokenizer.is_user_defined(i): toktype = 4
if tokenizer.is_unused(i): toktype = 5 if tokenizer.is_unused(i):
if tokenizer.is_byte(i): toktype = 6 toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text) tokens.append(text)
scores.append(score) scores.append(score)
@ -188,15 +195,15 @@ if Path(dir_model + "/tokenizer.json").is_file():
# TENSORS # TENSORS
tensor_map = gguf.get_tensor_name_map(block_count) tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info # tensor info
print("gguf: get tensor metadata") print("gguf: get tensor metadata")
part_names = ( f"consolidated.{n:02}.pth" for n in range(0, num_parts) ) part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
for part_name in part_names: for part_name in part_names:
print("gguf: loading model part '"+ part_name + "'") print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys(): for name in model_part.keys():
@ -206,66 +213,8 @@ for part_name in part_names:
if name == "rope.freqs": if name == "rope.freqs":
continue continue
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map:
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print( "Can not map tensor '" + name + "'" )
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
data_dtype = np.float32
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data_dtype = np.float32
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data_dtype = np.float16
data_nbytes = data.size * 2 if data_dtype == np.float16 else data.size * 4
gguf_writer.add_tensor_info(name, data.shape, data_dtype, data_nbytes)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensor metadata")
gguf_writer.write_ti_data_to_file()
# tensor data
print("gguf: convert and write tensor data")
part_names = ( f"consolidated.{n:02}.pth" for n in range(0, num_parts) )
for part_name in part_names:
print("gguf: loading model part '"+ part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype old_dtype = data.dtype
# we don't need these
if name == "rope.freqs":
continue
# convert any unsupported data types to float32 # convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32: if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32) data = data.to(torch.float32)
@ -278,14 +227,14 @@ for part_name in part_names:
elif name.endswith(".bias") and name[:-5] in tensor_map: elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias" name = tensor_map[name[:-5]] + ".bias"
else: else:
print( "Can not map tensor '" + name + "'" ) print("Can not map tensor '" + name + "'")
sys.exit() sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
# if f32 desired, convert any float16 to float32 # if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16: if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32) data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
@ -296,9 +245,17 @@ for part_name in part_names:
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print( name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype)) print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.write_tensor_to_file(data) gguf_writer.add_tensor(name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close() gguf_writer.close()

View file

@ -18,26 +18,35 @@ NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
# reverse HF permute back to original pth layout # reverse HF permute back to original pth layout
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray: def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2) .swapaxes(1, 2)
.reshape(weights.shape)) .reshape(weights.shape))
def count_model_parts(dir_model: str) -> int: def count_model_parts(dir_model: str) -> int:
num_parts = 0 num_parts = 0
for filename in os.listdir(dir_model): for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"): if filename.startswith("pytorch_model-"):
num_parts += 1 num_parts += 1
if num_parts > 0: if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts") print("gguf: found " + str(num_parts) + " model parts")
return num_parts return num_parts
if len(sys.argv) < 3: if len(sys.argv) < 3:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n") print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
print(" ftype == 0 -> float32") print(" ftype == 0 -> float32")
print(" ftype == 1 -> float16") print(" ftype == 1 -> float16")
sys.exit(1) sys.exit(1)
@ -49,7 +58,8 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
# possible tensor data types # possible tensor data types
# ftype == 0 -> float32 # ftype == 0 -> float32
# ftype == 1 -> float16 # ftype == 1 -> float16
#
# map from ftype to string # map from ftype to string
ftype_str = ["f32", "f16"] ftype_str = ["f32", "f16"]
@ -58,6 +68,7 @@ if len(sys.argv) > 2:
ftype = int(sys.argv[2]) ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1: if ftype < 0 or ftype > 1:
print("Invalid ftype: " + str(ftype)) print("Invalid ftype: " + str(ftype))
sys.exit(1) sys.exit(1)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf" fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
@ -69,17 +80,17 @@ with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
if hparams["architectures"][0] != "LlamaForCausalLM": if hparams["architectures"][0] != "LlamaForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0]) print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit() sys.exit()
# get number of model parts # get number of model parts
num_parts = count_model_parts(dir_model) num_parts = count_model_parts(dir_model)
gguf_writer = gguf.GGUFWriter.open(fname_out) ARCH=gguf.MODEL_ARCH.LLAMA
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata") print("gguf: get model metadata")
llm_arch = "llama"
block_count = hparams["num_hidden_layers"] block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"] head_count = hparams["num_attention_heads"]
@ -91,7 +102,7 @@ else:
if "_name_or_path" in hparams: if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"] hf_repo = hparams["_name_or_path"]
else: else:
hf_repo="" hf_repo = ""
if "max_sequence_length" in hparams: if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"] ctx_length = hparams["max_sequence_length"]
@ -99,22 +110,21 @@ elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"] ctx_length = hparams["max_position_embeddings"]
else: else:
print("gguf: can not find ctx length parameter.") print("gguf: can not find ctx length parameter.")
sys.exit() sys.exit()
gguf_writer.add_architecture(llm_arch)
gguf_writer.add_name(last_dir) gguf_writer.add_name(last_dir)
gguf_writer.add_file_type("All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
gguf_writer.add_source_hf_repo(hf_repo) gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout(llm_arch, "Meta AI original pth") gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(llm_arch, ctx_length) gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"]) gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(llm_arch, block_count) gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(llm_arch, hparams["intermediate_size"]) gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(llm_arch, hparams["hidden_size"] // hparams["num_attention_heads"]) gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(llm_arch, head_count) gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(llm_arch, head_count_kv) gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(llm_arch, hparams["rms_norm_eps"]) gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
# TOKENIZATION # TOKENIZATION
@ -140,15 +150,19 @@ if Path(dir_model + "/tokenizer.model").is_file():
score = tokenizer.get_score(i) score = tokenizer.get_score(i)
toktype = 1 # defualt to normal token type toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i): toktype = 2 if tokenizer.is_unknown(i):
if tokenizer.is_control(i): toktype = 3 toktype = 2
if tokenizer.is_control(i):
toktype = 3
# TODO: How to determinate if a token is user defined? # TODO: How to determinate if a token is user defined?
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = 4 # if tokenizer.is_user_defined(i): toktype = 4
if tokenizer.is_unused(i): toktype = 5 if tokenizer.is_unused(i):
if tokenizer.is_byte(i): toktype = 6 toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text) tokens.append(text)
scores.append(score) scores.append(score)
@ -199,7 +213,7 @@ if Path(dir_model + "/tokenizer.json").is_file():
# TENSORS # TENSORS
tensor_map = gguf.get_tensor_name_map(block_count) tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info # tensor info
print("gguf: get tensor metadata") print("gguf: get tensor metadata")
@ -212,7 +226,7 @@ else:
) )
for part_name in part_names: for part_name in part_names:
print("gguf: loading model part '"+ part_name + "'") print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys(): for name in model_part.keys():
@ -222,75 +236,8 @@ for part_name in part_names:
if name.endswith(".rotary_emb.inv_freq"): if name.endswith(".rotary_emb.inv_freq"):
continue continue
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# reverse permute these
if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
data = reverse_hf_permute(data, head_count, head_count_kv)
# map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map:
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print( "Can not map tensor '" + name + "'" )
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
data_dtype = np.float32
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data_dtype = np.float32
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data_dtype = np.float16
data_nbytes = data.size * 2 if data_dtype == np.float16 else data.size * 4
gguf_writer.add_tensor_info(name, data.shape, data_dtype, data_nbytes)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensor metadata")
gguf_writer.write_ti_data_to_file()
# tensor data
print("gguf: convert and write tensor data")
if num_parts == 0:
part_names = ("pytorch_model.bin",)
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
print("gguf: loading model part '"+ part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype old_dtype = data.dtype
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
# convert any unsupported data types to float32 # convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32: if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32) data = data.to(torch.float32)
@ -307,14 +254,14 @@ for part_name in part_names:
elif name.endswith(".bias") and name[:-5] in tensor_map: elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias" name = tensor_map[name[:-5]] + ".bias"
else: else:
print( "Can not map tensor '" + name + "'" ) print("Can not map tensor '" + name + "'")
sys.exit() sys.exit()
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
# if f32 desired, convert any float16 to float32 # if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16: if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32) data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
@ -325,9 +272,17 @@ for part_name in part_names:
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16) data = data.astype(np.float16)
print(name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype)) print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.write_tensor_to_file(data) gguf_writer.add_tensor(name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close() gguf_writer.close()

View file

@ -1,972 +0,0 @@
#!/usr/bin/env python
import argparse
import concurrent.futures
import copy
import enum
import faulthandler
import functools
import io
import itertools
import json
import math
import mmap
import pickle
import re
import signal
import struct
import sys
import zipfile
import numpy as np
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union)
from sentencepiece import SentencePieceProcessor # type: ignore
if TYPE_CHECKING:
from typing_extensions import TypeAlias
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
faulthandler.register(signal.SIGUSR1)
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@dataclass(frozen=True)
class UnquantizedDataType:
    """Tag for a non-quantized element type, identified only by its name.

    The dataclass is frozen, which makes instances hashable so they can be
    used as dictionary keys in the lookup tables of this module.
    """
    # Short label of the type, e.g. 'F16' or 'F32'.
    name: str
# The element types this script can represent.
DT_F16 = UnquantizedDataType('F16')
DT_F32 = UnquantizedDataType('F32')
DT_I32 = UnquantizedDataType('I32')
DT_BF16 = UnquantizedDataType('BF16')
# Only unquantized types exist here, so the union has a single member.
DataType = Union[UnquantizedDataType]
# Data type -> numeric file-type id (only F32 and F16 have one).
DATA_TYPE_TO_FTYPE: Dict[DataType, int] = {
    DT_F32: 0,
    DT_F16: 1,
}
# Inverse of DATA_TYPE_TO_FTYPE: numeric file-type id -> data type.
FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \
    {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}
# Data type -> numpy dtype used to hold it in memory.
# BF16 is kept as raw uint16 words; see bf16_to_fp32 for the widening step.
DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
    DT_BF16: np.dtype(np.uint16),
    DT_F16: np.dtype(np.float16),
    DT_F32: np.dtype(np.float32),
    DT_I32: np.dtype(np.int32),
}
# Inverse of DATA_TYPE_TO_NUMPY; note that a uint16 array therefore maps to BF16.
NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
    {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
class GGMLFileType(enum.Enum):
    """Overall precision layout requested for the output file."""
    AllF32 = 0
    MostlyF16 = 1  # except 1d tensors

    def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
        """Return the data type one tensor should be stored as.

        `name` is accepted for interface symmetry but not consulted; only
        the tensor's rank and this file type decide the result.
        Raises ValueError for a file type that is neither AllF32 nor MostlyF16.
        """
        # 1D tensors are always F32, whatever the file type says.
        if len(tensor.shape) == 1 or self == GGMLFileType.AllF32:
            return DT_F32
        if self == GGMLFileType.MostlyF16:
            return DT_F16
        raise ValueError(self)
# TODO: this is LLaMA specific
def make_tensors_list(n_layer: int = 80) -> List[str]:
    """Build the ordered list of tensor names in the original LLaMA layout.

    Args:
        n_layer: number of transformer layers to enumerate. Defaults to 80,
            the maximum layer count this script previously hard-coded.

    Returns:
        The three global tensor names followed by nine names per layer,
        layer 0 first.
    """
    per_layer_suffixes = (
        'attention.wq.weight',
        'attention.wk.weight',
        'attention.wv.weight',
        'attention.wo.weight',
        'attention_norm.weight',
        'feed_forward.w1.weight',
        'feed_forward.w2.weight',
        'feed_forward.w3.weight',
        'ffn_norm.weight',
    )
    ret = [
        'tok_embeddings.weight',
        'norm.weight',
        'output.weight',
    ]
    for i in range(n_layer):
        ret.extend(f'layers.{i}.{suffix}' for suffix in per_layer_suffixes)
    return ret
# TODO: this should be generalized for non-LLaMA models
# Ordered tensor names produced by make_tensors_list(), computed once at import.
TENSORS_LIST = make_tensors_list()
# Same names as a set, for membership tests.
TENSORS_SET = set(TENSORS_LIST)
def find_n_mult(n_ff: int, n_embd: int) -> int:
    """Recover the rounding multiple that yields `n_ff` from `n_embd`.

    Candidates are tried from 8192 down to 2; the first one for which
    (8 * n_embd // 3) rounded up to a multiple of the candidate equals
    `n_ff` is returned. Raises Exception when no candidate matches.
    """
    base = (8 * n_embd) // 3
    # hardcoded magic range
    candidate = 8192
    while candidate > 1:
        rounded_up = ((base + candidate - 1) // candidate) * candidate
        if rounded_up == n_ff:
            return candidate
        candidate -= 1
    raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
@dataclass
class Params:
    """Model hyper-parameters, read from a config file or guessed from tensors."""
    n_vocab: int
    n_embd: int
    n_mult: int
    n_head: int
    n_layer: int
    n_kv_head: Optional[int]  # This parameter is only used for Llama 2

    @staticmethod
    def guessed(model: 'LazyModel') -> 'Params':
        """Derive parameters from tensor names and shapes when no config file exists.

        Raises Exception when not even one layer can be found.
        """
        # try transformer naming first
        if "model.embed_tokens.weight" in model:
            n_vocab, n_embd = model["model.embed_tokens.weight"].shape
        else:
            n_vocab, n_embd = model["tok_embeddings.weight"].shape

        # The layer count is the first index whose attention tensor is absent.
        # try transformer naming first
        if "model.layers.0.self_attn.q_proj.weight" in model:
            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
        elif "model.layers.0.self_attn.W_pack.weight" in model:  # next: try baichuan naming
            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
        else:
            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

        if n_layer < 1:
            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

        n_head = n_embd // 128  # guessed

        return Params(
            n_vocab=n_vocab,
            n_embd=n_embd,
            n_mult=256,
            n_head=n_head,
            n_layer=n_layer,
            n_kv_head=None,
        )

    @staticmethod
    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
        """Read parameters from a Hugging Face `config.json`.

        `model` is unused here; it is kept so all loaders share one signature.
        """
        # Close the file deterministically instead of leaking the handle.
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)

        n_vocab = config["vocab_size"]
        n_embd = config["hidden_size"]
        n_head = config["num_attention_heads"]
        n_layer = config["num_hidden_layers"]
        n_ff = config["intermediate_size"]
        n_kv_head = config.get("num_key_value_heads")

        n_mult = find_n_mult(n_ff, n_embd)

        return Params(
            n_vocab=n_vocab,
            n_embd=n_embd,
            n_mult=n_mult,
            n_head=n_head,
            n_layer=n_layer,
            n_kv_head=n_kv_head,
        )

    # LLaMA v2 70B params.json
    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
    @staticmethod
    def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
        """Read parameters from an original-format `params.json`.

        A `vocab_size` of -1 means "not recorded"; the size is then taken
        from the first dimension of `tok_embeddings.weight`.
        """
        # Close the file deterministically instead of leaking the handle.
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)

        n_vocab = config["vocab_size"]
        n_embd = config["dim"]
        n_head = config["n_heads"]
        n_layer = config["n_layers"]
        n_mult = config["multiple_of"]

        if n_vocab == -1:
            n_vocab = model["tok_embeddings.weight"].shape[0]

        return Params(
            n_vocab=n_vocab,
            n_embd=n_embd,
            n_mult=n_mult,
            n_head=n_head,
            n_layer=n_layer,
            n_kv_head=None,
        )

    @staticmethod
    def load(model_plus: 'ModelPlus') -> 'Params':
        """Load parameters for a model, preferring config files next to it.

        Order of preference: `config.json`, then `params.json`, then
        guessing from the tensors. The result is also printed.
        """
        model_dir = model_plus.paths[0].parent
        hf_config_path = model_dir / "config.json"
        orig_config_path = model_dir / "params.json"

        if hf_config_path.exists():
            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
        elif orig_config_path.exists():
            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
        else:
            params = Params.guessed(model_plus.model)

        print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
        return params
class BpeVocab:
    """Vocabulary read from a BPE tokenizer JSON file plus optional added tokens."""

    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
        """Load the tokenizer and, if given, the added-tokens file.

        Raises Exception when the added token IDs are not exactly the
        consecutive range that starts right after the base vocabulary.
        """
        # Context managers close the files; the bare open() calls leaked them.
        with open(str(fname_tokenizer), encoding="utf-8") as f:
            self.bpe_tokenizer = json.loads(f.read())
        added_tokens: Dict[str, int]
        if fname_added_tokens is not None:
            with open(fname_added_tokens, encoding="utf-8") as f:
                added_tokens = json.load(f)
        else:
            added_tokens = {}

        vocab_size: int = len(self.bpe_tokenizer)
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            # The expected first ID is the base vocabulary size, not the number of added tokens.
            raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_list = [text for (text, idx) in items]
        self.vocab_size_base: int = vocab_size
        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens

    def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
        """Yield (utf-8 text, score) per base token; the score is the negated index."""
        # The former transformers import only built two byte tables that were
        # never read, so it is dropped along with that hard dependency.
        for i, item in enumerate(self.bpe_tokenizer):
            text: bytes = item.encode("utf-8")
            score: float = -i
            yield text, score

    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
        """Yield (utf-8 text, score) per added token, in ID order, with a fixed score."""
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score

    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
        """Yield the base tokens followed by the added tokens."""
        yield from self.bpe_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab:
    """Vocabulary read from a SentencePiece model plus optional added tokens."""

    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
        """Load the SentencePiece model and, if given, the added-tokens file.

        Raises Exception when the added token IDs are not exactly the
        consecutive range that starts right after the base vocabulary.
        """
        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
        added_tokens: Dict[str, int]
        if fname_added_tokens is not None:
            # Context manager closes the file; the bare open() call leaked it.
            with open(fname_added_tokens, encoding="utf-8") as f:
                added_tokens = json.load(f)
        else:
            added_tokens = {}

        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            # The expected first ID is the base vocabulary size, not the number of added tokens.
            raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_list = [text for (text, idx) in items]
        self.vocab_size_base: int = vocab_size
        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens

    def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
        """Yield (utf-8 piece, score) for every ID of the SentencePiece model."""
        tokenizer = self.sentencepiece_tokenizer
        for i in range(tokenizer.vocab_size()):
            piece = tokenizer.id_to_piece(i)
            text: bytes = piece.encode("utf-8")
            score: float = tokenizer.get_score(i)
            yield text, score

    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
        """Yield (utf-8 text, score) per added token, in ID order, with a fixed score."""
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score

    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
        """Yield the SentencePiece tokens followed by the added tokens."""
        yield from self.sentencepiece_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
# Either of the two vocabulary implementations defined in this file.
Vocab = Union[BpeVocab, SentencePieceVocab]
def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
    """Reorder the rows of `weights` within each head and return the same shape.

    Each head's rows are viewed as two halves and the (half, row) axes are
    swapped, which interleaves the two halves. When `n_kv_head` is given and
    differs from `n_head`, the head count used is `n_head // n_kv_head`.
    """
    if not (n_kv_head is None or n_head == n_kv_head):
        n_head //= n_kv_head
    original_shape = weights.shape
    rows_per_half = original_shape[0] // n_head // 2
    grouped = weights.reshape(n_head, 2, rows_per_half, *original_shape[1:])
    return grouped.swapaxes(1, 2).reshape(original_shape)
class Tensor(metaclass=ABCMeta):
    """Abstract interface shared by the tensor wrappers in this file."""
    # Element type of the wrapped data.
    data_type: DataType

    # Return a tensor converted to `data_type`.
    @abstractmethod
    def astype(self, data_type: DataType) -> 'Tensor': ...

    # Return a tensor with its rows reordered per head (see the module-level permute()).
    @abstractmethod
    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...

    # Return part `n_part` of the tensor, permuted per head.
    @abstractmethod
    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...

    # Return part `n_part` of the tensor, unpermuted.
    @abstractmethod
    def part(self, n_part: int) -> 'UnquantizedTensor': ...

    # Return a representation that can be written out directly.
    @abstractmethod
    def to_ggml(self) -> 'GGMLCompatibleTensor': ...
def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
    """Widen raw bfloat16 bit patterns (held as uint16) to float32.

    Each 16-bit word becomes the upper half of a 32-bit word, which is then
    reinterpreted as float32. Asserts that the input dtype is uint16.
    """
    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
    widened = bf16_arr.astype(np.uint32)
    return np.left_shift(widened, 16).view(np.float32)
class UnquantizedTensor(Tensor):
    """Tensor backed directly by a numpy array of one of the known dtypes."""

    def __init__(self, ndarray: NDArray) -> None:
        """Wrap `ndarray`; its dtype must be a key of NUMPY_TYPE_TO_DATA_TYPE."""
        assert isinstance(ndarray, np.ndarray)
        self.ndarray = ndarray
        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]

    def astype(self, data_type: DataType) -> Tensor:
        """Return a new tensor converted to `data_type`; this object is left unchanged."""
        dtype = DATA_TYPE_TO_NUMPY[data_type]
        source = self.ndarray
        if self.data_type == DT_BF16:
            # Widen into a local only. Overwriting self.ndarray would leave
            # data_type saying BF16 while the array holds float32, and a
            # second astype() would then fail the uint16 assertion.
            source = bf16_to_fp32(source)
        return UnquantizedTensor(source.astype(dtype))

    def to_ggml(self) -> 'UnquantizedTensor':
        """An unquantized tensor is already in a writable form."""
        return self

    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
        """Return the `n_part`-th third of the rows, permuted per head."""
        r = self.ndarray.shape[0] // 3
        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))

    def part(self, n_part: int) -> 'UnquantizedTensor':
        """Return the `n_part`-th third of the rows, unpermuted."""
        r = self.ndarray.shape[0] // 3
        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])

    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
        """Return a new tensor with rows reordered by the module-level permute()."""
        return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))
def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
    """Materialize `lazy_tensor` and return its numpy array.

    Asserts that the loaded tensor is unquantized and that its shape matches
    the shape the lazy tensor advertised. If `expected_dtype` is given and
    differs from the actual dtype, the array is converted when `convert` is
    true and ValueError is raised otherwise.
    """
    tensor = lazy_tensor.load()
    assert isinstance(tensor, UnquantizedTensor)

    # double-check:
    actual_shape = list(tensor.ndarray.shape)
    assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)

    dtype_mismatch = expected_dtype is not None and expected_dtype != tensor.ndarray.dtype
    if dtype_mismatch and not convert:
        raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
    if dtype_mismatch:
        tensor.ndarray = tensor.ndarray.astype(expected_dtype)

    return tensor.ndarray
# Tensor kinds that can be written out as-is; only the unquantized one exists here.
GGMLCompatibleTensor = Union[UnquantizedTensor]
class DeferredPermutedTensor(Tensor):
    """Wrapper that applies a per-head permutation only when the data is needed."""

    def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
        """Remember `base` and the head counts to permute with later."""
        self.base = base
        self.n_head = n_head
        # Must be stored: astype() and to_ggml() read it, and previously
        # failed with AttributeError because it was never assigned.
        self.n_kv_head = n_kv_head
        self.data_type = self.base.data_type

    def astype(self, data_type: DataType) -> Tensor:
        """Convert the base tensor, then apply the deferred permutation."""
        return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)

    def to_ggml(self) -> GGMLCompatibleTensor:
        """Make the base tensor writable, then apply the deferred permutation."""
        return self.base.to_ggml().permute(self.n_head, self.n_kv_head)

    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
        """Always raises: a deferred permutation must not be permuted again."""
        raise Exception("shouldn't permute twice")

    # Tensor declares permute_part() and part() abstract; without overrides
    # this class could not be instantiated at all.
    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
        """Not supported on a deferred permutation; always raises."""
        raise NotImplementedError("permute_part is not supported on a deferred permuted tensor")

    def part(self, n_part: int) -> 'UnquantizedTensor':
        """Not supported on a deferred permutation; always raises."""
        raise NotImplementedError("part is not supported on a deferred permuted tensor")
@dataclass
class LazyTensor:
    """A tensor described up front (shape, type) whose data is loaded on demand."""
    _load: Callable[[], Tensor]
    shape: List[int]
    data_type: DataType
    description: str

    def load(self) -> Tensor:
        """Run the loader and assert the result has the advertised data type."""
        loaded = self._load()
        assert loaded.data_type == self.data_type, (self.data_type, loaded.data_type, self.description)
        return loaded

    def astype(self, data_type: DataType) -> 'LazyTensor':
        """Return a new lazy tensor that converts to `data_type` when loaded."""
        self.validate_conversion_to(data_type)
        source = self

        def converted() -> Tensor:
            return source.load().astype(data_type)

        label = f'convert({data_type}) {self.description}'
        return LazyTensor(converted, self.shape, data_type, label)

    def validate_conversion_to(self, data_type: DataType) -> None:
        """Hook for rejecting conversions; at present every conversion is accepted."""
        if data_type == self.data_type:
            return
# A model is a mapping from tensor name to its lazily loaded tensor.
LazyModel = Dict[str, LazyTensor]
@dataclass
class ModelPlus:
    """A lazily loaded model together with where it came from."""
    # Tensor name -> lazy tensor.
    model: LazyModel
    paths: List[Path]  # Where this was read from.
    # On-disk format the model was read from.
    format: Literal['ggml', 'torch', 'safetensors']
    vocab: Optional[Vocab]  # For GGML models (which have vocab built in), the vocab.
def merge_sharded(models: List[LazyModel]) -> LazyModel:
    """Combine shards that each hold one slice of every tensor into one model.

    1-D tensors are taken from the first shard; all others are concatenated
    lazily, along axis 1 for a fixed set of tensor names and axis 0 otherwise.
    """
    # Original LLaMA models have each file contain one part of each tensor.
    # dict.fromkeys de-duplicates while preserving first-seen order (a set would not).
    names = dict.fromkeys(name for model in models for name in model)

    def convert(name: str) -> LazyTensor:
        shards: List[LazyTensor] = [model[name] for model in models]
        first = shards[0]
        # One file: pass through untouched, since there might be quantized
        # tensors. 1-D: the tensor is just duplicated in every file.
        if len(shards) == 1 or len(first.shape) == 1:
            return first

        split_by_columns = (name.startswith('tok_embeddings.')
                            or name.endswith('.attention.wo.weight')
                            or name.endswith('.feed_forward.w2.weight'))
        axis = 1 if split_by_columns else 0

        concatenated_shape = list(first.shape)
        concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in shards)

        def load() -> UnquantizedTensor:
            ndarrays = [load_unquantized(tensor) for tensor in shards]
            concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
            return UnquantizedTensor(concatenated)

        description = 'concatenated[[' + '] | ['.join(lt.description for lt in shards) + ']]'
        return LazyTensor(load, concatenated_shape, first.data_type, description)

    return {name: convert(name) for name in names}
def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
    """Merge several loaded files of the same format into one ModelPlus.

    Asserts that all inputs share one format. Files that contain
    `model.embed_tokens.weight` are treated as holding whole tensors and are
    unioned; otherwise the files are treated as shards and concatenated.
    """
    formats = {mp.format for mp in models_plus}
    assert len(formats) == 1, "different formats?"
    format = formats.pop()

    paths = [path for mp in models_plus for path in mp.paths]

    # Use the first non-None vocab, if any.
    vocab = next((mp.vocab for mp in models_plus if mp.vocab is not None), None)

    holds_whole_tensors = any("model.embed_tokens.weight" in mp.model for mp in models_plus)
    if holds_whole_tensors:
        # Transformers models put different tensors in different files, but
        # don't split individual tensors between files.
        model: LazyModel = {}
        for mp in models_plus:
            model.update(mp.model)
    else:
        model = merge_sharded([mp.model for mp in models_plus])

    return ModelPlus(model, paths, format, vocab)
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
    """Return a lazy tensor that permutes `lazy_tensor` per head when loaded."""
    def load() -> Tensor:
        loaded = lazy_tensor.load()
        return loaded.permute(n_head, n_kv_head)

    description = f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description
    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, description)
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
    """Return a lazy tensor for one third of `lazy_tensor`'s rows, permuted per head.

    The advertised shape is the source shape with its first dimension
    divided by three.
    """
    def load() -> Tensor:
        loaded = lazy_tensor.load()
        return loaded.permute_part(n_part, n_head)

    part_shape = lazy_tensor.shape.copy()
    part_shape[0] = part_shape[0] // 3
    description = f'permute({n_head}) ' + lazy_tensor.description
    return LazyTensor(load, part_shape, lazy_tensor.data_type, description)
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
    """Return a lazy tensor for one third of `lazy_tensor`'s rows, unpermuted.

    The advertised shape is the source shape with its first dimension
    divided by three.
    """
    def load() -> Tensor:
        loaded = lazy_tensor.load()
        return loaded.part(n_part)

    part_shape = lazy_tensor.shape.copy()
    part_shape[0] = part_shape[0] // 3
    description = 'part ' + lazy_tensor.description
    return LazyTensor(load, part_shape, lazy_tensor.data_type, description)
def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
    """Rename Hugging Face style tensors to the original LLaMA names.

    Layers are processed in index order until one has neither a `q_proj`
    nor a `W_pack` attention tensor. Query and key weights are wrapped in a
    lazy per-head permutation; a packed `W_pack` tensor is split into three
    parts for query, key and value.
    """
    out: LazyModel = {
        "tok_embeddings.weight": model["model.embed_tokens.weight"],
        "norm.weight": model["model.norm.weight"],
        "output.weight": model["lm_head.weight"],
    }

    i = 0
    while True:
        src = f"model.layers.{i}"
        dst = f"layers.{i}"

        if f"{src}.self_attn.q_proj.weight" in model:
            out[f"{dst}.attention.wq.weight"] = permute_lazy(model[f"{src}.self_attn.q_proj.weight"], params.n_head)
            out[f"{dst}.attention.wk.weight"] = permute_lazy(model[f"{src}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
            out[f"{dst}.attention.wv.weight"] = model[f"{src}.self_attn.v_proj.weight"]
        elif f"{src}.self_attn.W_pack.weight" in model:
            packed = model[f"{src}.self_attn.W_pack.weight"]
            out[f"{dst}.attention.wq.weight"] = permute_part_lazy(packed, 0, params.n_head)
            out[f"{dst}.attention.wk.weight"] = permute_part_lazy(packed, 1, params.n_head)
            out[f"{dst}.attention.wv.weight"] = part_lazy(packed, 2)
        else:
            break

        out[f"{dst}.attention.wo.weight"] = model[f"{src}.self_attn.o_proj.weight"]

        out[f"{dst}.feed_forward.w1.weight"] = model[f"{src}.mlp.gate_proj.weight"]
        out[f"{dst}.feed_forward.w2.weight"] = model[f"{src}.mlp.down_proj.weight"]
        out[f"{dst}.feed_forward.w3.weight"] = model[f"{src}.mlp.up_proj.weight"]

        out[f"{dst}.attention_norm.weight"] = model[f"{src}.input_layernorm.weight"]
        out[f"{dst}.ffn_norm.weight"] = model[f"{src}.post_attention_layernorm.weight"]

        i += 1
    return out
# Functionality that simulates `torch.load` but where individual tensors are
# only loaded into memory on demand, not all at once.
# PyTorch can't do this natively as of time of writing:
# - https://github.com/pytorch/pytorch/issues/64327
# This allows us to de-shard without multiplying RAM usage, and also
# conveniently drops the PyTorch dependency (though we still need numpy).
@dataclass
class LazyStorageKind:
    """Stand-in for a torch storage class; records only the element type."""
    # Element type of the storage this kind represents.
    data_type: DataType
@dataclass
class LazyStorage:
    """A storage entry whose bytes are read only when `load` is called."""
    # Called as load(offset, elm_count); both are counted in elements, not bytes.
    load: Callable[[int, int], NDArray]
    # Kind (and through it the element type) of this storage.
    kind: LazyStorageKind
    # Human-readable origin of the storage, used in tensor descriptions.
    description: str
class LazyUnpickler(pickle.Unpickler):
    """Unpickler that builds lazy tensors instead of loading torch tensors.

    Names from modules starting with 'torch' are resolved through CLASSES,
    and persistent IDs become LazyStorage objects that read from the zip
    archive only when asked.
    """

    def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
        """Unpickle from `fp`; storage files are looked up in `zip_file` under `data_base_path`."""
        super().__init__(fp)
        self.data_base_path = data_base_path
        self.zip_file = zip_file

    def persistent_load(self, pid: Any) -> Any:
        """Turn a persistent 'storage' ID into a LazyStorage.

        `pid[1]` must be a LazyStorageKind and `pid[2]` names the storage
        file inside the archive.
        """
        assert pid[0] == 'storage'
        assert isinstance(pid[1], LazyStorageKind)
        data_type = pid[1].data_type
        filename_stem = pid[2]
        filename = self.data_base_path + '/' + filename_stem
        # Look the entry up now so a missing file fails at unpickling time.
        info = self.zip_file.getinfo(filename)

        def load(offset: int, elm_count: int) -> NDArray:
            # offset and elm_count are in elements; scale by itemsize for bytes.
            dtype = DATA_TYPE_TO_NUMPY.get(data_type)
            if dtype is None:
                raise Exception("tensor stored in unsupported format")
            fp = self.zip_file.open(info)
            fp.seek(offset * dtype.itemsize)
            size = elm_count * dtype.itemsize
            data = fp.read(size)
            assert len(data) == size
            return np.frombuffer(data, dtype)
        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
        return LazyStorage(load=load, kind=pid[1], description=description)

    # Plain function (no self): it is stored in CLASSES below and called by the
    # unpickler in place of torch._utils._rebuild_tensor_v2.
    # @staticmethod
    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
                               # pyright: ignore[reportSelfClsParameterName]
                               requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
        assert isinstance(storage, LazyStorage)

        def load() -> UnquantizedTensor:
            # Element count is taken as stride[0] * size[0].
            # NOTE(review): this presumably assumes a contiguous tensor with at
            # least one dimension - confirm.
            elm_count = stride[0] * size[0]
            return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
        description = f'pickled storage_offset={storage_offset} in {storage.description}'
        return LazyTensor(load, list(size), storage.kind.data_type, description)

    # Plain function (no self): calls `func` with `args`, ignoring `new_type` and `state`.
    # @staticmethod
    def rebuild_from_type_v2(func, new_type, args, state):
        return func(*args)

    # (module, name) pairs the pickle may reference -> lazy replacements.
    CLASSES: Dict[Any, Any] = {
        ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
        ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
        ('torch', 'Tensor'): LazyTensor,
    }

    def find_class(self, module: str, name: str) -> Any:
        """Resolve torch names through CLASSES; defer everything else to pickle.

        A torch name missing from CLASSES raises KeyError.
        """
        if not module.startswith('torch'):
            return super().find_class(module, name)
        return self.CLASSES[(module, name)]
def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
    """Lazily load a zip-based PyTorch checkpoint.

    The archive must contain exactly one `.pkl` member. It is unpickled with
    LazyUnpickler, using the member name minus its `.pkl` suffix as the base
    path for storage lookups inside the archive.

    Returns a ModelPlus with format 'torch' and no vocab.
    """
    zf = zipfile.ZipFile(outer_fp)
    pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
    assert len(pickle_paths) == 1, pickle_paths
    # The pickle member is only needed while unpickling; tensor data is read
    # later through `zf`, so this handle can be closed right away.
    with zf.open(pickle_paths[0], 'r') as pickle_fp:
        unpickler = LazyUnpickler(pickle_fp,
                                  data_base_path=pickle_paths[0][:-4],
                                  zip_file=zf)
        model = unpickler.load()
    as_dict = dict(model.items())
    return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
# Maps the 'dtype' string found in a safetensors header entry to the
# converter's DataType; a dtype missing from this table raises KeyError in
# lazy_load_safetensors_file.
SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
'BF16': DT_BF16,
'F16': DT_F16,
'F32': DT_F32,
'I32': DT_I32,
}
def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
    """Lazily load a safetensors file.

    Reads the 8-byte little-endian header length and the JSON header, then
    maps the whole file and exposes each tensor as a LazyTensor that slices
    the mapping on demand. The '__metadata__' header entry is skipped.
    """
    (header_size,) = struct.unpack('<Q', fp.read(8))
    header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
    # Use mmap for the actual data to avoid race conditions with the file offset.
    whole_file = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
    payload = whole_file[8 + header_size:]

    def make_lazy_tensor(entry: Dict[str, Any]) -> LazyTensor:
        data_type = SAFETENSORS_DATA_TYPES[entry['dtype']]
        numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
        shape: List[int] = entry['shape']
        begin, end = entry['data_offsets']
        # Offsets are relative to the start of the payload (after the header).
        assert 0 <= begin <= end <= len(payload)
        assert end - begin == math.prod(shape) * numpy_dtype.itemsize
        region = payload[begin:end]

        def load() -> UnquantizedTensor:
            array = np.frombuffer(region, dtype=numpy_dtype)
            return UnquantizedTensor(array.reshape(shape))

        description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
        return LazyTensor(load, shape, data_type, description)

    tensors = {}
    for tensor_name, entry in header.items():
        if tensor_name == '__metadata__':
            continue
        tensors[tensor_name] = make_lazy_tensor(entry)
    return ModelPlus(model=tensors, paths=[path], format='safetensors', vocab=None)
def must_read(fp: IO[bytes], length: int) -> bytes:
    """Read `length` bytes from `fp`, raising if fewer bytes are returned."""
    chunk = fp.read(length)
    if len(chunk) >= length:
        return chunk
    raise Exception("unexpectedly reached end of file")
@functools.lru_cache(maxsize=None)
def lazy_load_file(path: Path) -> ModelPlus:
    """Open `path`, detect its format from the first bytes, and load it lazily.

    A leading 'PK' is treated as a zip (PyTorch) file. Otherwise, if the first
    8 bytes decode as a little-endian integer below 16 MiB, the file is
    treated as safetensors (that integer being the header length).

    Raises ValueError for anything else, including files shorter than 8 bytes.
    Results are cached per path.
    """
    # The handle is intentionally left open on success: the lazy loaders keep
    # reading from it after this function returns.
    fp = open(path, 'rb')
    first8 = fp.read(8)
    fp.seek(0)
    if first8[:2] == b'PK':
        # A zip file, i.e. PyTorch format
        return lazy_load_torch_file(fp, path)
    elif len(first8) == 8 and struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
        # Probably safetensors
        return lazy_load_safetensors_file(fp, path)
    else:
        # Nothing will use the handle, so release it before reporting.
        fp.close()
        raise ValueError(f"unknown format: {path}")
In = TypeVar('In')
Out = TypeVar('Out')


def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
    fast enough, this will stop calling `func` at some point rather than
    letting results pile up in memory.  Specifically, there is a max of one
    output value buffered per thread.

    Results are yielded in input order. With `concurrency` below 2 the items
    are mapped sequentially in the calling thread, so a zero or negative value
    no longer drops the input silently.'''
    if concurrency < 2:
        yield from map(func, iterable)
        return
    # Size the pool to the requested concurrency; no more than that many
    # futures are ever in flight.
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures: List[concurrent.futures.Future[Out]] = []
        # Reversed so that pop() hands items out in their original order.
        items_rev = list(iterable)[::-1]
        for _ in range(min(concurrency, len(items_rev))):
            futures.append(executor.submit(func, items_rev.pop()))
        while futures:
            result = futures.pop(0).result()
            # Refill before yielding so a worker stays busy while the caller
            # consumes this result.
            if items_rev:
                futures.append(executor.submit(func, items_rev.pop()))
            yield result
def check_vocab_size(params: Params, vocab: Vocab) -> None:
    """Verify that the model's vocab size agrees with the loaded vocab.

    If the model matches the vocab size without added tokens, the added
    tokens are dropped from `vocab` in place. Any other mismatch raises an
    Exception describing both sizes.
    """
    if params.n_vocab == vocab.vocab_size:
        return
    assert isinstance(vocab, (BpeVocab, SentencePieceVocab))
    if params.n_vocab == vocab.vocab_size_base:
        print("Ignoring added_tokens.json since model matches vocab size without it.")
        vocab.added_tokens_list = []
        vocab.vocab_size = vocab.vocab_size_base
        return
    no_added_tokens = vocab.fname_added_tokens is None
    parts = [f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"]
    if not no_added_tokens:
        parts.append(f" combined with {vocab.fname_added_tokens}")
    parts.append(f" has {vocab.vocab_size}).")
    # A small shortfall with no added-tokens file usually means that file is missing.
    if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and no_added_tokens:
        parts.append(f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent}).")
    raise Exception("".join(parts))
class OutputFile:
    """Writer for the converter's binary output file.

    The file is opened for binary writing on construction; the static
    `write_vocab_only` / `write_all` helpers create an instance, write the
    relevant sections in order, and close the file.
    """

    def __init__(self, fname_out: Path) -> None:
        self.fout = open(fname_out, "wb")

    def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
        """Write the magic bytes followed by the header integers."""
        self.fout.write(b"ggjt"[::-1])  # magic
        values = [
            1,  # file version
            params.n_vocab,
            params.n_embd,
            params.n_mult,
            params.n_head,
            params.n_layer,
            params.n_embd // params.n_head,  # rot (obsolete)
            file_type.value,
        ]
        self.fout.write(struct.pack("i" * len(values), *values))

    def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
        """Write one tensor's header and advance to the next 32-byte boundary."""
        sname = name.encode('utf-8')
        self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type]))
        # Dimensions are written in reverse order.
        self.fout.write(struct.pack("i" * len(shape), *shape[::-1]))
        self.fout.write(sname)
        # Round the file position up to a multiple of 32.
        self.fout.seek((self.fout.tell() + 31) & -32)

    def write_vocab(self, vocab: Vocab) -> None:
        """Write every token as (length, bytes, score)."""
        for text, score in vocab.all_tokens():
            self.fout.write(struct.pack("i", len(text)))
            self.fout.write(text)
            self.fout.write(struct.pack("f", score))

    @staticmethod
    def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
        """Write a file containing only a header and the vocab."""
        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
        # Open the output exactly once; opening it a second time would leave
        # the first handle dangling until garbage collection.
        of = OutputFile(fname_out)
        of.write_file_header(params, file_type=GGMLFileType.AllF32)
        of.write_vocab(vocab)
        of.fout.close()

    @staticmethod
    def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
        """Write header, vocab and all tensors of `model` to `fname_out`."""
        check_vocab_size(params, vocab)
        of = OutputFile(fname_out)
        of.write_file_header(params, file_type)
        print("Writing vocab...")
        of.write_vocab(vocab)

        def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
            name, lazy_tensor = item
            return lazy_tensor.load().to_ggml().ndarray

        # Tensors are materialized on worker threads while this thread writes
        # them out in model order.
        ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
            padi = len(str(len(model)))
            print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
            of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
            ndarray.tofile(of.fout)
        of.fout.close()
def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
    """Choose the output file type.

    An explicit "f32"/"f16" request wins; with no request the choice follows
    the data type of the "layers.0.attention.wq.weight" tensor. Any other
    combination raises an Exception listing every tensor's type.
    """
    reference_type = model["layers.0.attention.wq.weight"].data_type
    infer = output_type_str is None

    wants_f32 = output_type_str == "f32" or (infer and reference_type in (DT_F32, DT_BF16))
    if wants_f32:
        return GGMLFileType.AllF32

    wants_f16 = output_type_str == "f16" or (infer and reference_type == DT_F16)
    if wants_f16:
        return GGMLFileType.MostlyF16

    name_to_type = {}
    for name, lazy_tensor in model.items():
        name_to_type[name] = lazy_tensor.data_type
    raise Exception(f"Unexpected combination of types: {name_to_type}")
def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
    """Normalize the model: convert from the transformers layout when an
    "lm_head.weight" tensor is present, then filter and sort the tensors."""
    converted = model
    if "lm_head.weight" in converted:
        converted = convert_transformers_to_orig(converted, params)
    return filter_and_sort_tensors(converted)
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
    """Return a new mapping in which each tensor is cast to the type that
    `output_type` selects for it."""
    converted = {}
    for name, tensor in model.items():
        target_type = output_type.type_for_tensor(name, tensor)
        converted[name] = tensor.astype(target_type)
    return converted
def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
    the nth path in the model.

    Each supported naming pattern is tried in turn; the first candidate that
    exists on disk is returned, or None if none exists.
    '''
    # Support the following patterns:
    patterns: List[Tuple[str, str]] = [
        # - x.00.pth, x.01.pth, etc.
        (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
        # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
        (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
        # x.bin, x.bin.1, etc.  Any existing numeric suffix is replaced (not
        # kept), so foo.bin.1 maps to foo.bin for n == 0 and foo.bin.2 for n == 2.
        (r'(\.[0-9]+)?$', '' if n == 0 else f'.{n}')
    ]
    for regex, replacement in patterns:
        if re.search(regex, path.name):
            # count=1: the last pattern can also match the empty string at the
            # very end of the name, which would otherwise be replaced as well.
            new_path = path.with_name(re.sub(regex, replacement, path.name, count=1))
            if new_path.exists():
                return new_path
    return None
def find_multifile_paths(path: Path) -> List[Path]:
    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
    the whole list of paths in the model.
    '''
    found: List[Path] = []
    index = 0
    while True:
        candidate = nth_multifile_path(path, index)
        if candidate is None:
            break
        found.append(candidate)
        index += 1
    if found:
        return found
    # No matches.  This should only happen if the file was named, e.g.,
    # foo.0, and there was no file named foo.  Oh well, try to process it
    # as a single file.
    return [path]
def load_some_model(path: Path) -> ModelPlus:
    '''Load a model of any supported format.'''
    # Be extra-friendly and accept either a file or a directory:
    if path.is_dir():
        # Check if it's a set of safetensors files first
        candidates = list(path.glob("model-00001-of-*.safetensors"))
        if not candidates:
            # Try the PyTorch patterns too, with lower priority
            candidates = []
            for pattern in ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]:
                candidates.extend(path.glob(pattern))
        if not candidates:
            # Try GGML too, but with lower priority, since if both a non-GGML
            # model and a GGML model exist in the same directory, we assume the
            # latter was converted from the former.
            candidates = list(path.glob("ggml-model*.bin*"))
        if not candidates:
            raise Exception(f"Can't find model in directory {path}")
        if len(candidates) > 1:
            raise Exception(f"Found multiple models in {path}, not sure which to pick: {candidates}")
        path = candidates[0]

    # Expand to every part of a multi-file model and load each part lazily.
    loaded_parts: List[ModelPlus] = []
    for part_path in find_multifile_paths(path):
        print(f"Loading model file {part_path}")
        loaded_parts.append(lazy_load_file(part_path))

    return merge_multifile_models(loaded_parts)
def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
    """Keep only the tensors named in TENSORS_LIST, in TENSORS_LIST order."""
    ordered = {}
    for name in TENSORS_LIST:
        if name in model:
            ordered[name] = model[name]
    return ordered
def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, SentencePieceVocab]:
    """Load the vocabulary of the requested type.

    `path` may be the vocab file itself or a directory; for a directory the
    vocab file ("vocab.json" for 'bpe', otherwise "tokenizer.model") is looked
    up in it and then in its parent. An "added_tokens.json" next to the vocab
    file is used when present.

    Raises FileNotFoundError when the vocab file cannot be located and
    ValueError when `vocabtype` is neither "bpe" nor "spm".
    """
    print(f"vocabtype: {vocabtype}")
    # Be extra-friendly and accept either a file or a directory.  Also, if it's
    # a directory, it might be the model directory, and tokenizer.model might
    # be in the parent of that.
    if path.is_dir():
        vocab_file = "tokenizer.model"
        if vocabtype == 'bpe':
            vocab_file = "vocab.json"
        path2 = path / vocab_file
        # Use `.parent` instead of /.. to handle the symlink case better.
        path3 = path.parent / vocab_file
        if path2.exists():
            path = path2
        elif path3.exists():
            path = path3
        else:
            # Name the file that was actually searched for, which differs per
            # vocab type.
            raise FileNotFoundError(
                f"Could not find {vocab_file} in {path} or its parent; "
                "if it's in another directory, pass the directory as --vocab-dir")
    added_tokens_path = path.parent / "added_tokens.json"
    print(f"Loading vocab file {path}")
    if vocabtype == "bpe":
        return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
    elif vocabtype == "spm":
        return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
    else:
        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
    """Derive the default output path, placed next to the first model file.

    Exits with status 1 if that path is one of the input files.
    """
    suffix_by_type = {
        GGMLFileType.AllF32: "f32",
        GGMLFileType.MostlyF16: "f16",
    }
    namestr = suffix_by_type[file_type]
    candidate = model_paths[0].parent / f"ggml-model-{namestr}.bin"
    would_clobber_input = candidate in model_paths
    if would_clobber_input:
        sys.stderr.write(
            f"Error: Default output path ({candidate}) would overwrite the input. "
            "Please explicitly specify a path using --outfile.\n")
        sys.exit(1)
    return candidate
def do_dump_model(model_plus: ModelPlus) -> None:
    """Print the model's paths, format and vocab, then one line per tensor."""
    print(f"model_plus.paths = {model_plus.paths!r}")
    print(f"model_plus.format = {model_plus.format!r}")
    print(f"model_plus.vocab = {model_plus.vocab!r}")
    tensors = model_plus.model
    for name in tensors:
        lazy_tensor = tensors[name]
        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
def main(args_in: Optional[List[str]] = None) -> None:
    """Command-line entry point.

    `args_in` overrides `sys.argv[1:]` when given. Depending on the flags this
    dumps a single file, writes a vocab-only file, dumps a whole model, or
    converts the model and writes the output file.
    """
    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
    parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
    parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
    # The default must be set explicitly: without it an omitted --vocabtype is
    # None, which load_vocab rejects as unsupported.
    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm", help="vocab format (default: spm)")
    args = parser.parse_args(args_in)

    vocab: Vocab
    if args.dump_single:
        model_plus = lazy_load_file(args.model)
        do_dump_model(model_plus)
    elif args.vocab_only:
        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
        assert args.outfile, "need --outfile if using --vocab-only"
        outfile = args.outfile
        OutputFile.write_vocab_only(outfile, vocab)
        print(f"Wrote {outfile}")
    else:
        model_plus = load_some_model(args.model)
        if args.dump:
            do_dump_model(model_plus)
            return
        # Prefer a vocab bundled with the model unless --vocab-dir was given.
        if model_plus.vocab is not None and args.vocab_dir is None:
            vocab = model_plus.vocab
        else:
            vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
            vocab = load_vocab(vocab_dir, args.vocabtype)
        params = Params.load(model_plus)
        model = model_plus.model
        model = do_necessary_conversions(model, params)
        output_type = pick_output_type(model, args.outtype)
        model = convert_to_output_type(model, output_type)
        outfile = args.outfile or default_outfile(model_plus.paths, output_type)
        OutputFile.write_all(outfile, params, output_type, model, vocab)
        print(f"Wrote {outfile}")
# Run the command-line entry point only when this file is executed as a script.
if __name__ == '__main__':
main()

824
convert.py Normal file → Executable file

File diff suppressed because it is too large Load diff

View file

@ -3,7 +3,7 @@
## Verifying that the model is running on the GPU with cuBLAS ## Verifying that the model is running on the GPU with cuBLAS
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell ```shell
./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some " ./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
``` ```
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines: When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
CPU: 7 physical cores CPU: 7 physical cores
RAM: 32GB RAM: 32GB
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML) Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]` Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Result: Result:

View file

@ -6,27 +6,6 @@ find_package(Threads REQUIRED)
# ... # ...
# common
set(TARGET common)
add_library(${TARGET} OBJECT
common.h
common.cpp
console.h
console.cpp
grammar-parser.h
grammar-parser.cpp
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)
# examples # examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@ -45,6 +24,7 @@ else()
add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(simple) add_subdirectory(simple)
add_subdirectory(embd-input) add_subdirectory(embd-input)
add_subdirectory(llama-bench)
if (LLAMA_METAL) if (LLAMA_METAL)
add_subdirectory(metal) add_subdirectory(metal)
endif() endif()

View file

@ -167,7 +167,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// TODO: Apply penalties // TODO: Apply penalties
// float nl_logit = logits[llama_token_nl()]; // float nl_logit = logits[llama_token_nl(ctx)];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p, // llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@ -176,7 +176,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence); // last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) { // if (!penalize_nl) {
// logits[llama_token_nl()] = nl_logit; // logits[llama_token_nl(ctx)] = nl_logit;
// } // }
if (temp <= 0) { if (temp <= 0) {
@ -211,7 +211,7 @@ const char * sampling(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx; llama_context * ctx = mymodel->ctx;
int id = sampling_id(mymodel); int id = sampling_id(mymodel);
static std::string ret; static std::string ret;
if (id == llama_token_eos()) { if (id == llama_token_eos(ctx)) {
ret = "</s>"; ret = "</s>";
} else { } else {
ret = llama_token_to_str(ctx, id); ret = llama_token_to_str(ctx, id);

View file

@ -233,16 +233,13 @@ int main(int argc, char ** argv) {
const std::string fname(argv[1]); const std::string fname(argv[1]);
const std::string mode (argv[2]); const std::string mode (argv[2]);
GGML_ASSERT((mode == "r" || mode == "w" || mode == "q") && "mode must be r, w or q"); GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
if (mode == "w") { if (mode == "w") {
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file"); GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
} else if (mode == "r") { } else if (mode == "r") {
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file"); GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file"); GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
} else if (mode == "q") {
llama_model_quantize_params params = llama_model_quantize_default_params();
llama_model_quantize(fname.c_str(), "quant.gguf", &params);
} }
return 0; return 0;

View file

@ -14,6 +14,7 @@
//----- //-----
// Unicode GPT2 Byte Pair Encoding Tokenizer // Unicode GPT2 Byte Pair Encoding Tokenizer
// Adapted from https://github.com/cmp-nct/ggllm.cpp // Adapted from https://github.com/cmp-nct/ggllm.cpp
// Removed loading of merges from HF json and parts made for a specific vocab
//----- //-----
// Unicode library (from cmpnct_unicode.cpp) // Unicode library (from cmpnct_unicode.cpp)
@ -439,23 +440,18 @@ private:
struct gpt2bpe_vocab { struct gpt2bpe_vocab {
using id = int32_t; using id = int32_t;
using token = std::string; using token = std::string;
std::map<std::string, uint32_t> max_token_length; // max length, for each 2byte prefix
std::map<std::string, uint32_t> max_token_length; // max length, for each 2byte prefix
std::map<std::pair<std::string,std::string>, int> bpe_ranks; std::map<std::pair<std::string,std::string>, int> bpe_ranks;
std::vector<std::pair<std::string, std::string>> bpe_merges; std::vector<std::pair<std::string, std::string>> bpe_merges;
std::map<std::string, int> special_tokens;
id special_bos_id = 0; id special_bos_id = -1;
id special_eos_id = 0; id special_eos_id = -1;
id special_unk_id = 0; id special_unk_id = -1;
id special_sep_id = 0; id special_sep_id = -1;
id special_pad_id = 0; id special_pad_id = -1;
bool special_have_bos = false; id linefeed_id = -1;
bool special_have_eos = false;
bool special_have_unk = false;
bool special_have_sep = false;
bool special_have_pad = false;
std::unordered_map<token, id> token_to_id; std::unordered_map<token, id> token_to_id;
std::unordered_map<id, token> id_to_token; std::unordered_map<id, token> id_to_token;
@ -480,22 +476,6 @@ struct gpt2bpe_vocab {
bpe_ranks.emplace(bpe_merges_[i], i); bpe_ranks.emplace(bpe_merges_[i], i);
} }
bpe_merges = bpe_merges_; bpe_merges = bpe_merges_;
// populate special tokens too (0-11 and if available 65024++)
#if 0
for (int i = 0; i < 12; i++) {
special_tokens[id_to_token[i].tok] = i;
}
for (int i = 65024; i < (int)id_to_token.size(); i++) {
special_tokens[id_to_token[i].tok] = i;
}
#endif
// token_to_id["</s>"] = 11; // bugfix for TII instruct training (blocks stopwords)
// special_tokens["</s>"] = 11; // bugfix for TII instruct training (blocks stopwords)
return bpe_merges_.size(); return bpe_merges_.size();
} }
@ -512,10 +492,6 @@ struct gpt2bpe_vocab {
}).base(), str.end()); }).base(), str.end());
} }
// removed, merges loaded from gguf model file:
// requires the standard HF type tokenizer.json (pretty printed)
// std::vector<std::pair<std::string, std::string>> parse_json_to_bpe_merges(const std::string& filename) {
// get max token length available for a prefix of 2 bytes (string at least 2 bytes long) // get max token length available for a prefix of 2 bytes (string at least 2 bytes long)
int get_max_token_length(const std::string& string) const { int get_max_token_length(const std::string& string) const {
if (string.size() < 2) if (string.size() < 2)
@ -613,27 +589,9 @@ struct gpt2bpe_tokenizer {
{ {
work_queue_ = ggllm_bpe_bigram::queue(); work_queue_ = ggllm_bpe_bigram::queue();
symbols_.clear(); symbols_.clear();
bool is_special = false;
for (auto it = vocab_.special_tokens.begin(); it != vocab_.special_tokens.end(); ++it)
{
std::string special_token = it->first;
if (word.compare(special_token) == 0)
{
ggllm_bpe_symbol sym;
sym.text = word.c_str();
sym.n = word.size();
sym.prev = -1;
sym.next = -1;
symbols_.emplace_back(sym);
is_special = true;
break;
}
}
int index = 0; int index = 0;
size_t offset = 0; size_t offset = 0;
if (!is_special)
{
while (offset < word.size()) while (offset < word.size())
{ {
@ -651,7 +609,7 @@ struct gpt2bpe_tokenizer {
for (size_t i = 1; i < symbols_.size(); ++i) { for (size_t i = 1; i < symbols_.size(); ++i) {
add_new_bigram(i - 1, i); add_new_bigram(i - 1, i);
} }
}
// build token(s) // build token(s)
while (!work_queue_.empty()) while (!work_queue_.empty())
{ {
@ -794,17 +752,6 @@ private:
bpe_encoded_words.reserve(text.size()); bpe_encoded_words.reserve(text.size());
text_utf = CNCTUnicode::split_utf8_enhanced(text); text_utf = CNCTUnicode::split_utf8_enhanced(text);
std::map<std::string, int> special_tokens = vocab_.special_tokens;
int smallest_len_special_tokens = 0;
if (special_tokens.size())
{
smallest_len_special_tokens = special_tokens.begin()->first.size();
for (auto it = special_tokens.begin(); it != special_tokens.end(); ++it)
{
if (it->first.size() < (size_t)smallest_len_special_tokens)
smallest_len_special_tokens = it->first.size();
}
}
for (int i = 0; i < (int)text_utf.size(); i++) for (int i = 0; i < (int)text_utf.size(); i++)
{ {
@ -817,41 +764,6 @@ private:
const CNCTString &utf_char_next_next = (i+2 < (int)text_utf.size()) ? text_utf[i+2] : CNCTString(); const CNCTString &utf_char_next_next = (i+2 < (int)text_utf.size()) ? text_utf[i+2] : CNCTString();
// const CNCTString &utf_char_prev = (i > 0) ? text_utf[i-1] : CNCTString(); // const CNCTString &utf_char_prev = (i > 0) ? text_utf[i-1] : CNCTString();
// handling special tokens
bool special_token_found = false;
if (bytes_remain >= (int)smallest_len_special_tokens)
for (auto it = special_tokens.begin(); it != special_tokens.end(); ++it)
{
if ((bytes_remain) < (int)it->first.size())
continue;
if (str_is_equal(text_pos, it->first.c_str(), it->first.size()))
{
if (token.size())
{
bpe_words.emplace_back(token); // push previous content as token
token.clear();
collecting = false;
collecting_letter = false;
collecting_numeric = false;
collecting_special = false;
collecting_whitespace_lookahead = false;
}
bpe_words.emplace_back(it->first); // push special token as token
// we now advance i until the token is fulfilled by the utf_chars
int st_bytes = (int)it->first.size();
for (;st_bytes;st_bytes -= text_utf[i++].str.size());
i--;
special_token_found = true;
break;
}
}
if (special_token_found) continue;
// handling contractions // handling contractions
if (!split_condition && bytes_remain >= 2) if (!split_condition && bytes_remain >= 2)
{ {
@ -1002,7 +914,7 @@ static std::vector<gpt2bpe_vocab::id> gpt2bpe_tokenize(const gpt2bpe_vocab & voc
return output; return output;
} }
if (bos && vocab.special_have_bos) { if (bos && vocab.special_bos_id != -1) {
output.push_back(vocab.special_bos_id); output.push_back(vocab.special_bos_id);
} }

File diff suppressed because it is too large Load diff

View file

@ -391,6 +391,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
{ {
int keyidx; int keyidx;
// check model architecture kv
keyidx = gguf_find_key(ggufctx, "general.architecture"); keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) { if (keyidx != -1) {
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) { if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) {
@ -492,6 +493,9 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
vocab.token_to_id[word] = i; vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word; vocab.id_to_token[i] = word;
if( vocab.id_to_token[i] == "\n" ) {
vocab.linefeed_id = i;
}
} }
std::vector<std::pair<std::string, std::string>> bpe_merges; std::vector<std::pair<std::string, std::string>> bpe_merges;
@ -514,17 +518,18 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
vocab.populate_bpe_ranks(bpe_merges); vocab.populate_bpe_ranks(bpe_merges);
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) { vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); vocab.special_have_bos=true; } keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) { vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) { vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); vocab.special_have_eos=true; } keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) { vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) { vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); vocab.special_have_unk=true; } keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) { vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); vocab.special_have_sep=true; } keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); vocab.special_have_pad=true; } keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
if( vocab.special_have_bos ) { fprintf(stdout, "%s: bos token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
if( vocab.special_have_eos ) { fprintf(stdout, "%s: eos token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
if( vocab.special_have_unk ) { fprintf(stdout, "%s: unk token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
if( vocab.special_have_sep ) { fprintf(stdout, "%s: sep token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
if( vocab.special_have_pad ) { fprintf(stdout, "%s: pad token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
} }
@ -656,6 +661,7 @@ ggml_tensor * gpt_neox_ff(
const gpt_neox_block &block, const gpt_neox_block &block,
ggml_context * ctx0, ggml_context * ctx0,
ggml_tensor * inp) { ggml_tensor * inp) {
ggml_tensor * cur = ggml_norm(ctx0, inp); ggml_tensor * cur = ggml_norm(ctx0, inp);
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur)); cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
@ -1053,7 +1059,7 @@ int main(int argc, char ** argv) {
fflush(stdout); fflush(stdout);
// end of text token // end of text token
if (vocab.special_have_eos && embd.back() == vocab.special_eos_id) { if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) {
break; break;
} }
} }

View file

@ -0,0 +1,8 @@
# Build and install the llama-bench executable from its single source file.
set(TARGET llama-bench)
add_executable(${TARGET} llama-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
# Link against the `common` and `llama` targets plus the thread library.
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
# Depend on BUILD_INFO only when that target exists in this configuration.
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -0,0 +1,969 @@
#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <chrono>
#include <cinttypes>
#include <cmath>
#include <cstring>
#include <ctime>
#include <iterator>
#include <map>
#include <numeric>
#include <regex>
#include <sstream>
#include <stdio.h>
#include <string>
#include <vector>
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "build-info.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif
// utils
// Returns the current reading of std::chrono::high_resolution_clock,
// expressed as nanoseconds since that clock's epoch.
static uint64_t get_time_ns() {
    const auto since_epoch = std::chrono::high_resolution_clock::now().time_since_epoch();
    return std::chrono::nanoseconds(since_epoch).count();
}
// Streams every element of `values` into one string, placing `delim`
// between consecutive elements (none before the first or after the last).
template<class T>
static std::string join(const std::vector<T> & values, const std::string & delim) {
    std::ostringstream out;
    bool first = true;
    for (const auto & value : values) {
        if (!first) {
            out << delim;
        }
        out << value;
        first = false;
    }
    return out.str();
}
// Converts one delimited token into a value of type T using stream extraction.
// `out` keeps its prior (value-initialized) state when extraction stores nothing.
template<class T>
static void split_parse_token(const std::string & token, T & out) {
    std::istringstream token_stream(token);
    token_stream >> out;
}

// String tokens are taken verbatim: stream extraction would stop at the first
// whitespace character and silently truncate e.g. file names containing spaces.
static void split_parse_token(const std::string & token, std::string & out) {
    out = token;
}

// Splits `str` at every occurrence of `delim` and converts each piece to T.
//
// - For std::string the piece is kept as-is (including embedded whitespace).
// - For other types the piece is parsed with operator>>.
// - An empty input yields an empty vector.
template<class T>
static std::vector<T> split(const std::string & str, char delim) {
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value{}; // value-initialized so a token that cannot be parsed never leaves garbage
        split_parse_token(token, value);
        values.push_back(value);
    }
    return values;
}
// Arithmetic mean of `v`, computed entirely in type T (so integer element
// types use integer division). Returns 0 for an empty vector.
template<typename T>
static T avg(const std::vector<T> & v) {
    const size_t count = v.size();
    if (count == 0) {
        return 0;
    }
    T total = T(0);
    for (const T & x : v) {
        total = total + x;
    }
    return total / (T)count;
}
// Sample standard deviation (n - 1 denominator) of `v`.
//
// Returns 0 when fewer than two samples are available. The computation is done
// in double with a two-pass algorithm (mean first, then squared deviations):
// - for integer T (e.g. nanosecond timings) summing raw squares in T overflows
//   as soon as a sample exceeds ~2^32, which gave meaningless results;
// - the squared deviations are non-negative by construction, so the argument
//   of sqrt can never become negative through rounding.
// The result is converted back to T (truncating for integer types).
template<typename T>
static T stdev(const std::vector<T> & v) {
    if (v.size() <= 1) {
        return 0;
    }
    const double n = (double) v.size();

    double mean = 0.0;
    for (const T & x : v) {
        mean += (double) x;
    }
    mean /= n;

    double sq_dev = 0.0;
    for (const T & x : v) {
        const double d = (double) x - mean;
        sq_dev += d * d;
    }
    return (T) std::sqrt(sq_dev / (n - 1.0));
}
// Compile-time probe: reports whether this translation unit was built with
// GGML_USE_METAL defined.
static bool ggml_cpu_has_metal() {
#if defined(GGML_USE_METAL)
    const bool built_with_metal = true;
#else
    const bool built_with_metal = false;
#endif
    return built_with_metal;
}
// Returns a human readable CPU description, or an empty string when it cannot
// be determined.
//
// On Linux this is the value of the first "model name" entry in /proc/cpuinfo
// with surrounding whitespace removed. Other platforms are not implemented yet
// and yield an empty string.
static std::string get_cpu_info() {
    std::string id;
#ifdef __linux__
    FILE * f = fopen("/proc/cpuinfo", "r");
    if (f) {
        char buf[1024];
        while (fgets(buf, sizeof(buf), f)) {
            if (strncmp(buf, "model name", 10) != 0) {
                continue;
            }
            char * p = strchr(buf, ':');
            if (!p) {
                continue;
            }
            p++;
            // cast to unsigned char: passing a negative char to isspace is undefined
            while (std::isspace((unsigned char) *p)) {
                p++;
            }
            // trim trailing whitespace; the length check keeps us inside the
            // buffer when the value is empty
            size_t len = strlen(p);
            while (len > 0 && std::isspace((unsigned char) p[len - 1])) {
                p[--len] = '\0';
            }
            id = p;
            break;
        }
        fclose(f); // the handle was previously leaked
    }
#endif
    // TODO: other platforms
    return id;
}
// Builds a "/"-separated list of device descriptions. Only the cuBLAS build
// (GGML_USE_CUBLAS) is queried; any other build returns an empty string.
static std::string get_gpu_info() {
    std::string id;
#ifdef GGML_USE_CUBLAS
    const int n_devices = ggml_cuda_get_device_count();
    for (int dev = 0; dev < n_devices; dev++) {
        if (dev > 0) {
            id += "/";
        }
        char description[128];
        ggml_cuda_get_device_description(dev, description, sizeof(description));
        id += description;
    }
#endif
    // TODO: other backends
    return id;
}
// command line params
// Output formats selectable with -o/--output.
enum output_formats {CSV, JSON, MARKDOWN, SQL};
// Parsed command line. Every vector-valued member holds the list of values to
// benchmark for that parameter; the benchmark runs one test per combination.
// NOTE: member order matters - cmd_params_defaults is initialized positionally.
struct cmd_params {
// model file paths
std::vector<std::string> model;
// prompt lengths to test (0 entries are skipped when instances are generated)
std::vector<int> n_prompt;
// generation lengths to test (0 entries are skipped when instances are generated)
std::vector<int> n_gen;
std::vector<int> n_batch;
// true = f32 KV cache; forwarded to llama as f16_kv = !f32_kv
std::vector<bool> f32_kv;
std::vector<int> n_threads;
std::vector<int> n_gpu_layers;
std::vector<int> main_gpu;
std::vector<bool> mul_mat_q;
std::vector<bool> low_vram;
// one fixed-size split per test; unused trailing entries are 0
std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
// number of timed repetitions per test
int reps;
// when false, llama log output is silenced in main()
bool verbose;
output_formats output_format;
};
// Default value for every command line parameter. The initializer is
// positional, so the entries must stay in the same order as the members of
// cmd_params. parse_cmd_params() falls back to these for any parameter the
// user did not supply, and print_usage() displays them.
static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.bin"},
/* n_prompt */ {512},
/* n_gen */ {128},
/* n_batch */ {512},
/* f32_kv */ {false},
// get_num_physical_cores() is not defined in this file - presumably provided by common.h
/* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99},
/* main_gpu */ {0},
/* mul_mat_q */ {true},
/* low_vram */ {false},
// a single value-initialized (all zero) split
/* tensor_split */ {{}},
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
};
static void print_usage(int /* argc */, char ** argv) {
fprintf(stdout, "usage: %s [options]\n", argv[0]);
fprintf(stdout, "\n");
fprintf(stdout, "options:\n");
fprintf(stdout, " -h, --help\n");
fprintf(stdout, " -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
fprintf(stdout, " -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
fprintf(stdout, " -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
fprintf(stdout, " -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
fprintf(stdout, " --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
fprintf(stdout, " -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
fprintf(stdout, " -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
fprintf(stdout, " -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
fprintf(stdout, " -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
fprintf(stdout, " -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
fprintf(stdout, " -ts, --tensor_split <ts> \n");
fprintf(stdout, " -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
fprintf(stdout, " -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : "md");
fprintf(stdout, " -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
fprintf(stdout, "\n");
fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by repeating the parameter.\n");
}
// Parses the command line into a cmd_params.
//
// - Long options may be written with '_' or '-' ("--n_gen" == "--n-gen").
// - List-valued options accept comma separated values and may be repeated;
//   every occurrence appends to the list.
// - "-h/--help" prints the usage and exits with status 0.
// - An unknown option, a missing value, an unknown output format or a value
//   that is not a valid number prints an error plus the usage to and exits
//   with status 1.
// - Any list left empty is filled from cmd_params_defaults.
static cmd_params parse_cmd_params(int argc, char ** argv) {
    cmd_params params;
    std::string arg;
    bool invalid_param = false;
    const std::string arg_prefix = "--";
    const char split_delim = ',';
    // tensor splits are separated by ';' or '/'
    const std::regex ts_regex{R"([;/]+)"};

    params.verbose = cmd_params_defaults.verbose;
    params.output_format = cmd_params_defaults.output_format;
    params.reps = cmd_params_defaults.reps;

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv);
            exit(0);
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<std::string>(argv[i], split_delim);
            params.model.insert(params.model.end(), p.begin(), p.end());
        } else if (arg == "-p" || arg == "--n-prompt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
        } else if (arg == "-n" || arg == "--n-gen") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
        } else if (arg == "--memory-f32") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<int>(argv[i], split_delim);
            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
        } else if (arg == "-mg" || arg == "--main-gpu") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            // append like every other list option, so that repeating -mg
            // accumulates values instead of discarding the earlier ones
            auto p = split<int>(argv[i], split_delim);
            params.main_gpu.insert(params.main_gpu.end(), p.begin(), p.end());
        } else if (arg == "-lv" || arg == "--low-vram") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<bool>(argv[i], split_delim);
            params.low_vram.insert(params.low_vram.end(), p.begin(), p.end());
        } else if (arg == "-mmq" || arg == "--mul-mat-q") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<bool>(argv[i], split_delim);
            params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
        } else if (arg == "-ts" || arg == "--tensor-split") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            for (const auto & ts : split<std::string>(argv[i], split_delim)) {
                // split string by ; and /
                std::sregex_token_iterator it{ts.begin(), ts.end(), ts_regex, -1};
                std::vector<std::string> split_arg{it, {}};
                GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);

                std::array<float, LLAMA_MAX_DEVICES> tensor_split;
                for (size_t dev = 0; dev < LLAMA_MAX_DEVICES; ++dev) {
                    tensor_split[dev] = 0.0f;
                    if (dev < split_arg.size()) {
                        try {
                            tensor_split[dev] = std::stof(split_arg[dev]);
                        } catch (const std::exception &) {
                            // not a number: report it below instead of terminating
                            invalid_param = true;
                        }
                    }
                }
                params.tensor_split.push_back(tensor_split);
            }
            if (invalid_param) {
                break;
            }
        } else if (arg == "-r" || arg == "--repetitions") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            try {
                params.reps = std::stoi(argv[i]);
            } catch (const std::exception &) {
                // not a number / out of range: report it below instead of terminating
                invalid_param = true;
            }
            if (invalid_param) {
                break;
            }
        } else if (arg == "-o" || arg == "--output") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            if (argv[i] == std::string("csv")) {
                params.output_format = CSV;
            } else if (argv[i] == std::string("json")) {
                params.output_format = JSON;
            } else if (argv[i] == std::string("md")) {
                params.output_format = MARKDOWN;
            } else if (argv[i] == std::string("sql")) {
                params.output_format = SQL;
            } else {
                invalid_param = true;
                break;
            }
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else {
            invalid_param = true;
            break;
        }
    }

    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv);
        exit(1);
    }

    // set defaults
    if (params.model.empty()) { params.model = cmd_params_defaults.model; }
    if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
    if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
    if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
    if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
    if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
    if (params.low_vram.empty()) { params.low_vram = cmd_params_defaults.low_vram; }
    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }

    return params;
}
// One concrete parameter combination to benchmark (a single value for each
// entry of cmd_params).
// NOTE: member order matters - get_cmd_params_instances_int() initializes
// instances positionally.
struct cmd_params_instance {
std::string model;
int n_prompt;
int n_gen;
int n_batch;
bool f32_kv;
int n_threads;
int n_gpu_layers;
int main_gpu;
bool mul_mat_q;
bool low_vram;
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
// Builds the llama context parameters for this instance, starting from
// llama_context_default_params().
// n_threads is not part of the returned parameters.
llama_context_params to_llama_params() const {
llama_context_params lparams = llama_context_default_params();
// the context has to hold the prompt plus everything generated
lparams.n_ctx = n_prompt + n_gen;
lparams.n_batch = n_batch;
lparams.f16_kv = !f32_kv;
lparams.n_gpu_layers = n_gpu_layers;
lparams.main_gpu = main_gpu;
lparams.mul_mat_q = mul_mat_q;
lparams.low_vram = low_vram;
// stores a pointer into this object's own array: the returned params must
// not be used after this cmd_params_instance is destroyed
lparams.tensor_split = tensor_split.data();
return lparams;
}
};
// Expands `params` into the cartesian product of all list-valued settings
// (model, batch size, KV type, GPU layers, main GPU, mul_mat_q, low_vram,
// tensor split, threads - threads varying fastest), using the given fixed
// n_gen / n_prompt for every produced instance.
static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
    std::vector<cmd_params_instance> instances;

    for (const auto & model_path : params.model) {
        for (const int batch : params.n_batch) {
            for (const bool kv_f32 : params.f32_kv) {
                for (const int gpu_layers : params.n_gpu_layers) {
                    for (const int gpu_main : params.main_gpu) {
                        for (const bool use_mmq : params.mul_mat_q) {
                            for (const bool use_low_vram : params.low_vram) {
                                for (const auto & split_cfg : params.tensor_split) {
                                    for (const int threads : params.n_threads) {
                                        cmd_params_instance instance;
                                        instance.model        = model_path;
                                        instance.n_prompt     = n_prompt;
                                        instance.n_gen        = n_gen;
                                        instance.n_batch      = batch;
                                        instance.f32_kv       = kv_f32;
                                        instance.n_threads    = threads;
                                        instance.n_gpu_layers = gpu_layers;
                                        instance.main_gpu     = gpu_main;
                                        instance.mul_mat_q    = use_mmq;
                                        instance.low_vram     = use_low_vram;
                                        instance.tensor_split = split_cfg;
                                        instances.push_back(instance);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    return instances;
}
// Produces the full list of test instances: first one group per non-zero
// prompt length (prompt-only tests, n_gen = 0), then one group per non-zero
// generation length (generation-only tests, n_prompt = 0).
static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
    std::vector<cmd_params_instance> instances;

    for (const int prompt_len : params.n_prompt) {
        if (prompt_len != 0) {
            const auto batch = get_cmd_params_instances_int(params, 0, prompt_len);
            instances.insert(instances.end(), batch.begin(), batch.end());
        }
    }

    for (const int gen_len : params.n_gen) {
        if (gen_len != 0) {
            const auto batch = get_cmd_params_instances_int(params, gen_len, 0);
            instances.insert(instances.end(), batch.begin(), batch.end());
        }
    }

    return instances;
}
// Result record of one benchmark run: build/backend information shared by all
// runs (static members), the parameters of this run, and the timing samples.
// Also knows how to expose itself as a flat list of named string fields for
// the printers.
struct test {
// build / backend information, identical for every test (defined after the struct)
static const std::string build_commit;
static const int build_number;
static const bool cuda;
static const bool opencl;
static const bool metal;
static const bool gpu_blas;
static const bool blas;
static const std::string cpu_info;
static const std::string gpu_info;
// parameters of this run (copied from the cmd_params_instance)
std::string model_filename;
std::string model_type;
int n_batch;
int n_threads;
bool f32_kv;
int n_gpu_layers;
int main_gpu;
bool mul_mat_q;
bool low_vram;
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
int n_prompt;
int n_gen;
// creation time of this record (UTC)
std::string test_time;
// wall-clock duration of each repetition, in nanoseconds (filled in by main)
std::vector<uint64_t> samples_ns;
// Copies the run parameters from `inst`, queries the model description via
// llama_model_type() and records the current UTC time. `ctx` is unused.
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
model_filename = inst.model;
char buf[128];
llama_model_type(lmodel, buf, sizeof(buf));
model_type = buf;
n_batch = inst.n_batch;
n_threads = inst.n_threads;
f32_kv = inst.f32_kv;
n_gpu_layers = inst.n_gpu_layers;
main_gpu = inst.main_gpu;
mul_mat_q = inst.mul_mat_q;
low_vram = inst.low_vram;
tensor_split = inst.tensor_split;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
// RFC 3339 date-time format
time_t t = time(NULL);
std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
test_time = buf;
(void) ctx;
}
// mean of samples_ns (integer arithmetic)
uint64_t avg_ns() const {
return ::avg(samples_ns);
}
// standard deviation of samples_ns
uint64_t stdev_ns() const {
return ::stdev(samples_ns);
}
// Converts every sample to a throughput in tokens per second:
// 1e9 * (n_prompt + n_gen) / sample_ns.
std::vector<double> get_ts() const {
int n_tokens = n_prompt + n_gen;
std::vector<double> ts;
std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
return ts;
}
// mean throughput in tokens per second
double avg_ts() const {
return ::avg(get_ts());
}
// standard deviation of the throughput in tokens per second
double stdev_ts() const {
return ::stdev(get_ts());
}
// Name of the backend this binary was built with; the first matching flag
// in the order CUDA, OpenCL, Metal, GPU BLAS, BLAS wins, otherwise "CPU".
static std::string get_backend() {
if (cuda) {
return "CUDA";
}
if (opencl) {
return "OpenCL";
}
if (metal) {
return "Metal";
}
if (gpu_blas) {
return "GPU BLAS";
}
if (blas) {
return "BLAS";
}
return "CPU";
}
// Names of the exported fields. The order must match get_values() exactly,
// and every non-string field must be classified in get_field_type().
static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
"build_commit", "build_number",
"cuda", "opencl", "metal", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type",
"n_batch", "n_threads", "f16_kv",
"n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts"
};
return fields;
}
enum field_type {STRING, BOOL, INT, FLOAT};
// Type of an exported field, used by the printers for formatting.
// Any name not listed here is treated as STRING.
static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" ||
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
return FLOAT;
}
return STRING;
}
// Values of all exported fields as strings, in the same order as get_fields().
std::vector<std::string> get_values() const {
std::string tensor_split_str;
// index of the last non-zero split entry; trailing zero entries are not printed
int max_nonzero = 0;
for (int i = 0; i < LLAMA_MAX_DEVICES; i++) {
if (tensor_split[i] > 0) {
max_nonzero = i;
}
}
// entries 0..max_nonzero formatted with two decimals and joined with "/"
for (int i = 0; i <= max_nonzero; i++) {
char buf[32];
snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]);
tensor_split_str += buf;
if (i < max_nonzero) {
tensor_split_str += "/";
}
}
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type,
// the exported field is "f16_kv", hence the negation of f32_kv
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts())
};
return values;
}
// Field name -> value map built by pairing get_fields() with get_values().
std::map<std::string, std::string> get_map() const {
std::map<std::string, std::string> map;
auto fields = get_fields();
auto values = get_values();
std::transform(fields.begin(), fields.end(), values.begin(),
std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>);
return map;
}
};
// Definitions of the static members of `test`. They are initialized once during
// static initialization from the build-info macros, the ggml feature probes and
// the local get_cpu_info()/get_gpu_info() helpers.
const std::string test::build_commit = BUILD_COMMIT;
const int test::build_number = BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cublas();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();
// Interface of a result printer. A printer receives one print_header() call,
// then one print_test() call per finished test, then one print_footer() call.
struct printer {
    virtual ~printer() {}

    // Destination stream. Defaults to stdout so that a freshly constructed
    // printer is never left with an indeterminate pointer; callers may
    // reassign it before printing.
    FILE * fout = stdout;

    // Called once before any test is printed. The default does nothing.
    virtual void print_header(const cmd_params & params) { (void) params; }

    // Called once for every finished test.
    virtual void print_test(const test & t) = 0;

    // Called once after the last test. The default does nothing.
    virtual void print_footer() { }
};
// Prints the results as CSV: one header line with the field names followed by
// one line per test with every value quoted.
struct csv_printer : public printer {
    // Wraps `field` in double quotes and doubles every embedded double quote.
    static std::string escape_csv(const std::string & field) {
        std::string quoted;
        quoted.push_back('"');
        for (const char ch : field) {
            if (ch == '"') {
                quoted.push_back('"');
            }
            quoted.push_back(ch);
        }
        quoted.push_back('"');
        return quoted;
    }

    // Writes the (unquoted) field names as a single comma separated line.
    void print_header(const cmd_params & params) override {
        (void) params;
        const std::vector<std::string> names = test::get_fields();
        fprintf(fout, "%s\n", join(names, ",").c_str());
    }

    // Writes all values of `t` as a single comma separated line of quoted cells.
    void print_test(const test & t) override {
        std::vector<std::string> cells = t.get_values();
        for (auto & cell : cells) {
            cell = escape_csv(cell);
        }
        fprintf(fout, "%s\n", join(cells, ",").c_str());
    }
};
// Prints the results as a JSON array with one object per test. Each object
// contains every field of test::get_fields() plus the raw "samples_ns" and
// "samples_ts" arrays.
struct json_printer : public printer {
    // true until the first test has been printed (controls the separating comma)
    bool first = true;

    // Escapes `value` for use inside a JSON string: '"' and '\\' are
    // backslash-escaped and control characters (<= 0x1f) become \u00XX.
    // All other bytes - including bytes >= 0x80 of UTF-8 sequences - are
    // copied unchanged.
    static std::string escape_json(const std::string & value) {
        std::string escaped;
        for (auto c : value) {
            // compare as unsigned: with a signed char every byte >= 0x80 is
            // negative and would wrongly be treated as a control character
            const unsigned char uc = (unsigned char) c;
            if (c == '"') {
                escaped += "\\\"";
            } else if (c == '\\') {
                escaped += "\\\\";
            } else if (uc <= 0x1f) {
                char buf[8];
                snprintf(buf, sizeof(buf), "\\u%04x", uc);
                escaped += buf;
            } else {
                escaped += c;
            }
        }
        return escaped;
    }

    // Formats `value` according to the type of `field`: strings are quoted and
    // escaped, booleans become true/false ("0" is false, anything else true),
    // numbers are emitted as-is.
    static std::string format_value(const std::string & field, const std::string & value) {
        switch (test::get_field_type(field)) {
            case test::STRING:
                return "\"" + escape_json(value) + "\"";
            case test::BOOL:
                return value == "0" ? "false" : "true";
            default:
                return value;
        }
    }

    // Opens the JSON array.
    void print_header(const cmd_params & params) override {
        fprintf(fout, "[\n");
        (void) params;
    }

    // Writes one `"name": value,` line per field. Every line ends with a comma,
    // so the caller has to append at least one more member afterwards.
    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
        assert(fields.size() == values.size());
        for (size_t i = 0; i < fields.size(); i++) {
            fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
        }
    }

    // Writes one JSON object for `t`, preceded by a comma for every test but
    // the first, and flushes the stream.
    void print_test(const test & t) override {
        if (first) {
            first = false;
        } else {
            fprintf(fout, ",\n");
        }
        fprintf(fout, " {\n");
        print_fields(test::get_fields(), t.get_values());
        fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
        fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
        fprintf(fout, " }");
        fflush(fout);
    }

    // Closes the JSON array.
    void print_footer() override {
        fprintf(fout, "\n]\n");
    }
};
// Prints the results as a markdown table. Only the columns that are relevant
// for the current invocation are shown (see print_header).
struct markdown_printer : public printer {
    // columns selected by print_header(), in display order
    std::vector<std::string> fields;

    // Column width for `field`. A negative width means left aligned (as in
    // printf's "%*s"); string fields are left aligned, everything else right
    // aligned. "model" and "t/s" have fixed widths, other columns are at least
    // 10 characters wide.
    static int get_field_width(const std::string & field) {
        if (field == "model") {
            return -30;
        }
        if (field == "t/s") {
            return 15;
        }
        int width = std::max((int)field.length(), 10);
        if (test::get_field_type(field) == test::STRING) {
            return -width;
        }
        return width;
    }

    // Selects the columns and prints the table header and the alignment row.
    // A parameter column is shown when the user supplied more than one value
    // for it or a value different from the default; n_gpu_layers is shown for
    // every non-CPU backend and n_threads is always shown for CPU backends.
    void print_header(const cmd_params & params) override {
        // select fields to print
        fields = { "model", "backend" };
        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
        if (!is_cpu_backend) {
            fields.push_back("n_gpu_layers");
        }
        // the thread column depends on the thread list only (it used to test
        // n_batch.size(), which made it appear whenever several batch sizes
        // were given)
        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
            fields.push_back("n_threads");
        }
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.push_back("n_batch");
        }
        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
            fields.push_back("f16_kv");
        }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.push_back("main_gpu");
        }
        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
            fields.push_back("mul_mat_q");
        }
        if (params.low_vram.size() > 1 || params.low_vram != cmd_params_defaults.low_vram) {
            fields.push_back("low_vram");
        }
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
            fields.push_back("tensor_split");
        }
        fields.push_back("test");
        fields.push_back("t/s");

        fprintf(fout, "|");
        for (const auto & field : fields) {
            fprintf(fout, " %*s |", get_field_width(field), field.c_str());
        }
        fprintf(fout, "\n");
        // alignment row: a trailing ':' marks right aligned columns
        fprintf(fout, "|");
        for (const auto & field : fields) {
            int width = get_field_width(field);
            fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-");
        }
        fprintf(fout, "\n");
    }

    // Prints one table row for `t`. "model", "backend", "test" and "t/s" are
    // synthesized here; every other column is looked up in t.get_map().
    // A test that is neither prompt-only nor generation-only, or an unknown
    // column, aborts the program.
    void print_test(const test & t) override {
        std::map<std::string, std::string> vmap = t.get_map();

        fprintf(fout, "|");
        for (const auto & field : fields) {
            std::string value;
            if (field == "model") {
                value = t.model_type;
            } else if (field == "backend") {
                value = test::get_backend();
            } else if (field == "test") {
                char buf[128];
                if (t.n_prompt > 0 && t.n_gen == 0) {
                    snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
                } else if (t.n_gen > 0 && t.n_prompt == 0) {
                    snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
                } else {
                    assert(false);
                    exit(1);
                }
                value = buf;
            } else if (field == "t/s") {
                char buf[128];
                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                value = buf;
            } else if (vmap.find(field) != vmap.end()) {
                value = vmap.at(field);
            } else {
                assert(false);
                exit(1);
            }

            int width = get_field_width(field);
            if (field == "t/s") {
                // HACK: the utf-8 character is 2 bytes
                width += 1;
            }
            fprintf(fout, " %*s |", width, value.c_str());
        }
        fprintf(fout, "\n");
    }

    // Prints the build commit and build number below the table.
    void print_footer() override {
        fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number);
    }
};
// Prints the results as SQL: a CREATE TABLE IF NOT EXISTS statement for the
// table "test" followed by one INSERT statement per test.
struct sql_printer : public printer {
    // SQL column type for `field`: TEXT for strings, INTEGER for booleans and
    // integers, REAL for floats.
    static std::string get_sql_field_type(const std::string & field) {
        switch (test::get_field_type(field)) {
            case test::STRING:
                return "TEXT";
            case test::BOOL:
            case test::INT:
                return "INTEGER";
            case test::FLOAT:
                return "REAL";
            default:
                assert(false);
                exit(1);
        }
    }

    // Escapes `value` for use inside a single-quoted SQL string literal by
    // doubling every single quote. Without this a value such as a model path
    // or CPU name containing "'" would terminate the literal early.
    static std::string escape_sql(const std::string & value) {
        std::string escaped;
        for (const char c : value) {
            if (c == '\'') {
                escaped += '\'';
            }
            escaped += c;
        }
        return escaped;
    }

    // Writes the CREATE TABLE statement with one column per exported field.
    void print_header(const cmd_params & params) override {
        std::vector<std::string> fields = test::get_fields();
        fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
        for (size_t i = 0; i < fields.size(); i++) {
            fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : "");
        }
        fprintf(fout, ");\n");
        fprintf(fout, "\n");
        (void) params;
    }

    // Writes one INSERT statement for `t`; every value is emitted as a quoted,
    // escaped string literal.
    void print_test(const test & t) override {
        fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
        fprintf(fout, "VALUES (");
        std::vector<std::string> values = t.get_values();
        for (size_t i = 0; i < values.size(); i++) {
            fprintf(fout, "'%s'%s", escape_sql(values.at(i)).c_str(), i < values.size() - 1 ? ", " : "");
        }
        fprintf(fout, ");\n");
    }
};
// Prompt-processing workload: evaluates n_prompt BOS tokens in chunks of at
// most n_batch tokens, starting at position n_past.
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
    // one reusable batch filled with the BOS token
    std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));

    for (int done = 0; done < n_prompt; ) {
        const int chunk = std::min(n_prompt - done, n_batch);
        llama_eval(ctx, tokens.data(), chunk, n_past + done, n_threads);
        done += chunk;
    }
}
// Text-generation workload: evaluates the BOS token n_gen times, one token per
// call, at consecutive positions starting from n_past.
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
    llama_token token = llama_token_bos(ctx);

    int step = 0;
    while (step < n_gen) {
        llama_eval(ctx, &token, 1, n_past + step, n_threads);
        ++step;
    }
}
// Log callback that discards every message; installed by main() when
// --verbose is not given.
static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) {
    // intentionally empty: the casts only silence unused-parameter warnings
    (void) user_data;
    (void) text;
    (void) level;
}
// Entry point: parses the command line, then for every parameter combination
// loads the model, creates a context, runs the workload `reps` times while
// timing each repetition, and prints the result in the selected format.
// Returns 1 when a model cannot be loaded or a context cannot be created.
int main(int argc, char ** argv) {
// warn about build configurations that distort the measurements
#if !defined(NDEBUG)
fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
#endif
#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__))
fprintf(stderr, "warning: debug build, performance may be affected\n");
#endif
#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
#endif
cmd_params params = parse_cmd_params(argc, argv);
// initialize llama.cpp
if (!params.verbose) {
// silence llama's log output unless --verbose was given
llama_log_set(llama_null_log_callback, NULL);
}
bool numa = false;
llama_backend_init(numa);
// initialize printer
std::unique_ptr<printer> p;
switch (params.output_format) {
case CSV:
p.reset(new csv_printer());
break;
case JSON:
p.reset(new json_printer());
break;
case MARKDOWN:
p.reset(new markdown_printer());
break;
case SQL:
p.reset(new sql_printer());
break;
default:
assert(false);
exit(1);
}
p->fout = stdout;
p->print_header(params);
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
for (const auto & inst : params_instances) {
// TODO: keep the model between tests when possible
llama_context_params lparams = inst.to_llama_params();
llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), lparams);
if (lmodel == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
return 1;
}
llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
llama_free_model(lmodel);
return 1;
}
test t(inst, lmodel, ctx);
// warmup run
test_gen(ctx, 1, 0, t.n_threads);
for (int i = 0; i < params.reps; i++) {
// only the workload itself is timed; model loading and context creation are not
uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) {
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
// generation starts right after the prompt positions
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
}
uint64_t t_ns = get_time_ns() - t_start;
t.samples_ns.push_back(t_ns);
}
p->print_test(t);
llama_print_timings(ctx);
// model and context are released after every test (see TODO above)
llama_free(ctx);
llama_free_model(lmodel);
}
p->print_footer();
llama_backend_free();
return 0;
}

View file

@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
{ {
fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx); fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos()); const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads); llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
} }
@ -345,10 +345,9 @@ int main(int argc, char ** argv) {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
{ {
auto it = params.logit_bias.find(llama_token_eos()); auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) { if (it != params.logit_bias.end() && it->second == -INFINITY) {
fprintf(stderr, fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
"%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
} }
} }
@ -398,7 +397,7 @@ int main(int argc, char ** argv) {
// do one empty run to warm up the model // do one empty run to warm up the model
{ {
const std::vector<llama_token> tmp = { llama_token_bos(), }; const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
llama_reset_timings(ctx); llama_reset_timings(ctx);
} }
@ -582,7 +581,7 @@ int main(int argc, char ** argv) {
} }
// Apply penalties // Apply penalties
float nl_logit = logits[llama_token_nl()]; float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p, llama_sample_repetition_penalty(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@ -591,7 +590,7 @@ int main(int argc, char ** argv) {
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence); last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) { if (!penalize_nl) {
logits[llama_token_nl()] = nl_logit; logits[llama_token_nl(ctx)] = nl_logit;
} }
if (grammar != NULL) { if (grammar != NULL) {
@ -697,7 +696,7 @@ int main(int argc, char ** argv) {
} }
// deal with end of text token in interactive mode // deal with end of text token in interactive mode
if (last_n_tokens.back() == llama_token_eos()) { if (last_n_tokens.back() == llama_token_eos(ctx)) {
if (params.interactive) { if (params.interactive) {
if (params.antiprompt.size() != 0) { if (params.antiprompt.size() != 0) {
// tokenize and inject first reverse prompt // tokenize and inject first reverse prompt
@ -721,7 +720,7 @@ int main(int argc, char ** argv) {
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
embd_inp.push_back(llama_token_bos()); embd_inp.push_back(llama_token_bos(ctx));
} }
std::string buffer; std::string buffer;
@ -786,7 +785,7 @@ int main(int argc, char ** argv) {
} }
// end of text token // end of text token
if (!embd.empty() && embd.back() == llama_token_eos() && !(params.instruct || params.interactive)) { if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
fprintf(stderr, " [end of text]\n"); fprintf(stderr, " [end of text]\n");
break; break;
} }

View file

@ -2,7 +2,7 @@
// //
// - First, export a LLaMA graph: // - First, export a LLaMA graph:
// //
// $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export // $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
// //
// - Run this tool to evaluate the exported graph: // - Run this tool to evaluate the exported graph:
// //

View file

@ -63,7 +63,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
// add BOS token for the first batch of each chunk // add BOS token for the first batch of each chunk
if (j == 0) { if (j == 0) {
tokens[batch_start] = llama_token_bos(); tokens[batch_start] = llama_token_bos(ctx);
} }
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
@ -88,7 +88,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%d hours ", total_seconds / (60*60)); fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60); total_seconds = total_seconds % (60*60);
} }
fprintf(stderr, "%d minutes\n", total_seconds / 60); fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
} }
// We get the logits for all the tokens in the context window (params.n_ctx) // We get the logits for all the tokens in the context window (params.n_ctx)

View file

@ -1,7 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "build-info.h" #include "build-info.h"
#define LLAMA_API_CPP // TODO: eliminate me
#define LLAMA_API_INTERNAL #define LLAMA_API_INTERNAL
#include "llama.h" #include "llama.h"
@ -25,7 +24,7 @@
#endif #endif
struct quantize_stats_params { struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin"; std::string model = "models/7B/ggml-model-f16.gguf";
bool verbose = false; bool verbose = false;
bool per_layer_stats = false; bool per_layer_stats = false;
bool print_histogram = false; bool print_histogram = false;

View file

@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to
Command line options: Command line options:
- `--threads N`, `-t N`: Set the number of threads to use during computation. - `--threads N`, `-t N`: Set the number of threads to use during computation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@ -48,14 +48,12 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.): ### Unix-based systems (Linux, macOS, etc.):
```bash ```bash
./server -m models/7B/ggml-model.bin -c 2048 ./server -m models/7B/ggml-model.gguf -c 2048
``` ```
### Windows: ### Windows:
```powershell ```powershell
server.exe -m models\7B\ggml-model.bin -c 2048
```
The above command will start a server that by default listens on `127.0.0.1:8080`. The above command will start a server that by default listens on `127.0.0.1:8080`.
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url. You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.

File diff suppressed because it is too large Load diff

View file

@ -170,6 +170,136 @@
grammar: '', grammar: '',
}) })
/* START: Support for storing prompt templates and parameters in browser LocalStorage */
const local_storage_storageKey = "llamacpp_server_local_storage";
// Store `content` as JSON text under the key "<local_storage_storageKey>/<tag>".
function local_storage_setDataFromObject(tag, content) {
  const storageKey = local_storage_storageKey + '/' + tag;
  const serialized = JSON.stringify(content);
  localStorage.setItem(storageKey, serialized);
}
// Store `content` as-is (no JSON encoding) under the key "<local_storage_storageKey>/<tag>".
function local_storage_setDataFromRawText(tag, content) {
  const storageKey = local_storage_storageKey + '/' + tag;
  localStorage.setItem(storageKey, content);
}
// Read the text stored under "<local_storage_storageKey>/<tag>" and parse it as JSON.
//
// Returns the parsed value, or null when nothing (or an empty string) is stored.
// Also returns null when the stored text is not valid JSON: the data may have been
// truncated or edited by hand, and an exception here would abort the whole page
// script. Callers already treat null as "nothing saved" and fall back to defaults.
function local_storage_getDataAsObject(tag) {
  const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
  if (!item) {
    return null;
  }
  try {
    return JSON.parse(item);
  } catch (err) {
    console.warn('Ignoring unparsable LocalStorage entry for "' + tag + '"', err);
    return null;
  }
}
// Return the raw text stored under "<local_storage_storageKey>/<tag>",
// or null when the entry is missing or empty.
function local_storage_getDataAsRawText(tag) {
  const stored = localStorage.getItem(local_storage_storageKey + '/' + tag);
  return stored ? stored : null;
}
// Containers for the user's templates and settings.
// `savedUserTemplates` holds every stored template keyed by template name;
// `selectedUserTemplate` holds the template currently in use.
// NOTE(review): the initial value below uses a `template` key, while the rest of this
// section reads and writes `name` / `data` — confirm whether `template` is still needed.
const savedUserTemplates = signal({})
const selectedUserTemplate = signal({ name: '', template: { session: {}, params: {} } })
// Import locally saved templates and settings, if there are any.
// User templates and settings are stored in one object
// in the form of { "templatename": "templatedata" } and { "settingstemplatename":"settingsdata" }
console.log('Importing saved templates')
let importedTemplates = local_storage_getDataAsObject('user_templates')
if (importedTemplates) {
// saved templates were successfully imported.
console.log('Processing saved templates and updating default template')
//console.log(importedTemplates);
savedUserTemplates.value = importedTemplates;
// override the stored "default" entry with the session/params this page starts with,
// then write the updated collection back to LocalStorage
savedUserTemplates.value.default = { session: session.value, params: params.value }
local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
} else {
// no saved templates detected: start a new collection that contains only the
// "default" entry and persist it
console.log('Initializing LocalStorage and saving default template')
savedUserTemplates.value = { "default": { session: session.value, params: params.value } }
local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
}
// Select the stored "default" template as the current one.
//
// The selection is replaced by assigning a fresh object to `.value` instead of
// mutating the existing object's fields: an in-place mutation leaves the stored
// object identity unchanged, so anything that re-renders on `selectedUserTemplate`
// (e.g. the reset button, which reads `.value.name`) would not be notified.
// This only updates the selection; call userTemplateApply() to copy the template
// into `session` / `params`.
function userTemplateResetToDefault() {
  console.log('Reseting themplate to default')
  selectedUserTemplate.value = {
    name: 'default',
    data: savedUserTemplates.value['default'],
  };
}
// Copy the session and params carried by template `t` (shape: { name, data: { session, params } })
// into the live `session` and `params` signals.
function userTemplateApply(t) {
  const { data } = t;
  session.value = data.session;
  params.value = data.params;
}
// Select the default template, then push its session/params into the live state.
function userTemplateResetToDefaultAndApply() {
  userTemplateResetToDefault();
  const active = selectedUserTemplate.value;
  userTemplateApply(active);
}
// Restore the template from the autosave slot ('user_templates_last') when one exists,
// otherwise fall back to the default template; either way, apply the selected
// template to the live session/params afterwards.
function userTemplateLoadAndApplyAutosaved() {
  const autosaved = local_storage_getDataAsObject('user_templates_last');
  if (!autosaved) {
    // nothing in the autosave slot, so start from the default template
    console.log('No autosaved template found, using default template');
    userTemplateResetToDefault();
  } else {
    console.log('Autosaved template found, restoring');
    selectedUserTemplate.value = autosaved;
  }
  // update internal data from whichever template is now selected
  console.log('Applying template');
  userTemplateApply(selectedUserTemplate.value);
}
//console.log(savedUserTemplates.value)
//console.log(selectedUserTemplate.value)
// Persist the current session/params into the autosave slot ('user_templates_last').
// The default template is never saved over: when it is the selected one, the state is
// stored under a new "UserTemplate-<timestamp>" name and then loaded back and applied,
// which makes that new template the selected one.
function userTemplateAutosave() {
  console.log('Template Autosave...')
  const activeName = selectedUserTemplate.value.name
  const snapshotAs = (name) => ({ 'name': name, 'data': { 'session': session.value, 'params': params.value } })

  if (activeName != 'default') {
    // a user template is already selected: refresh the autosave slot under its name
    local_storage_setDataFromObject('user_templates_last', snapshotAs(activeName))
    return
  }

  // the default template is selected, so save under a freshly generated name instead
  const freshName = 'UserTemplate-' + Date.now().toString()
  const snapshot = snapshotAs(freshName)
  console.log('Saving as ' + freshName)
  // save in the autosave slot
  local_storage_setDataFromObject('user_templates_last', snapshot)
  // and load it back and apply
  userTemplateLoadAndApplyAutosaved()
}
// Runs once while this script is evaluated: restore the autosaved template if one is
// stored, otherwise select the default template, then apply it to session/params.
console.log('Checking for autosaved last used template')
userTemplateLoadAndApplyAutosaved()
/* END: Support for storing prompt templates and parameters in browser LocalStorage */
const llamaStats = signal(null) const llamaStats = signal(null)
const controller = signal(null) const controller = signal(null)
@ -346,8 +476,34 @@
` `
}; };
// Click handler for the reset button: suppress the event's default action
// and restore + apply the default template.
const userTemplateReset = (event) => {
  event.preventDefault();
  userTemplateResetToDefaultAndApply();
}
// Renders the reset control: a disabled informational button while the default
// template is selected, otherwise a button that resets everything to the default.
const UserTemplateResetButton = () => {
  const usingDefault = selectedUserTemplate.value.name == 'default';
  if (!usingDefault) {
    return html`
<button onclick=${userTemplateReset}>Reset all to default</button>
`
  }
  return html`
<button disabled>Using default template</button>
`
};
// autosave the template whenever the session or the params value changes
const templateAutosaveDeps = [session.value, params.value]
useEffect(() => {
  userTemplateAutosave()
}, templateAutosaveDeps)
return html` return html`
<form> <form>
<fieldset>
<${UserTemplateResetButton}/>
</fieldset>
<fieldset> <fieldset>
<div> <div>
<label for="prompt">Prompt</label> <label for="prompt">Prompt</label>

View file

@ -15,6 +15,7 @@
#include "index.html.hpp" #include "index.html.hpp"
#include "index.js.hpp" #include "index.js.hpp"
#include "completion.js.hpp" #include "completion.js.hpp"
#include "json-schema-to-grammar.mjs.hpp"
#ifndef SERVER_VERBOSE #ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1 #define SERVER_VERBOSE 1
@ -278,7 +279,7 @@ struct llama_server_context
grammar_parser::print_grammar(stderr, parsed_grammar); grammar_parser::print_grammar(stderr, parsed_grammar);
{ {
auto it = params.logit_bias.find(llama_token_eos()); auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) { if (it != params.logit_bias.end() && it->second == -INFINITY) {
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {}); LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
} }
@ -401,7 +402,7 @@ struct llama_server_context
if (params.n_predict == 0) if (params.n_predict == 0)
{ {
has_next_token = false; has_next_token = false;
result.tok = llama_token_eos(); result.tok = llama_token_eos(ctx);
return result; return result;
} }
@ -441,7 +442,7 @@ struct llama_server_context
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
// Apply penalties // Apply penalties
float nl_logit = logits[llama_token_nl()]; float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p, llama_sample_repetition_penalty(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@ -451,7 +452,7 @@ struct llama_server_context
last_n_repeat, alpha_frequency, alpha_presence); last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) if (!penalize_nl)
{ {
logits[llama_token_nl()] = nl_logit; logits[llama_token_nl(ctx)] = nl_logit;
} }
if (grammar != nullptr) { if (grammar != nullptr) {
@ -514,7 +515,7 @@ struct llama_server_context
// decrement remaining sampling budget // decrement remaining sampling budget
--n_remain; --n_remain;
if (!embd.empty() && embd.back() == llama_token_eos()) if (!embd.empty() && embd.back() == llama_token_eos(ctx))
{ {
// stopping_word = llama_token_to_str(ctx, embd.back()); // stopping_word = llama_token_to_str(ctx, embd.back());
has_next_token = false; has_next_token = false;
@ -948,7 +949,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
static json format_generation_settings(llama_server_context &llama) static json format_generation_settings(llama_server_context &llama)
{ {
const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
eos_bias->second < 0.0f && std::isinf(eos_bias->second); eos_bias->second < 0.0f && std::isinf(eos_bias->second);
@ -1083,7 +1084,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
llama.params.logit_bias.clear(); llama.params.logit_bias.clear();
if (body.value("ignore_eos", false)) if (body.value("ignore_eos", false))
{ {
llama.params.logit_bias[llama_token_eos()] = -INFINITY; llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
} }
const auto &logit_bias = body.find("logit_bias"); const auto &logit_bias = body.find("logit_bias");
@ -1199,6 +1200,12 @@ int main(int argc, char **argv)
res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript"); res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
return false; }); return false; });
// this is only called if no index.html is found in the public --path
svr.Get("/json-schema-to-grammar.mjs", [](const Request &, Response &res)
{
res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
return false; });
svr.Post("/completion", [&llama](const Request &req, Response &res) svr.Post("/completion", [&llama](const Request &req, Response &res)
{ {
auto lock = llama.lock(); auto lock = llama.lock();

View file

@ -106,7 +106,7 @@ int main(int argc, char ** argv) {
new_token_id = llama_sample_token_greedy(ctx , &candidates_p); new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
// is it an end of stream ? // is it an end of stream ?
if (new_token_id == llama_token_eos()) { if (new_token_id == llama_token_eos(ctx)) {
fprintf(stderr, " [end of text]\n"); fprintf(stderr, " [end of text]\n");
break; break;
} }

View file

@ -1996,7 +1996,7 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
} }
} }
void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
int n_vocab = target_logits->ne[0]; int n_vocab = target_logits->ne[0];
@ -2005,7 +2005,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
ggml_set_f32(target_logits, -1.0f/n_vocab); ggml_set_f32(target_logits, -1.0f/n_vocab);
ggml_set_f32(target_probs, 0.0f); ggml_set_f32(target_probs, 0.0f);
ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx));
for (int i=1; i<n_tokens+1; ++i) { for (int i=1; i<n_tokens+1; ++i) {
int token = clamp(train_data[sample+i-1], 0, n_vocab-1); int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
set_f32_2d(target_logits, token, i-1, +1.0f); set_f32_2d(target_logits, token, i-1, +1.0f);
@ -2016,7 +2016,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
} }
} }
void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(tokens_input->n_dims == 2);
GGML_ASSERT(target_logits->n_dims == 3); GGML_ASSERT(target_logits->n_dims == 3);
GGML_ASSERT(target_probs->n_dims == 3); GGML_ASSERT(target_probs->n_dims == 3);
@ -2036,7 +2036,7 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai
size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples]; size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples];
GGML_ASSERT(sample+n_tokens-1 < n_train_data); GGML_ASSERT(sample+n_tokens-1 < n_train_data);
set_i32_2d(tokens_input, 0, k, llama_token_bos()); set_i32_2d(tokens_input, 0, k, llama_token_bos(lctx));
for (int i=1; i<n_tokens+1; ++i) { for (int i=1; i<n_tokens+1; ++i) {
int token = clamp(train_data[sample+i-1], 0, n_vocab-1); int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
// print_token(lctx, token); // print_token(lctx, token);
@ -2294,7 +2294,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
const auto params = sampler->params; const auto params = sampler->params;
// Apply penalties // Apply penalties
const float nl_logit = logits[llama_token_nl()]; const float nl_logit = logits[llama_token_nl(ctx)];
const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx); const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);
@ -2313,7 +2313,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
params.alpha_presence); params.alpha_presence);
if (!params.penalize_nl) { if (!params.penalize_nl) {
logits[llama_token_nl()] = nl_logit; logits[llama_token_nl(ctx)] = nl_logit;
} }
llama_token token = 0; llama_token token = 0;
@ -3181,7 +3181,7 @@ int main(int argc, char ** argv) {
std::vector<int> train_samples; std::vector<int> train_samples;
train_samples.push_back(0); train_samples.push_back(0);
for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) { for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) {
if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl())) { if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) {
train_samples.push_back(i); train_samples.push_back(i);
} }
} }
@ -3341,7 +3341,7 @@ int main(int argc, char ** argv) {
struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
struct ggml_tensor * target_probs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); struct ggml_tensor * target_probs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs); get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
for (int i=sample_ctx; i<n_tokens; ++i) { for (int i=sample_ctx; i<n_tokens; ++i) {
ggml_set_i32_1d(tokens_input, i, n_vocab/2); ggml_set_i32_1d(tokens_input, i, n_vocab/2);
} }

View file

@ -14,8 +14,6 @@
with pkgs.darwin.apple_sdk_11_0.frameworks; [ with pkgs.darwin.apple_sdk_11_0.frameworks; [
Accelerate Accelerate
MetalKit MetalKit
MetalPerformanceShaders
MetalPerformanceShadersGraph
] ]
else if isAarch32 && isDarwin then else if isAarch32 && isDarwin then
with pkgs.darwin.apple_sdk.frameworks; [ with pkgs.darwin.apple_sdk.frameworks; [

View file

@ -67,6 +67,8 @@ struct ggml_allocr {
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE]; struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
size_t max_size; size_t max_size;
bool measure; bool measure;
int parse_seq[GGML_MAX_NODES];
bool has_parse_seq;
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024]; struct ggml_tensor * allocated_tensors[1024];
@ -111,10 +113,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
size_t max_avail = 0; size_t max_avail = 0;
// find the best fitting free block // find the best fitting free block besides the last block
int best_fit_block = -1; int best_fit_block = -1;
size_t best_fit_size = SIZE_MAX; size_t best_fit_size = SIZE_MAX;
for (int i = 0; i < alloc->n_free_blocks; i++) { for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
struct free_block * block = &alloc->free_blocks[i]; struct free_block * block = &alloc->free_blocks[i];
max_avail = MAX(max_avail, block->size); max_avail = MAX(max_avail, block->size);
if (block->size >= size && block->size <= best_fit_size) { if (block->size >= size && block->size <= best_fit_size) {
@ -126,11 +128,18 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
AT_PRINTF("block %d\n", best_fit_block); AT_PRINTF("block %d\n", best_fit_block);
if (best_fit_block == -1) { if (best_fit_block == -1) {
// the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
max_avail = MAX(max_avail, block->size);
} else {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
__func__, size, max_avail); __func__, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer"); GGML_ASSERT(!"not enough space in the buffer");
return; return;
} }
}
struct free_block * block = &alloc->free_blocks[best_fit_block]; struct free_block * block = &alloc->free_blocks[best_fit_block];
void * addr = block->addr; void * addr = block->addr;
block->addr = (char*)block->addr + size; block->addr = (char*)block->addr + size;
@ -229,6 +238,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
alloc->n_free_blocks++; alloc->n_free_blocks++;
} }
// Set the order in which the allocator visits graph nodes.
//
// alloc - allocator to configure
// list  - node indices in the desired visiting order; entries equal to -1 are skipped
//         (presumably the barrier markers of a concurrency list - confirm with the caller)
// n     - number of entries in list, including the -1 entries
//
// The entries are compacted into alloc->parse_seq and alloc->has_parse_seq is set.
// The number of real entries is counted before anything is written: if there are more
// than alloc->parse_seq can hold, an error is printed and the allocator is left
// unchanged instead of writing past the end of the array.
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
    const int capacity = (int) (sizeof(alloc->parse_seq) / sizeof(alloc->parse_seq[0]));

    // first pass: n may exceed the capacity because of the skipped -1 entries,
    // so only the real entries are counted against it
    int n_valid = 0;
    for (int i = 0; i < n; i++) {
        if (list[i] != -1) {
            n_valid++;
        }
    }
    if (n_valid > capacity) {
        fprintf(stderr, "%s: parse sequence is too long (%d entries, at most %d supported)\n",
                __func__, n_valid, capacity);
        return;
    }

    // second pass: copy the real entries, preserving their relative order
    int pos = 0;
    for (int i = 0; i < n; i++) {
        if (list[i] != -1) {
            alloc->parse_seq[pos] = list[i];
            pos++;
        }
    }
    alloc->has_parse_seq = true;
}
void ggml_allocr_reset(struct ggml_allocr * alloc) { void ggml_allocr_reset(struct ggml_allocr * alloc) {
alloc->n_free_blocks = 1; alloc->n_free_blocks = 1;
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
@ -248,6 +268,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
/*.hash_table = */ {{0}}, /*.hash_table = */ {{0}},
/*.max_size = */ 0, /*.max_size = */ 0,
/*.measure = */ false, /*.measure = */ false,
/*.parse_seq = */ {0},
/*.has_parse_seq = */ false,
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ = {0}, /*.allocated_tensors = */ = {0},
#endif #endif
@ -275,6 +297,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
/*.hash_table = */ {{0}}, /*.hash_table = */ {{0}},
/*.max_size = */ 0, /*.max_size = */ 0,
/*.measure = */ true, /*.measure = */ true,
/*.parse_seq = */ {0},
/*.has_parse_seq = */ false,
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ = {0}, /*.allocated_tensors = */ = {0},
#endif #endif
@ -473,7 +497,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
allocate_node(alloc, input); allocate_node(alloc, input);
} }
} }
for (int i = 0; i < gf->n_nodes; i++) { for (int ind = 0; ind < gf->n_nodes; ind++) {
int i;
if (alloc->has_parse_seq) {
i = alloc->parse_seq[ind];
} else {
i = ind;
}
struct ggml_tensor * node = gf->nodes[i]; struct ggml_tensor * node = gf->nodes[i];
// allocate parents (leafs) // allocate parents (leafs)

View file

@ -10,6 +10,10 @@ extern "C" {
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
// tell the allocator to parse nodes following the order described in the list
// you should call this if your graph is optimized to execute out-of-order
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);

View file

@ -6469,3 +6469,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
func(tensor->src[0], tensor->src[1], tensor); func(tensor->src[0], tensor->src[1], tensor);
return true; return true;
} }
// Returns the device count reported by cudaGetDeviceCount.
// The runtime call is wrapped in CUDA_CHECK.
int ggml_cuda_get_device_count() {
    int n_devices;
    CUDA_CHECK(cudaGetDeviceCount(&n_devices));
    return n_devices;
}
void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
snprintf(description, description_size, "%s", prop.name);
}

View file

@ -8,29 +8,25 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16 #define GGML_CUDA_MAX_DEVICES 16
void ggml_init_cublas(void); GGML_API void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split); GGML_API void * ggml_cuda_host_malloc(size_t size);
GGML_API void ggml_cuda_host_free(void * ptr);
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_set_main_device(int main_device);
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
GGML_API void ggml_cuda_free_scratch(void);
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
// TODO: export these with GGML_API GGML_API int ggml_cuda_get_device_count(void);
void * ggml_cuda_host_malloc(size_t size); GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
void ggml_cuda_host_free(void * ptr);
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
void ggml_cuda_free_data(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
void ggml_cuda_set_main_device(int main_device);
void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
void ggml_cuda_set_scratch_size(size_t scratch_size);
void ggml_cuda_free_scratch(void);
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -66,10 +66,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
// try to find operations that can be run concurrently in the graph // try to find operations that can be run concurrently in the graph
// you should run it again if the topology of your graph changes // you should run it again if the topology of your graph changes
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
// if the graph has been optimized for concurrently dispatch // if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx); int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
// output the concur_list for ggml_alloc
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
// same as ggml_graph_compute but uses Metal // same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel // creates gf->n_threads command buffers in parallel

View file

@ -5,7 +5,6 @@
#import <Foundation/Foundation.h> #import <Foundation/Foundation.h>
#import <Metal/Metal.h> #import <Metal/Metal.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#undef MIN #undef MIN
#undef MAX #undef MAX
@ -79,6 +78,14 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_DECL_KERNEL(rope); GGML_METAL_DECL_KERNEL(rope);
GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(alibi_f32);
GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f16);
@ -110,13 +117,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->n_buffers = 0; ctx->n_buffers = 0;
ctx->concur_list_len = 0; ctx->concur_list_len = 0;
// determine if we can use MPS
if (MPSSupportsMTLDevice(ctx->device)) {
fprintf(stderr, "%s: using MPS\n", __func__);
} else {
fprintf(stderr, "%s: not using MPS\n", __func__);
GGML_ASSERT(false && "MPS not supported");
}
#if 0 #if 0
// compile from source string and show compile log // compile from source string and show compile log
@ -163,10 +163,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
// load kernels // load kernels
{ {
NSError * error = nil;
#define GGML_METAL_ADD_KERNEL(name) \ #define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \
if (error) { \
fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \
}
GGML_METAL_ADD_KERNEL(add); GGML_METAL_ADD_KERNEL(add);
GGML_METAL_ADD_KERNEL(add_row); GGML_METAL_ADD_KERNEL(add_row);
@ -196,6 +201,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_ADD_KERNEL(rope); GGML_METAL_ADD_KERNEL(rope);
GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(alibi_f32);
GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f16);
@ -243,11 +256,12 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
ctx->n_cb = n_cb; ctx->n_cb = n_cb;
} }
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) { int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
if (ctx->concur_list_len) { return ctx->concur_list_len;
return true; }
}
return false; int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
return ctx->concur_list;
} }
// finds the Metal buffer that contains the tensor data on the GPU device // finds the Metal buffer that contains the tensor data on the GPU device
@ -390,7 +404,7 @@ void ggml_metal_get_tensor(
void ggml_metal_graph_find_concurrency( void ggml_metal_graph_find_concurrency(
struct ggml_metal_context * ctx, struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) { struct ggml_cgraph * gf, bool check_mem) {
int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
int nodes_unused[GGML_MAX_CONCUR]; int nodes_unused[GGML_MAX_CONCUR];
@ -437,7 +451,7 @@ void ggml_metal_graph_find_concurrency(
} }
} }
} }
if (exe_flag) { if (exe_flag && check_mem) {
// check if nodes[i]'s data will be overwritten by a node before nodes[i]. // check if nodes[i]'s data will be overwritten by a node before nodes[i].
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3] // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
int64_t data_start = (int64_t) gf->nodes[i]->data; int64_t data_start = (int64_t) gf->nodes[i]->data;
@ -521,7 +535,7 @@ void ggml_metal_graph_compute(
id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx]; id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
id<MTLComputeCommandEncoder> encoder = nil; id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
const int node_start = (cb_idx + 0) * n_nodes_per_cb; const int node_start = (cb_idx + 0) * n_nodes_per_cb;
const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb; const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
@ -530,10 +544,6 @@ void ggml_metal_graph_compute(
const int i = has_concur ? ctx->concur_list[ind] : ind; const int i = has_concur ? ctx->concur_list[ind] : ind;
if (i == -1) { if (i == -1) {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
continue;
}
[encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
continue; continue;
} }
@ -607,10 +617,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_ADD: case GGML_OP_ADD:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
if (ggml_nelements(src1) == ne10) { if (ggml_nelements(src1) == ne10) {
// src1 is a row // src1 is a row
[encoder setComputePipelineState:ctx->pipeline_add_row]; [encoder setComputePipelineState:ctx->pipeline_add_row];
@ -628,10 +634,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_MUL: case GGML_OP_MUL:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
if (ggml_nelements(src1) == ne10) { if (ggml_nelements(src1) == ne10) {
// src1 is a row // src1 is a row
[encoder setComputePipelineState:ctx->pipeline_mul_row]; [encoder setComputePipelineState:ctx->pipeline_mul_row];
@ -649,10 +651,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_SCALE: case GGML_OP_SCALE:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const float scale = *(const float *) src1->data; const float scale = *(const float *) src1->data;
[encoder setComputePipelineState:ctx->pipeline_scale]; [encoder setComputePipelineState:ctx->pipeline_scale];
@ -668,10 +666,6 @@ void ggml_metal_graph_compute(
switch (ggml_get_unary_op(gf->nodes[i])) { switch (ggml_get_unary_op(gf->nodes[i])) {
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_SILU:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_silu]; [encoder setComputePipelineState:ctx->pipeline_silu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
@ -682,10 +676,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_RELU:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_relu]; [encoder setComputePipelineState:ctx->pipeline_relu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
@ -696,10 +686,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_gelu]; [encoder setComputePipelineState:ctx->pipeline_gelu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
@ -716,10 +702,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int nth = 32; const int nth = 32;
[encoder setComputePipelineState:ctx->pipeline_soft_max]; [encoder setComputePipelineState:ctx->pipeline_soft_max];
@ -734,10 +716,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int n_past = ((int32_t *)(dst->op_params))[0]; const int n_past = ((int32_t *)(dst->op_params))[0];
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
@ -755,53 +733,43 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne00 == ne10); GGML_ASSERT(ne00 == ne10);
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
uint gqa = ne12/ne02;
GGML_ASSERT(ne03 == ne13); GGML_ASSERT(ne03 == ne13);
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
if (ggml_is_contiguous(src0) && if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) && ggml_is_contiguous(src1) &&
(src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) { src1t == GGML_TYPE_F32 &&
[ctx->device supportsFamily:MTLGPUFamilyApple7] &&
if (encoder != nil) { ne00%32 == 0 &&
[encoder endEncoding]; ne11 > 1) {
encoder = nil; switch (src0->type) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
} }
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
// for F32 x F32 we use MPS [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt]; [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
MPSMatrixDescriptor * desc = [MPSMatrixDescriptor [encoder setThreadgroupMemoryLength:8192 atIndex:0];
matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32]; [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
initWithDevice:ctx->device transposeLeft:false transposeRight:true
resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
// we need to do ne12 multiplications
// TODO: is there a way to do this in parallel - currently very slow ..
// TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
for (int64_t i02 = 0; i02 < ne12; ++i02) {
size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
size_t offs_src1_cur = offs_src1 + i02*nb12;
size_t offs_dst_cur = offs_dst + i02*nb2;
MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst_cur descriptor:desc ];
[mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
} }
} else { else {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
int nth0 = 32; int nth0 = 32;
int nth1 = 1; int nth1 = 1;
@ -900,23 +868,24 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) { src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q3_K) { else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64 #ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else #else
[encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif #endif
} }
else if (src0t == GGML_TYPE_Q5_K) { else if (src0t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q6_K) { else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@ -925,10 +894,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
@ -954,10 +919,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
@ -977,10 +938,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_NORM: case GGML_OP_NORM:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const float eps = 1e-5f; const float eps = 1e-5f;
const int nth = 256; const int nth = 256;
@ -999,10 +956,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_ALIBI: case GGML_OP_ALIBI:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
GGML_ASSERT((src0t == GGML_TYPE_F32)); GGML_ASSERT((src0t == GGML_TYPE_F32));
const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
@ -1042,10 +995,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int n_past = ((int32_t *) dst->op_params)[0]; const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
@ -1086,10 +1035,6 @@ void ggml_metal_graph_compute(
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_CONT: case GGML_OP_CONT:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int nth = 32; const int nth = 32;
switch (src0t) { switch (src0t) {

File diff suppressed because it is too large Load diff

2
ggml.c
View file

@ -19290,7 +19290,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
} }
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
free(data); free(data);
} if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) { } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
GGML_ASSERT(false && "nested arrays not supported"); GGML_ASSERT(false && "nested arrays not supported");
} else { } else {
gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);

2
ggml.h
View file

@ -214,7 +214,7 @@
#define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1 #define GGML_EXIT_ABORTED 1
#define GGUF_MAGIC 0x47475546 // "GGUF" #define GGUF_MAGIC 0x46554747 // "GGUF"
#define GGUF_VERSION 1 #define GGUF_VERSION 1
#define GGUF_DEFAULT_ALIGNMENT 32 #define GGUF_DEFAULT_ALIGNMENT 32

381
gguf.py
View file

@ -1,21 +1,17 @@
"""TODOs import shutil
1. Implement writers for known architectures, LLaMA in particular.
2. Add docstrings from the format specs.
3. After development is done, Convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org.
"""
import sys import sys
import struct import struct
import tempfile
import numpy as np import numpy as np
from enum import IntEnum from enum import IntEnum, auto
from typing import Any, IO, List from typing import Any, IO, List
# #
# constants # constants
# #
GGUF_MAGIC = 0x47475546 GGUF_MAGIC = 0x46554747
GGUF_VERSION = 1 GGUF_VERSION = 1
GGUF_DEFAULT_ALIGNMENT = 32 GGUF_DEFAULT_ALIGNMENT = 32
@ -27,30 +23,29 @@ KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author" KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url" KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description" KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_FILE_TYPE = "general.file_type"
KEY_GENERAL_LICENSE = "general.license" KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url" KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository" KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
# LLM # LLM
KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length" KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length" KEY_LLM_EMBEDDING_LENGTH = "{arch}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{llm}.block_count" KEY_LLM_BLOCK_COUNT = "{arch}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length" KEY_LLM_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual" KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout" KEY_LLM_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
# attention # attention
KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count" KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv" KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias" KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv" KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{llm}.attention.layer_norm_epsilon" KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{llm}.attention.layer_norm_rms_epsilon" KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
# RoPE # RoPE
KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count" KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
KEY_ROPE_SCALE = "{llm}.rope.scale" KEY_ROPE_SCALE = "{arch}.rope.scale"
# tokenization # tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model" KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
@ -70,91 +65,251 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
# recommended mapping of model tensor names for storage in gguf # recommended mapping of model tensor names for storage in gguf
# #
def get_tensor_name_map(n_blocks : int):
class MODEL_ARCH(IntEnum):
    """Model architectures this module has tensor-name tables for.

    Members are numbered consecutively from 1 in declaration order.
    """
    LLAMA   = 1
    FALCON  = 2
    GPT2    = 3
    GPTJ    = 4
    GPTNEOX = 5
    MPT     = 6
class MODEL_TENSOR(IntEnum):
    """Logical tensor roles that can appear in a model.

    Members are numbered consecutively from 1 in declaration order.
    """
    # model-level tensors
    TOKEN_EMBD    = 1
    POS_EMBD      = 2
    OUTPUT        = 3
    OUTPUT_NORM   = 4
    ROPE_FREQS    = 5
    # attention tensors
    ATTN_Q        = 6
    ATTN_K        = 7
    ATTN_V        = 8
    ATTN_QKV      = 9
    ATTN_OUT      = 10
    ATTN_NORM     = 11
    ATTN_NORM_2   = 12
    ATTN_ROT_EMBD = 13
    # feed-forward tensors
    FFN_GATE      = 14
    FFN_DOWN      = 15
    FFN_UP        = 16
    FFN_NORM      = 17
# Lower-case name string for every MODEL_ARCH member.
# NOTE(review): presumably the value substituted for "{arch}" in the key
# templates above - confirm against the writer code.
MODEL_ARCH_NAMES = dict([
    (MODEL_ARCH.LLAMA,   "llama"),
    (MODEL_ARCH.FALCON,  "falcon"),
    (MODEL_ARCH.GPT2,    "gpt2"),
    (MODEL_ARCH.GPTJ,    "gptj"),
    (MODEL_ARCH.GPTNEOX, "gptneox"),
    (MODEL_ARCH.MPT,     "mpt"),
])
# Per-architecture tensor-name tables. Names containing "{bid}" are templates
# for per-block tensors and are expanded with str.format(bid=<block index>).
_LLAMA_TENSOR_NAMES = {
    MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
    MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
    MODEL_TENSOR.OUTPUT:        "output",
    MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
    MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
    MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
    MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
    MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
    MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
    MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
    MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
    MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
}

_GPTNEOX_TENSOR_NAMES = {
    MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
    MODEL_TENSOR.OUTPUT_NORM: "output_norm",
    MODEL_TENSOR.OUTPUT:      "output",
    MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
    MODEL_TENSOR.ATTN_QKV:    "blk.{bid}.attn_qkv",
    MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
    MODEL_TENSOR.FFN_NORM:    "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
}

_FALCON_TENSOR_NAMES = {
    MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
    MODEL_TENSOR.OUTPUT_NORM: "output_norm",
    MODEL_TENSOR.OUTPUT:      "output",
    MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
    MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
    MODEL_TENSOR.ATTN_QKV:    "blk.{bid}.attn_qkv",
    MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
    MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
}

# Architectures missing from this table have no name mapping yet.
MODEL_TENSOR_NAMES = {
    MODEL_ARCH.LLAMA:   _LLAMA_TENSOR_NAMES,
    MODEL_ARCH.GPTNEOX: _GPTNEOX_TENSOR_NAMES,
    MODEL_ARCH.FALCON:  _FALCON_TENSOR_NAMES,
    MODEL_ARCH.GPT2: {
        # TODO
    },
    # TODO
}
# Per-architecture list of tensors that will not be serialized.
# Architectures without an entry skip nothing.
MODEL_TENSOR_SKIP = {
    MODEL_ARCH.LLAMA: [MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD],
}
# TODO: the following helper functions should be removed
# instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
# however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
# REMOVE
def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
    """Return True if `name` is a tensor that must not be serialized for `arch`.

    Args:
        arch: architecture whose MODEL_TENSOR_SKIP entry is consulted; an
            architecture with no entry never skips anything.
        n_blocks: number of blocks; per-block templates are expanded for
            block ids 0 .. n_blocks - 1.
        name: tensor name to test, in the form produced by the
            MODEL_TENSOR_NAMES templates.

    Raises:
        KeyError: if a tensor listed in MODEL_TENSOR_SKIP[arch] has no name
            in MODEL_TENSOR_NAMES[arch].
    """
    for skip in MODEL_TENSOR_SKIP.get(arch, []):
        template = MODEL_TENSOR_NAMES[arch][skip]

        if "{bid}" not in template:
            # Model-level tensor (e.g. "rope_freqs"): the name does not depend
            # on the block index, so compare it once. This also matches when
            # n_blocks == 0, which a loop over the blocks would miss.
            if name == template:
                return True
            continue

        # Per-block tensor: match against the expanded name of every block.
        if any(name == template.format(bid=bid) for bid in range(n_blocks)):
            return True

    return False
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
tensor_map = {} tensor_map = {}
# Token embeddings # Token embeddings
mapped_to = "token_embd" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
tensor_map["transformer.word_embeddings"] = mapped_to # falcon tensor_map["transformer.word_embeddings"] = mapped_to # falcon
tensor_map["model.embed_tokens"] = mapped_to # llama-hf tensor_map["model.embed_tokens"] = mapped_to # llama-hf
tensor_map["tok_embeddings"] = mapped_to # llama-pth tensor_map["tok_embeddings"] = mapped_to # llama-pth
# Position embeddings # Position embeddings
mapped_to = "pos_embd" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
tensor_map["transformer.wpe"] = mapped_to # gpt2 tensor_map["transformer.wpe"] = mapped_to # gpt2
# Output
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
tensor_map["embed_out"] = mapped_to # gptneox
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
tensor_map["output"] = mapped_to # llama-pth
# Output norm # Output norm
mapped_to = "output_norm" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
tensor_map["transformer.norm_f"] = mapped_to # mpt tensor_map["transformer.norm_f"] = mapped_to # mpt
tensor_map["model.norm"] = mapped_to # llama-hf tensor_map["model.norm"] = mapped_to # llama-hf
tensor_map["norm"] = mapped_to # llama-pth tensor_map["norm"] = mapped_to # llama-pth
# Output
mapped_to = "output" # Rope frequencies
tensor_map["embed_out"] = mapped_to # gptneox mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
tensor_map["output"] = mapped_to # llama-pth tensor_map["rope.freqs"] = mapped_to # llama-pth
# Attention and fee-forward layer blocks
for i in range(0,n_blocks): # Attention and feed-forward blocks
for i in range(0, n_blocks):
# Attention norm # Attention norm
mapped_to = "blk."+str(i)+".attn_norm" # TODO: is there are simpler way to write these 2 lines in Python?
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
mapped_to = mapped_to.format(bid=i) if mapped_to else None
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2 tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
# Attention norm 2 # Attention norm 2
mapped_to = "blk."+str(i)+".attn_norm_2" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
# Attention query-key-value # Attention query-key-value
mapped_to = "blk."+str(i)+".attn_qkv" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2 tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
# Attention query # Attention query
mapped_to = "blk."+str(i)+".attn_q" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
# Attention key # Attention key
mapped_to = "blk."+str(i)+".attn_k" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
# Attention value # Attention value
mapped_to = "blk."+str(i)+".attn_v" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
# Attention output # Attention output
mapped_to = "blk."+str(i)+".attn_output" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2 tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
# Rotary embeddings
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to # llama-pth
# Feed-forward norm # Feed-forward norm
mapped_to = "blk."+str(i)+".ffn_norm" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2 tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
# Feed-forward up # Feed-forward up
mapped_to = "blk."+str(i)+".ffn_up" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2 tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
# Feed-forward gate # Feed-forward gate
mapped_to = "blk."+str(i)+".ffn_gate" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
# Feed-forward down # Feed-forward down
mapped_to = "blk."+str(i)+".ffn_down" mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2 tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
@ -168,6 +323,7 @@ def get_tensor_name_map(n_blocks : int):
# implementation # implementation
# #
class GGMLQuantizationType(IntEnum): class GGMLQuantizationType(IntEnum):
F32 = 0 F32 = 0
F16 = 1 F16 = 1
@ -203,14 +359,16 @@ class GGUFValueType(IntEnum):
class GGUFWriter: class GGUFWriter:
def __init__(self, fout: IO): def __init__(self, path: str, arch: str):
self.fout = fout self.fout = open(path, "wb")
self.arch = arch
self.offset_tensor = 0 self.offset_tensor = 0
self.data_alignment = GGUF_DEFAULT_ALIGNMENT self.data_alignment = GGUF_DEFAULT_ALIGNMENT
self.kv_data = b"" self.kv_data = b""
self.kv_data_count = 0 self.kv_data_count = 0
self.ti_data = b"" self.ti_data = b""
self.ti_data_count = 0 self.ti_data_count = 0
self.add_architecture()
def write_header_to_file(self): def write_header_to_file(self):
self.fout.write(struct.pack("<I", GGUF_MAGIC)) self.fout.write(struct.pack("<I", GGUF_MAGIC))
@ -228,11 +386,6 @@ class GGUFWriter:
self.fout.write(self.ti_data) self.fout.write(self.ti_data)
self.flush() self.flush()
@classmethod
def open(cls, path: str) -> "GGUFWriter":
f = open(path, "wb")
return cls(f)
def add_key(self, key: str): def add_key(self, key: str):
self.add_val(key, GGUFValueType.STRING, add_vtype=False) self.add_val(key, GGUFValueType.STRING, add_vtype=False)
@ -269,7 +422,8 @@ class GGUFWriter:
self.add_val(val, GGUFValueType.BOOL) self.add_val(val, GGUFValueType.BOOL)
def add_string(self, key: str, val: str): def add_string(self, key: str, val: str):
if len(val) == 0: return if len(val) == 0:
return
self.add_key(key) self.add_key(key)
self.add_val(val, GGUFValueType.STRING) self.add_val(val, GGUFValueType.STRING)
@ -323,6 +477,8 @@ class GGUFWriter:
return ((x + n - 1) // n) * n return ((x + n - 1) // n) * n
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int): def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int):
assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
encoded_name = name.encode("utf8") encoded_name = name.encode("utf8")
self.ti_data += struct.pack("<I", len(encoded_name)) self.ti_data += struct.pack("<I", len(encoded_name))
self.ti_data += encoded_name self.ti_data += encoded_name
@ -331,14 +487,26 @@ class GGUFWriter:
for i in range(n_dims): for i in range(n_dims):
self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i]) self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16 dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
self.ti_data += struct.pack("<I", dtype) self.ti_data += struct.pack("<I", dtype)
self.ti_data += struct.pack("<Q", self.offset_tensor) self.ti_data += struct.pack("<Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment) self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1 self.ti_data_count += 1
def write_tensor_to_file(self, tensor: np.ndarray): def add_tensor(self, name: str, tensor: np.ndarray):
if not hasattr(self, "temp_file"):
self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
self.temp_file.seek(0)
self.add_tensor_info(name, tensor.shape, tensor.dtype, tensor.nbytes)
tensor.tofile(self.temp_file)
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
if pad != 0:
self.temp_file.write(bytes([0] * pad))
def write_tensor_data(self, tensor: np.ndarray):
pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell() pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
if pad != 0: if pad != 0:
self.fout.write(bytes([0] * pad)) self.fout.write(bytes([0] * pad))
@ -349,21 +517,33 @@ class GGUFWriter:
if pad != 0: if pad != 0:
self.fout.write(bytes([0] * pad)) self.fout.write(bytes([0] * pad))
def write_tensors_to_file(self):
    """Write the tensor-info section, alignment padding and buffered tensor data.

    Tensor bytes are copied from ``self.temp_file``, the spooled temporary
    file that ``add_tensor`` creates the first time it is called. If no
    tensor was ever added that attribute does not exist; in that case only
    the tensor-info section and the padding are written, instead of failing
    with ``AttributeError``.
    """
    self.write_ti_data_to_file()

    # Pad the output up to the next multiple of the configured alignment
    # before any tensor bytes are appended.
    position = self.fout.tell()
    pad = GGUFWriter.ggml_pad(position, self.data_alignment) - position
    if pad != 0:
        self.fout.write(bytes(pad))

    spool = getattr(self, "temp_file", None)
    if spool is not None:
        # Rewind so the copy starts at the first buffered tensor.
        spool.seek(0)
        shutil.copyfileobj(spool, self.fout)

    self.flush()

    if spool is not None:
        spool.close()
def flush(self): def flush(self):
self.fout.flush() self.fout.flush()
def close(self): def close(self):
self.fout.close() self.fout.close()
def add_architecture(self, architecture: str): def add_architecture(self):
self.add_string(KEY_GENERAL_ARCHITECTURE, self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)
architecture)
def add_author(self, author: str): def add_author(self, author: str):
self.add_string(KEY_GENERAL_AUTHOR, author) self.add_string(KEY_GENERAL_AUTHOR, author)
def add_tensor_data_layout(self, layout: str): def add_tensor_data_layout(self, layout: str):
self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT , layout) self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
def add_url(self, url: str): def add_url(self, url: str):
self.add_string(KEY_GENERAL_URL, url) self.add_string(KEY_GENERAL_URL, url)
@ -371,9 +551,6 @@ class GGUFWriter:
def add_description(self, description: str): def add_description(self, description: str):
self.add_string(KEY_GENERAL_DESCRIPTION, description) self.add_string(KEY_GENERAL_DESCRIPTION, description)
def add_file_type(self, file_type: str):
self.add_string(KEY_GENERAL_FILE_TYPE, file_type)
def add_source_url(self, url: str): def add_source_url(self, url: str):
self.add_string(KEY_GENERAL_SOURCE_URL, url) self.add_string(KEY_GENERAL_SOURCE_URL, url)
@ -391,60 +568,60 @@ class GGUFWriter:
self.data_alignment = alignment self.data_alignment = alignment
self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment) self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
def add_context_length(self, llm: str, length: int): def add_context_length(self, length: int):
self.add_uint32( self.add_uint32(
KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length) KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)
def add_embedding_length(self, llm: str, length: int): def add_embedding_length(self, length: int):
self.add_uint32( self.add_uint32(
KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length) KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_block_count(self, llm: str, length: int): def add_block_count(self, length: int):
self.add_uint32( self.add_uint32(
KEY_LLM_BLOCK_COUNT.format(llm=llm), length) KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)
def add_feed_forward_length(self, llm: str, length: int): def add_feed_forward_length(self, length: int):
self.add_uint32( self.add_uint32(
KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length) KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
def add_parallel_residual(self, llm: str, use: bool): def add_parallel_residual(self, use: bool):
self.add_bool( self.add_bool(
KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use) KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
def add_tensor_data_layout(self, llm: str, layout: str): def add_tensor_data_layout(self, layout: str):
self.add_string( self.add_string(
KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout) KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
def add_head_count(self, llm: str, count: int): def add_head_count(self, count: int):
self.add_uint32( self.add_uint32(
KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count) KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
def add_head_count_kv(self, llm: str, count: int): def add_head_count_kv(self, count: int):
self.add_uint32( self.add_uint32(
KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count) KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)
def add_max_alibi_bias(self, llm: str, bias: float): def add_max_alibi_bias(self, bias: float):
self.add_float32( self.add_float32(
KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias) KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)
def add_clamp_kqv(self, llm: str, value: float): def add_clamp_kqv(self, value: float):
self.add_float32( self.add_float32(
KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value) KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)
def add_layer_norm_eps(self, llm: str, value: float): def add_layer_norm_eps(self, value: float):
self.add_float32( self.add_float32(
KEY_ATTENTION_LAYERNORM_EPS.format(llm=llm), value) KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)
def add_layer_norm_rms_eps(self, llm: str, value: float): def add_layer_norm_rms_eps(self, value: float):
self.add_float32( self.add_float32(
KEY_ATTENTION_LAYERNORM_RMS_EPS.format(llm=llm), value) KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)
def add_rope_dimension_count(self, llm: str, count: int): def add_rope_dimension_count(self, count: int):
self.add_uint32( self.add_uint32(
KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count) KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
def add_rope_scale(self, llm: str, value: float): def add_rope_scale(self, value: float):
self.add_float32(KEY_ROPE_SCALE.format(llm=llm), value) self.add_float32(KEY_ROPE_SCALE.format(arch=self.arch), value)
def add_tokenizer_model(self, model: str): def add_tokenizer_model(self, model: str):
self.add_string(KEY_TOKENIZER_MODEL, model) self.add_string(KEY_TOKENIZER_MODEL, model)
@ -476,24 +653,28 @@ class GGUFWriter:
def add_pad_token_id(self, id: int): def add_pad_token_id(self, id: int):
self.add_uint32(KEY_TOKENIZER_PAD_ID, id) self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
# Example usage: # Example usage:
if __name__ == "__main__": if __name__ == "__main__":
# Example usage with a file # Example usage with a file
gguf_writer = GGUFWriter.open("example.gguf") gguf_writer = GGUFWriter("example.gguf", "llama")
gguf_writer.add_architecture("llama") gguf_writer.add_architecture()
gguf_writer.add_block_count(12)
gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer
gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float
gguf_writer.add_custom_alignment(64) gguf_writer.add_custom_alignment(64)
tensor1 = np.ones((32,), dtype=np.float32) * 100.0 tensor1 = np.ones((32,), dtype=np.float32) * 100.0
tensor2 = np.ones((32,), dtype=np.float32) * 101.0 tensor2 = np.ones((64,), dtype=np.float32) * 101.0
gguf_writer.add_tensor_info("tensor0", tensor1) tensor3 = np.ones((96,), dtype=np.float32) * 102.0
gguf_writer.add_tensor_info("tensor1", tensor2)
gguf_writer.add_tensor("tensor1", tensor1)
gguf_writer.add_tensor("tensor2", tensor2)
gguf_writer.add_tensor("tensor3", tensor3)
gguf_writer.write_header_to_file() gguf_writer.write_header_to_file()
gguf_writer.write_kv_data_to_file() gguf_writer.write_kv_data_to_file()
gguf_writer.write_ti_data_to_file() gguf_writer.write_tensors_to_file()
gguf_writer.write_tensor_to_file(tensor1)
gguf_writer.write_tensor_to_file(tensor2)
gguf_writer.close() gguf_writer.close()

826
llama.cpp

File diff suppressed because it is too large Load diff

212
llama.h
View file

@ -61,6 +61,40 @@ extern "C" {
typedef int llama_token; typedef int llama_token;
// Severity of a log message; this is the type of the `level` argument of llama_log_callback.
// The numeric values are assigned explicitly and start at 2.
enum llama_log_level {
LLAMA_LOG_LEVEL_ERROR = 2,
LLAMA_LOG_LEVEL_WARN = 3,
LLAMA_LOG_LEVEL_INFO = 4
};
// Kind of vocabulary: SentencePiece or byte-pair encoding.
enum llama_vocab_type {
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
};
// model file types
// The trailing comment on each entry names the tensors that do not use the
// type given in the entry's name.
// Values 5 and 6 belonged to formats whose support has been removed; they are
// kept as commented-out entries and are not assigned to anything else.
// NOTE(review): presumably these numbers are stored in model files, so existing
// values should not be renumbered - confirm against the loader.
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
};
typedef struct llama_token_data { typedef struct llama_token_data {
llama_token id; // token id llama_token id; // token id
float logit; // log-odds of the token float logit; // log-odds of the token
@ -75,19 +109,6 @@ extern "C" {
typedef void (*llama_progress_callback)(float progress, void *ctx); typedef void (*llama_progress_callback)(float progress, void *ctx);
enum llama_log_level {
LLAMA_LOG_LEVEL_ERROR = 2,
LLAMA_LOG_LEVEL_WARN = 3,
LLAMA_LOG_LEVEL_INFO = 4
};
// Signature for logging events
// Note that text includes the new line character at the end for most events.
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
// if it exists.
// It might not exist for progress report where '.' is output repeatedly.
typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
struct llama_context_params { struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random uint32_t seed; // RNG seed, -1 for random
int32_t n_ctx; // text context int32_t n_ctx; // text context
@ -117,28 +138,12 @@ extern "C" {
bool embedding; // embedding mode only bool embedding; // embedding mode only
}; };
// model file types // Signature for logging events
enum llama_ftype { // Note that text includes the new line character at the end for most events.
LLAMA_FTYPE_ALL_F32 = 0, // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors // if it exists.
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors // It might not exist for progress report where '.' is output repeatedly.
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
};
// model quantization parameters // model quantization parameters
typedef struct llama_model_quantize_params { typedef struct llama_model_quantize_params {
@ -199,20 +204,14 @@ extern "C" {
LLAMA_API struct llama_context_params llama_context_default_params(void); LLAMA_API struct llama_context_params llama_context_default_params(void);
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
LLAMA_API int llama_max_devices(void);
LLAMA_API bool llama_mmap_supported(void);
LLAMA_API bool llama_mlock_supported(void);
// TODO: not great API - very likely to change
// Initialize the llama + ggml backend // Initialize the llama + ggml backend
// If numa is true, use NUMA optimizations // If numa is true, use NUMA optimizations
// Call once at the start of the program // Call once at the start of the program
LLAMA_API void llama_backend_init(bool numa); LLAMA_API void llama_backend_init(bool numa);
// Call once at the end of the program - currently only used for MPI // Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void); LLAMA_API void llama_backend_free(void);
LLAMA_API int64_t llama_time_us(void);
LLAMA_API struct llama_model * llama_load_model_from_file( LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model, const char * path_model,
struct llama_context_params params); struct llama_context_params params);
@ -223,10 +222,26 @@ extern "C" {
struct llama_model * model, struct llama_model * model,
struct llama_context_params params); struct llama_context_params params);
// Frees all allocated memory // Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx); LLAMA_API void llama_free(struct llama_context * ctx);
LLAMA_API int64_t llama_time_us(void);
LLAMA_API int llama_max_devices (void);
LLAMA_API bool llama_mmap_supported (void);
LLAMA_API bool llama_mlock_supported(void);
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
// Get a string describing the model type
LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
// Returns 0 on success // Returns 0 on success
LLAMA_API int llama_model_quantize( LLAMA_API int llama_model_quantize(
const char * fname_inp, const char * fname_inp,
@ -300,11 +315,48 @@ extern "C" {
// IMPORTANT: do not use for anything else other than debugging and testing! // IMPORTANT: do not use for anything else other than debugging and testing!
LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname); LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
//
// Vocab
//
// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity);
LLAMA_API int llama_model_get_vocab(
const struct llama_model * model,
const char * * strings,
float * scores,
int capacity);
// Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
//
// Tokenization
//
// Convert the provided text into tokens. // Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens. // The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens // Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned // Returns a negative number on failure - the number of tokens that would have been returned
// TODO: not sure if correct
LLAMA_API int llama_tokenize( LLAMA_API int llama_tokenize(
struct llama_context * ctx, struct llama_context * ctx,
const char * text, const char * text,
@ -326,39 +378,6 @@ extern "C" {
int n_max_tokens, int n_max_tokens,
bool add_bos); bool add_bos);
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity);
LLAMA_API int llama_get_vocab_from_model(
const struct llama_model * model,
const char * * strings,
float * scores,
int capacity);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context // Token Id -> String. Uses the vocabulary in the provided context
// Does not write null terminator to the buffer // Does not write null terminator to the buffer
LLAMA_API int llama_token_to_str( LLAMA_API int llama_token_to_str(
@ -378,13 +397,11 @@ extern "C" {
llama_token token, llama_token token,
char * buf, char * buf,
int length); int length);
// Special tokens
LLAMA_API llama_token llama_token_bos(void); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(void); // end-of-sentence
LLAMA_API llama_token llama_token_nl(void); // next-line
//
// Grammar // Grammar
// //
LLAMA_API struct llama_grammar * llama_grammar_init( LLAMA_API struct llama_grammar * llama_grammar_init(
const llama_grammar_element ** rules, const llama_grammar_element ** rules,
size_t n_rules, size_t n_rules,
@ -392,7 +409,9 @@ extern "C" {
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar); LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
//
// Sampling functions // Sampling functions
//
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
@ -469,43 +488,16 @@ extern "C" {
} }
#endif #endif
// C++ API, will be moving to common.h soon (TM) // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_CPP #ifdef LLAMA_API_INTERNAL
#include <vector> #include <vector>
#include <string> #include <string>
//
// Vocab utils
//
std::vector<llama_token> llama_tokenize(
struct llama_context * ctx,
const std::string & text,
bool add_bos);
std::vector<llama_token> llama_tokenize_bpe(
struct llama_context * ctx,
const std::string & text,
bool add_bos);
std::string llama_token_to_str(
const struct llama_context * ctx,
llama_token token);
std::string llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token);
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
struct ggml_tensor; struct ggml_tensor;
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx); const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
#endif // LLAMA_API_CPP
#endif // LLAMA_API_INTERNAL #endif // LLAMA_API_INTERNAL
#endif // LLAMA_H #endif // LLAMA_H

1
models/.editorconfig Normal file
View file

@ -0,0 +1 @@
root = true

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,3 @@
#!/bin/bash
# Download the WikiText-2 raw archive (wikitext-2-raw-v1.zip) with wget.
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip

View file

@ -2,7 +2,7 @@ function(llama_build_executable source)
get_filename_component(TEST_TARGET ${source} NAME_WE) get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source}) add_executable(${TEST_TARGET} ${source})
install(TARGETS ${TEST_TARGET} RUNTIME) install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE llama) target_link_libraries(${TEST_TARGET} PRIVATE llama common)
endfunction() endfunction()
function(llama_test_executable name source) function(llama_test_executable name source)
@ -17,7 +17,7 @@ function(llama_build_and_test_executable source)
get_filename_component(TEST_TARGET ${source} NAME_WE) get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source}) add_executable(${TEST_TARGET} ${source})
install(TARGETS ${TEST_TARGET} RUNTIME) install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE llama) target_link_libraries(${TEST_TARGET} PRIVATE llama common)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN}) add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction() endfunction()
@ -26,10 +26,11 @@ llama_build_and_test_executable(test-quantize-fns.cpp)
llama_build_and_test_executable(test-quantize-perf.cpp) llama_build_and_test_executable(test-quantize-perf.cpp)
llama_build_and_test_executable(test-sampling.cpp) llama_build_and_test_executable(test-sampling.cpp)
llama_build_executable(test-tokenizer-0.cpp) llama_build_executable(test-tokenizer-0.cpp)
llama_test_executable(test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.bin) llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_build_executable(test-tokenizer-1.cpp) llama_build_executable(test-tokenizer-1.cpp)
llama_test_executable(test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.bin) llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.bin) #llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_build_and_test_executable(test-grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp) llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp)
llama_build_and_test_executable(test-grad0.cpp) # SLOW llama_build_and_test_executable(test-grad0.cpp) # SLOW
# llama_build_and_test_executable(test-opt.cpp) # SLOW # llama_build_and_test_executable(test-opt.cpp) # SLOW

View file

@ -3,7 +3,8 @@
#endif #endif
#include "llama.h" #include "llama.h"
#include "examples/grammar-parser.cpp" #include "grammar-parser.h"
#include <cassert> #include <cassert>
int main() int main()

View file

@ -0,0 +1,403 @@
#ifdef NDEBUG
#undef NDEBUG
#endif
#include "llama.cpp" // TODO: not great
#include "grammar-parser.h"
#include <cassert>
int main()
{
grammar_parser::parse_state parsed_grammar;
std::vector<std::pair<std::string, uint32_t>> expected = {
{"expr", 2},
{"expr_6", 6},
{"expr_7", 7},
{"ident", 8},
{"ident_10", 10},
{"num", 9},
{"num_11", 11},
{"root", 0},
{"root_1", 1},
{"root_5", 5},
{"term", 4},
{"ws", 3},
{"ws_12", 12},
};
std::vector<std::vector<llama_grammar_element>> expected_rules = {
{{LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_CHAR, 10},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 4}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_END, 0}},
{{LLAMA_GRETYPE_RULE_REF, 12}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_RULE_REF, 8},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_RULE_REF, 9},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 40},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 41},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 45},
{LLAMA_GRETYPE_CHAR_ALT, 43},
{LLAMA_GRETYPE_CHAR_ALT, 42},
{LLAMA_GRETYPE_CHAR_ALT, 47},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 6}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 11}, {LLAMA_GRETYPE_RULE_REF, 3}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_CHAR_ALT, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_CHAR_ALT, 95},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
},
{
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_RULE_REF, 11},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_END, 0},
},
{
{LLAMA_GRETYPE_CHAR, 32},
{LLAMA_GRETYPE_CHAR_ALT, 9},
{LLAMA_GRETYPE_CHAR_ALT, 10},
{LLAMA_GRETYPE_RULE_REF, 12},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
},
};
for (auto pair : expected)
{
parsed_grammar.symbol_ids[pair.first] = pair.second;
}
for (auto rule : expected_rules)
{
parsed_grammar.rules.push_back({});
for (auto element : rule)
{
parsed_grammar.rules.back().push_back(element);
}
}
llama_grammar *grammar = NULL;
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
std::vector<std::vector<llama_grammar_element>> expected_stacks = {
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
}};
auto index = 0;
for (auto stack : grammar->stacks)
{
// compare stack to expected_stack
for (uint32_t i = 0; i < stack.size(); i++)
{
auto element = stack[i];
auto expected_element = expected_stacks[index][i];
// pretty print error message before asserting
if (expected_element.type != element->type || expected_element.value != element->value)
{
fprintf(stderr, "index: %d\n", index);
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %d\n", element->type, element->value);
fprintf(stderr, "expected_element != actual_element\n");
}
assert(expected_element.type == element->type && expected_element.value == element->value);
}
index++;
}
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
std::vector<llama_grammar_candidate> next_candidates;
next_candidates.resize(24);
for (size_t i = 0; i < 24; ++i)
{
uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
cp[0] = 37 + i;
cp[1] = 0;
next_candidates[i] = {i, cp, {}};
}
std::vector<std::vector<std::pair<uint32_t, uint16_t>>> expected_reject = {
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
};
std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[0], next_candidates);
std::vector<std::vector<llama_grammar_candidate>> all_rejects;
for (std::size_t count = 0; count < grammar->stacks.size(); ++count)
{
rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[count], next_candidates);
all_rejects.push_back(rejects);
}
index = 0;
for (auto rej : all_rejects)
{
for (uint32_t i = 0; i < rej.size(); i++)
{
auto element = rej[i];
auto expected_element = expected_reject[index][i];
assert(element.index == expected_element.first && *element.code_points == expected_element.second);
}
index++;
}
for (auto &candidate : next_candidates)
{
delete[] candidate.code_points;
candidate.code_points = nullptr;
}
delete grammar;
return 0;
}

View file

@ -1,5 +1,5 @@
#define LLAMA_API_CPP // TODO: eliminate me
#include "llama.h" #include "llama.h"
#include "common.h"
#include <cstdio> #include <cstdio>
#include <string> #include <string>
@ -41,7 +41,7 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
}; };
return _k_tests; return _k_tests;
}; }
int main(int argc, char **argv) { int main(int argc, char **argv) {
if (argc < 2) { if (argc < 2) {
@ -89,6 +89,8 @@ int main(int argc, char **argv) {
return 2; return 2;
} }
bool success = true;
for (const auto & test_kv : k_tests()) { for (const auto & test_kv : k_tests()) {
std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true); std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
fprintf(stderr, "%s : '%s' tokenized to '%s'\n", fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
@ -104,6 +106,7 @@ int main(int argc, char **argv) {
if (!correct) { if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__); fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) { for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d, ", t); fprintf(stderr, "%6d, ", t);
@ -115,9 +118,7 @@ int main(int argc, char **argv) {
} }
fprintf(stderr, "\n"); fprintf(stderr, "\n");
llama_free_model(model); success = false;
llama_free(ctx);
return 3;
} }
} }
@ -126,5 +127,5 @@ int main(int argc, char **argv) {
llama_backend_free(); llama_backend_free();
return 0; return success ? 0 : 3;
} }

View file

@ -1,5 +1,5 @@
#define LLAMA_API_CPP // TODO: eliminate me
#include "llama.h" #include "llama.h"
#include "common.h"
#include <cassert> #include <cassert>
#include <cstdio> #include <cstdio>
@ -120,8 +120,7 @@ int main(int argc, char **argv) {
std::string str = u32converter.to_bytes(u32str); std::string str = u32converter.to_bytes(u32str);
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false); std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
if (tokens.size() == 1) { if (tokens.size() == 1) {
fprintf(stderr, "%s : info: %s tokenized to %d \n", fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
__func__, str.c_str(), tokens[0]);
} }
} }